Work around sub_group load/store issues #1485


Merged · 4 commits · Dec 9, 2023
45 changes: 45 additions & 0 deletions dpctl/tensor/libtensor/include/kernels/alignment.hpp
@@ -0,0 +1,45 @@
//
// Data Parallel Control (dpctl)
//
// Copyright 2020-2023 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

#pragma once

#include <cstddef>
#include <cstdint>

namespace dpctl
{
namespace tensor
{
namespace kernels
{
namespace alignment_utils
{

static constexpr size_t required_alignment = 64;

template <std::uintptr_t alignment, typename Ptr> bool is_aligned(Ptr p)
{
return !(reinterpret_cast<std::uintptr_t>(p) % alignment);
}

template <typename KernelName> class disabled_sg_loadstore_wrapper_krn;

} // end of namespace alignment_utils
} // end of namespace kernels
} // end of namespace tensor
} // end of namespace dpctl
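For reference, a minimal host-side sketch (not part of this PR; the buffer and values are illustrative) of how the is_aligned predicate above behaves:

#include <cstdint>

// Same check as alignment_utils::is_aligned: true iff the address is a
// multiple of `alignment` bytes.
template <std::uintptr_t alignment, typename Ptr> bool is_aligned(Ptr p)
{
    return !(reinterpret_cast<std::uintptr_t>(p) % alignment);
}

int main()
{
    alignas(64) static float buf[16];       // 64 matches required_alignment above
    bool base_ok = is_aligned<64>(&buf[0]);  // true: buf starts on a 64-byte boundary
    bool off_ok = is_aligned<64>(&buf[1]);   // false: offset by sizeof(float) bytes
    return (base_ok && !off_ok) ? 0 : 1;     // exits with 0
}

In the kernels changed below, this predicate gates the use of sub-group load/store: only when every input and output pointer passes the check does the contiguous kernel keep enable_sg_loadstore = true.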
42 changes: 36 additions & 6 deletions dpctl/tensor/libtensor/include/kernels/clip.hpp
@@ -32,6 +32,7 @@
#include <pybind11/pybind11.h>
#include <type_traits>

#include "kernels/alignment.hpp"
#include "utils/math_utils.hpp"
#include "utils/offset_utils.hpp"
#include "utils/type_dispatch.hpp"
@@ -51,6 +52,11 @@ namespace td_ns = dpctl::tensor::type_dispatch;

using namespace dpctl::tensor::offset_utils;

using dpctl::tensor::kernels::alignment_utils::
disabled_sg_loadstore_wrapper_krn;
using dpctl::tensor::kernels::alignment_utils::is_aligned;
using dpctl::tensor::kernels::alignment_utils::required_alignment;

template <typename T> T clip(const T &x, const T &min, const T &max)
{
using dpctl::tensor::type_utils::is_complex;
@@ -73,7 +79,11 @@ template <typename T> T clip(const T &x, const T &min, const T &max)
}
}

template <typename T, int vec_sz = 4, int n_vecs = 2> class ClipContigFunctor
template <typename T,
int vec_sz = 4,
int n_vecs = 2,
bool enable_sg_loadstore = true>
class ClipContigFunctor
{
private:
size_t nelems = 0;
@@ -96,7 +106,7 @@ template <typename T, int vec_sz = 4, int n_vecs = 2> class ClipContigFunctor
void operator()(sycl::nd_item<1> ndit) const
{
using dpctl::tensor::type_utils::is_complex;
if constexpr (is_complex<T>::value) {
if constexpr (is_complex<T>::value || !enable_sg_loadstore) {
std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0];
size_t base = ndit.get_global_linear_id();

@@ -195,10 +205,30 @@ sycl::event clip_contig_impl(sycl::queue &q,
const auto gws_range = sycl::range<1>(n_groups * lws);
const auto lws_range = sycl::range<1>(lws);

cgh.parallel_for<clip_contig_kernel<T, vec_sz, n_vecs>>(
sycl::nd_range<1>(gws_range, lws_range),
ClipContigFunctor<T, vec_sz, n_vecs>(nelems, x_tp, min_tp, max_tp,
dst_tp));
if (is_aligned<required_alignment>(x_cp) &&
is_aligned<required_alignment>(min_cp) &&
is_aligned<required_alignment>(max_cp) &&
is_aligned<required_alignment>(dst_cp))
{
constexpr bool enable_sg_loadstore = true;
using KernelName = clip_contig_kernel<T, vec_sz, n_vecs>;

cgh.parallel_for<KernelName>(
sycl::nd_range<1>(gws_range, lws_range),
ClipContigFunctor<T, vec_sz, n_vecs, enable_sg_loadstore>(
nelems, x_tp, min_tp, max_tp, dst_tp));
}
else {
constexpr bool disable_sg_loadstore = false;
using InnerKernelName = clip_contig_kernel<T, vec_sz, n_vecs>;
using KernelName =
disabled_sg_loadstore_wrapper_krn<InnerKernelName>;

cgh.parallel_for<KernelName>(
sycl::nd_range<1>(gws_range, lws_range),
ClipContigFunctor<T, vec_sz, n_vecs, disable_sg_loadstore>(
nelems, x_tp, min_tp, max_tp, dst_tp));
}
});

return clip_ev;
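The wrapper kernel name is what keeps the two submissions above distinct: the aligned and unaligned branches use different ClipContigFunctor instantiations, and SYCL requires a kernel name to identify a single kernel body. A standalone sketch of the naming pattern (the class templates here are stand-ins, not the dpctl definitions):

#include <type_traits>

// Stand-ins for clip_contig_kernel and the wrapper from alignment.hpp.
template <typename T, int vec_sz, int n_vecs> class contig_kernel;
template <typename KernelName> class disabled_sg_loadstore_wrapper_krn;

using AlignedName = contig_kernel<float, 4, 2>;
using UnalignedName = disabled_sg_loadstore_wrapper_krn<AlignedName>;

// The unaligned submission gets its own name, so the two parallel_for calls
// never register different kernel bodies under the same name.
static_assert(!std::is_same_v<AlignedName, UnalignedName>, "names must differ");

int main() { return 0; }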
43 changes: 37 additions & 6 deletions dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp
@@ -29,6 +29,7 @@
#include <sycl/sycl.hpp>
#include <type_traits>

#include "kernels/alignment.hpp"
#include "utils/offset_utils.hpp"
#include "utils/type_utils.hpp"

@@ -44,6 +45,11 @@ namespace copy_and_cast
namespace py = pybind11;
using namespace dpctl::tensor::offset_utils;

using dpctl::tensor::kernels::alignment_utils::
disabled_sg_loadstore_wrapper_krn;
using dpctl::tensor::kernels::alignment_utils::is_aligned;
using dpctl::tensor::kernels::alignment_utils::required_alignment;

template <typename srcT, typename dstT, typename IndexerT>
class copy_cast_generic_kernel;

@@ -200,7 +206,8 @@ template <typename srcT,
typename dstT,
typename CastFnT,
int vec_sz = 4,
int n_vecs = 2>
int n_vecs = 2,
bool enable_sg_loadstore = true>
class ContigCopyFunctor
{
private:
@@ -219,7 +226,9 @@ class ContigCopyFunctor
CastFnT fn{};

using dpctl::tensor::type_utils::is_complex;
if constexpr (is_complex<srcT>::value || is_complex<dstT>::value) {
if constexpr (!enable_sg_loadstore || is_complex<srcT>::value ||
is_complex<dstT>::value)
{
std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0];
size_t base = ndit.get_global_linear_id();

@@ -326,10 +335,32 @@ sycl::event copy_and_cast_contig_impl(sycl::queue &q,
const auto gws_range = sycl::range<1>(n_groups * lws);
const auto lws_range = sycl::range<1>(lws);

cgh.parallel_for<copy_cast_contig_kernel<srcTy, dstTy, n_vecs, vec_sz>>(
sycl::nd_range<1>(gws_range, lws_range),
ContigCopyFunctor<srcTy, dstTy, Caster<srcTy, dstTy>, vec_sz,
n_vecs>(nelems, src_tp, dst_tp));
if (is_aligned<required_alignment>(src_cp) &&
is_aligned<required_alignment>(dst_cp))
{
constexpr bool enable_sg_loadstore = true;
using KernelName =
copy_cast_contig_kernel<srcTy, dstTy, vec_sz, n_vecs>;

cgh.parallel_for<KernelName>(
sycl::nd_range<1>(gws_range, lws_range),
ContigCopyFunctor<srcTy, dstTy, Caster<srcTy, dstTy>, vec_sz,
n_vecs, enable_sg_loadstore>(nelems, src_tp,
dst_tp));
}
else {
constexpr bool disable_sg_loadstore = false;
using InnerKernelName =
copy_cast_contig_kernel<srcTy, dstTy, vec_sz, n_vecs>;
using KernelName =
disabled_sg_loadstore_wrapper_krn<InnerKernelName>;

cgh.parallel_for<KernelName>(
sycl::nd_range<1>(gws_range, lws_range),
ContigCopyFunctor<srcTy, dstTy, Caster<srcTy, dstTy>, vec_sz,
n_vecs, disable_sg_loadstore>(nelems, src_tp,
dst_tp));
}
});

return copy_and_cast_ev;
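When enable_sg_loadstore is false (or a complex type is involved), ContigCopyFunctor takes the plain per-work-item path whose first two lines are visible in the hunk above. A hedged sketch of that kind of fallback loop — the names and the exact indexing are illustrative, not the dpctl implementation:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <sycl/sycl.hpp>

// Scalar fallback: no sycl::vec loads/stores, just a strided loop, so no
// particular pointer alignment is assumed.
template <typename T, typename UnaryOp>
void scalar_fallback(sycl::nd_item<1> ndit,
                     const T *src,
                     T *dst,
                     std::size_t nelems,
                     std::uint32_t elems_per_wi, // e.g. vec_sz * n_vecs
                     UnaryOp fn)
{
    const std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0];
    std::size_t base = ndit.get_global_linear_id();
    // Group work-items by sub-group so neighboring lanes touch neighboring
    // elements, then stride by the sub-group size within each slice.
    base = (base / sgSize) * sgSize * elems_per_wi + (base % sgSize);
    const std::size_t upper = std::min(nelems, base + sgSize * elems_per_wi);
    for (std::size_t offset = base; offset < upper; offset += sgSize) {
        dst[offset] = fn(src[offset]);
    }
}

The aligned path, by contrast, keeps the sub-group load/store vectorization; the alignment check in each contig impl decides which variant gets submitted.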
@@ -132,9 +132,15 @@ template <typename argT, typename resT> struct AbsFunctor
template <typename argT,
typename resT = argT,
unsigned int vec_sz = 4,
unsigned int n_vecs = 2>
using AbsContigFunctor = elementwise_common::
UnaryContigFunctor<argT, resT, AbsFunctor<argT, resT>, vec_sz, n_vecs>;
unsigned int n_vecs = 2,
bool enable_sg_loadstore = true>
using AbsContigFunctor =
elementwise_common::UnaryContigFunctor<argT,
resT,
AbsFunctor<argT, resT>,
vec_sz,
n_vecs,
enable_sg_loadstore>;

template <typename T> struct AbsOutputType
{
@@ -145,9 +145,15 @@ template <typename argT, typename resT> struct AcosFunctor
template <typename argTy,
typename resTy = argTy,
unsigned int vec_sz = 4,
unsigned int n_vecs = 2>
using AcosContigFunctor = elementwise_common::
UnaryContigFunctor<argTy, resTy, AcosFunctor<argTy, resTy>, vec_sz, n_vecs>;
unsigned int n_vecs = 2,
bool enable_sg_loadstore = true>
using AcosContigFunctor =
elementwise_common::UnaryContigFunctor<argTy,
resTy,
AcosFunctor<argTy, resTy>,
vec_sz,
n_vecs,
enable_sg_loadstore>;

template <typename argTy, typename resTy, typename IndexerT>
using AcosStridedFunctor = elementwise_common::
@@ -167,13 +167,15 @@ template <typename argT, typename resT> struct AcoshFunctor
template <typename argTy,
typename resTy = argTy,
unsigned int vec_sz = 4,
unsigned int n_vecs = 2>
unsigned int n_vecs = 2,
bool enable_sg_loadstore = true>
using AcoshContigFunctor =
elementwise_common::UnaryContigFunctor<argTy,
resTy,
AcoshFunctor<argTy, resTy>,
vec_sz,
n_vecs>;
n_vecs,
enable_sg_loadstore>;

template <typename argTy, typename resTy, typename IndexerT>
using AcoshStridedFunctor = elementwise_common::
@@ -123,14 +123,16 @@ template <typename argT1,
typename argT2,
typename resT,
unsigned int vec_sz = 4,
unsigned int n_vecs = 2>
unsigned int n_vecs = 2,
bool enable_sg_loadstore = true>
using AddContigFunctor =
elementwise_common::BinaryContigFunctor<argT1,
argT2,
resT,
AddFunctor<argT1, argT2, resT>,
vec_sz,
n_vecs>;
n_vecs,
enable_sg_loadstore>;

template <typename argT1, typename argT2, typename resT, typename IndexerT>
using AddStridedFunctor =
@@ -425,13 +427,15 @@ template <typename argT, typename resT> struct AddInplaceFunctor
template <typename argT,
typename resT,
unsigned int vec_sz = 4,
unsigned int n_vecs = 2>
unsigned int n_vecs = 2,
bool enable_sg_loadstore = true>
using AddInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor<
argT,
resT,
AddInplaceFunctor<argT, resT>,
vec_sz,
n_vecs>;
n_vecs,
enable_sg_loadstore>;

template <typename argT, typename resT, typename IndexerT>
using AddInplaceStridedFunctor =
@@ -80,13 +80,15 @@ template <typename argT, typename resT> struct AngleFunctor
template <typename argTy,
typename resTy = argTy,
unsigned int vec_sz = 4,
unsigned int n_vecs = 2>
unsigned int n_vecs = 2,
bool enable_sg_loadstore = true>
using AngleContigFunctor =
elementwise_common::UnaryContigFunctor<argTy,
resTy,
AngleFunctor<argTy, resTy>,
vec_sz,
n_vecs>;
n_vecs,
enable_sg_loadstore>;

template <typename argTy, typename resTy, typename IndexerT>
using AngleStridedFunctor = elementwise_common::
@@ -169,9 +169,15 @@ template <typename argT, typename resT> struct AsinFunctor
template <typename argTy,
typename resTy = argTy,
unsigned int vec_sz = 4,
unsigned int n_vecs = 2>
using AsinContigFunctor = elementwise_common::
UnaryContigFunctor<argTy, resTy, AsinFunctor<argTy, resTy>, vec_sz, n_vecs>;
unsigned int n_vecs = 2,
bool enable_sg_loadstore = true>
using AsinContigFunctor =
elementwise_common::UnaryContigFunctor<argTy,
resTy,
AsinFunctor<argTy, resTy>,
vec_sz,
n_vecs,
enable_sg_loadstore>;

template <typename argTy, typename resTy, typename IndexerT>
using AsinStridedFunctor = elementwise_common::
@@ -143,13 +143,15 @@ template <typename argT, typename resT> struct AsinhFunctor
template <typename argTy,
typename resTy = argTy,
unsigned int vec_sz = 4,
unsigned int n_vecs = 2>
unsigned int n_vecs = 2,
bool enable_sg_loadstore = true>
using AsinhContigFunctor =
elementwise_common::UnaryContigFunctor<argTy,
resTy,
AsinhFunctor<argTy, resTy>,
vec_sz,
n_vecs>;
n_vecs,
enable_sg_loadstore>;

template <typename argTy, typename resTy, typename IndexerT>
using AsinhStridedFunctor = elementwise_common::
@@ -145,9 +145,15 @@ template <typename argT, typename resT> struct AtanFunctor
template <typename argTy,
typename resTy = argTy,
unsigned int vec_sz = 4,
unsigned int n_vecs = 2>
using AtanContigFunctor = elementwise_common::
UnaryContigFunctor<argTy, resTy, AtanFunctor<argTy, resTy>, vec_sz, n_vecs>;
unsigned int n_vecs = 2,
bool enable_sg_loadstore = true>
using AtanContigFunctor =
elementwise_common::UnaryContigFunctor<argTy,
resTy,
AtanFunctor<argTy, resTy>,
vec_sz,
n_vecs,
enable_sg_loadstore>;

template <typename argTy, typename resTy, typename IndexerT>
using AtanStridedFunctor = elementwise_common::
@@ -70,14 +70,16 @@ template <typename argT1,
typename argT2,
typename resT,
unsigned int vec_sz = 4,
unsigned int n_vecs = 2>
unsigned int n_vecs = 2,
bool enable_sg_loadstore = true>
using Atan2ContigFunctor =
elementwise_common::BinaryContigFunctor<argT1,
argT2,
resT,
Atan2Functor<argT1, argT2, resT>,
vec_sz,
n_vecs>;
n_vecs,
enable_sg_loadstore>;

template <typename argT1, typename argT2, typename resT, typename IndexerT>
using Atan2StridedFunctor =