Skip to content

Commit a4369ac

Browse files
Merge pull request #1485 from IntelPython/work-around-sg-loadstore-issues
2 parents 0b63d4f + 86153f3 commit a4369ac

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

82 files changed

+862
-260
lines changed
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
//
2+
// Data Parallel Control (dpctl)
3+
//
4+
// Copyright 2020-2023 Intel Corporation
5+
//
6+
// Licensed under the Apache License, Version 2.0 (the "License");
7+
// you may not use this file except in compliance with the License.
8+
// You may obtain a copy of the License at
9+
//
10+
// http://www.apache.org/licenses/LICENSE-2.0
11+
//
12+
// Unless required by applicable law or agreed to in writing, software
13+
// distributed under the License is distributed on an "AS IS" BASIS,
14+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
// See the License for the specific language governing permissions and
16+
// limitations under the License.
17+
//
18+
19+
#pragma once
20+
21+
#include <cstddef>
22+
#include <cstdint>
23+
24+
namespace dpctl
25+
{
26+
namespace tensor
27+
{
28+
namespace kernels
29+
{
30+
namespace alignment_utils
31+
{
32+
33+
static constexpr size_t required_alignment = 64;
34+
35+
template <std::uintptr_t alignment, typename Ptr> bool is_aligned(Ptr p)
36+
{
37+
return !(reinterpret_cast<std::uintptr_t>(p) % alignment);
38+
}
39+
40+
template <typename KernelName> class disabled_sg_loadstore_wrapper_krn;
41+
42+
} // end of namespace alignment_utils
43+
} // end of namespace kernels
44+
} // end of namespace tensor
45+
} // end of namespace dpctl

dpctl/tensor/libtensor/include/kernels/clip.hpp

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include <pybind11/pybind11.h>
3333
#include <type_traits>
3434

35+
#include "kernels/alignment.hpp"
3536
#include "utils/math_utils.hpp"
3637
#include "utils/offset_utils.hpp"
3738
#include "utils/type_dispatch.hpp"
@@ -51,6 +52,11 @@ namespace td_ns = dpctl::tensor::type_dispatch;
5152

5253
using namespace dpctl::tensor::offset_utils;
5354

55+
using dpctl::tensor::kernels::alignment_utils::
56+
disabled_sg_loadstore_wrapper_krn;
57+
using dpctl::tensor::kernels::alignment_utils::is_aligned;
58+
using dpctl::tensor::kernels::alignment_utils::required_alignment;
59+
5460
template <typename T> T clip(const T &x, const T &min, const T &max)
5561
{
5662
using dpctl::tensor::type_utils::is_complex;
@@ -73,7 +79,11 @@ template <typename T> T clip(const T &x, const T &min, const T &max)
7379
}
7480
}
7581

76-
template <typename T, int vec_sz = 4, int n_vecs = 2> class ClipContigFunctor
82+
template <typename T,
83+
int vec_sz = 4,
84+
int n_vecs = 2,
85+
bool enable_sg_loadstore = true>
86+
class ClipContigFunctor
7787
{
7888
private:
7989
size_t nelems = 0;
@@ -96,7 +106,7 @@ template <typename T, int vec_sz = 4, int n_vecs = 2> class ClipContigFunctor
96106
void operator()(sycl::nd_item<1> ndit) const
97107
{
98108
using dpctl::tensor::type_utils::is_complex;
99-
if constexpr (is_complex<T>::value) {
109+
if constexpr (is_complex<T>::value || !enable_sg_loadstore) {
100110
std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0];
101111
size_t base = ndit.get_global_linear_id();
102112

@@ -195,10 +205,30 @@ sycl::event clip_contig_impl(sycl::queue &q,
195205
const auto gws_range = sycl::range<1>(n_groups * lws);
196206
const auto lws_range = sycl::range<1>(lws);
197207

198-
cgh.parallel_for<clip_contig_kernel<T, vec_sz, n_vecs>>(
199-
sycl::nd_range<1>(gws_range, lws_range),
200-
ClipContigFunctor<T, vec_sz, n_vecs>(nelems, x_tp, min_tp, max_tp,
201-
dst_tp));
208+
if (is_aligned<required_alignment>(x_cp) &&
209+
is_aligned<required_alignment>(min_cp) &&
210+
is_aligned<required_alignment>(max_cp) &&
211+
is_aligned<required_alignment>(dst_cp))
212+
{
213+
constexpr bool enable_sg_loadstore = true;
214+
using KernelName = clip_contig_kernel<T, vec_sz, n_vecs>;
215+
216+
cgh.parallel_for<KernelName>(
217+
sycl::nd_range<1>(gws_range, lws_range),
218+
ClipContigFunctor<T, vec_sz, n_vecs, enable_sg_loadstore>(
219+
nelems, x_tp, min_tp, max_tp, dst_tp));
220+
}
221+
else {
222+
constexpr bool disable_sg_loadstore = false;
223+
using InnerKernelName = clip_contig_kernel<T, vec_sz, n_vecs>;
224+
using KernelName =
225+
disabled_sg_loadstore_wrapper_krn<InnerKernelName>;
226+
227+
cgh.parallel_for<KernelName>(
228+
sycl::nd_range<1>(gws_range, lws_range),
229+
ClipContigFunctor<T, vec_sz, n_vecs, disable_sg_loadstore>(
230+
nelems, x_tp, min_tp, max_tp, dst_tp));
231+
}
202232
});
203233

204234
return clip_ev;

dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include <sycl/sycl.hpp>
3030
#include <type_traits>
3131

32+
#include "kernels/alignment.hpp"
3233
#include "utils/offset_utils.hpp"
3334
#include "utils/type_utils.hpp"
3435

@@ -44,6 +45,11 @@ namespace copy_and_cast
4445
namespace py = pybind11;
4546
using namespace dpctl::tensor::offset_utils;
4647

48+
using dpctl::tensor::kernels::alignment_utils::
49+
disabled_sg_loadstore_wrapper_krn;
50+
using dpctl::tensor::kernels::alignment_utils::is_aligned;
51+
using dpctl::tensor::kernels::alignment_utils::required_alignment;
52+
4753
template <typename srcT, typename dstT, typename IndexerT>
4854
class copy_cast_generic_kernel;
4955

@@ -200,7 +206,8 @@ template <typename srcT,
200206
typename dstT,
201207
typename CastFnT,
202208
int vec_sz = 4,
203-
int n_vecs = 2>
209+
int n_vecs = 2,
210+
bool enable_sg_loadstore = true>
204211
class ContigCopyFunctor
205212
{
206213
private:
@@ -219,7 +226,9 @@ class ContigCopyFunctor
219226
CastFnT fn{};
220227

221228
using dpctl::tensor::type_utils::is_complex;
222-
if constexpr (is_complex<srcT>::value || is_complex<dstT>::value) {
229+
if constexpr (!enable_sg_loadstore || is_complex<srcT>::value ||
230+
is_complex<dstT>::value)
231+
{
223232
std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0];
224233
size_t base = ndit.get_global_linear_id();
225234

@@ -326,10 +335,32 @@ sycl::event copy_and_cast_contig_impl(sycl::queue &q,
326335
const auto gws_range = sycl::range<1>(n_groups * lws);
327336
const auto lws_range = sycl::range<1>(lws);
328337

329-
cgh.parallel_for<copy_cast_contig_kernel<srcTy, dstTy, n_vecs, vec_sz>>(
330-
sycl::nd_range<1>(gws_range, lws_range),
331-
ContigCopyFunctor<srcTy, dstTy, Caster<srcTy, dstTy>, vec_sz,
332-
n_vecs>(nelems, src_tp, dst_tp));
338+
if (is_aligned<required_alignment>(src_cp) &&
339+
is_aligned<required_alignment>(dst_cp))
340+
{
341+
constexpr bool enable_sg_loadstore = true;
342+
using KernelName =
343+
copy_cast_contig_kernel<srcTy, dstTy, vec_sz, n_vecs>;
344+
345+
cgh.parallel_for<KernelName>(
346+
sycl::nd_range<1>(gws_range, lws_range),
347+
ContigCopyFunctor<srcTy, dstTy, Caster<srcTy, dstTy>, vec_sz,
348+
n_vecs, enable_sg_loadstore>(nelems, src_tp,
349+
dst_tp));
350+
}
351+
else {
352+
constexpr bool disable_sg_loadstore = false;
353+
using InnerKernelName =
354+
copy_cast_contig_kernel<srcTy, dstTy, vec_sz, n_vecs>;
355+
using KernelName =
356+
disabled_sg_loadstore_wrapper_krn<InnerKernelName>;
357+
358+
cgh.parallel_for<KernelName>(
359+
sycl::nd_range<1>(gws_range, lws_range),
360+
ContigCopyFunctor<srcTy, dstTy, Caster<srcTy, dstTy>, vec_sz,
361+
n_vecs, disable_sg_loadstore>(nelems, src_tp,
362+
dst_tp));
363+
}
333364
});
334365

335366
return copy_and_cast_ev;

dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -132,9 +132,15 @@ template <typename argT, typename resT> struct AbsFunctor
132132
template <typename argT,
133133
typename resT = argT,
134134
unsigned int vec_sz = 4,
135-
unsigned int n_vecs = 2>
136-
using AbsContigFunctor = elementwise_common::
137-
UnaryContigFunctor<argT, resT, AbsFunctor<argT, resT>, vec_sz, n_vecs>;
135+
unsigned int n_vecs = 2,
136+
bool enable_sg_loadstore = true>
137+
using AbsContigFunctor =
138+
elementwise_common::UnaryContigFunctor<argT,
139+
resT,
140+
AbsFunctor<argT, resT>,
141+
vec_sz,
142+
n_vecs,
143+
enable_sg_loadstore>;
138144

139145
template <typename T> struct AbsOutputType
140146
{

dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -145,9 +145,15 @@ template <typename argT, typename resT> struct AcosFunctor
145145
template <typename argTy,
146146
typename resTy = argTy,
147147
unsigned int vec_sz = 4,
148-
unsigned int n_vecs = 2>
149-
using AcosContigFunctor = elementwise_common::
150-
UnaryContigFunctor<argTy, resTy, AcosFunctor<argTy, resTy>, vec_sz, n_vecs>;
148+
unsigned int n_vecs = 2,
149+
bool enable_sg_loadstore = true>
150+
using AcosContigFunctor =
151+
elementwise_common::UnaryContigFunctor<argTy,
152+
resTy,
153+
AcosFunctor<argTy, resTy>,
154+
vec_sz,
155+
n_vecs,
156+
enable_sg_loadstore>;
151157

152158
template <typename argTy, typename resTy, typename IndexerT>
153159
using AcosStridedFunctor = elementwise_common::

dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -167,13 +167,15 @@ template <typename argT, typename resT> struct AcoshFunctor
167167
template <typename argTy,
168168
typename resTy = argTy,
169169
unsigned int vec_sz = 4,
170-
unsigned int n_vecs = 2>
170+
unsigned int n_vecs = 2,
171+
bool enable_sg_loadstore = true>
171172
using AcoshContigFunctor =
172173
elementwise_common::UnaryContigFunctor<argTy,
173174
resTy,
174175
AcoshFunctor<argTy, resTy>,
175176
vec_sz,
176-
n_vecs>;
177+
n_vecs,
178+
enable_sg_loadstore>;
177179

178180
template <typename argTy, typename resTy, typename IndexerT>
179181
using AcoshStridedFunctor = elementwise_common::

dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -123,14 +123,16 @@ template <typename argT1,
123123
typename argT2,
124124
typename resT,
125125
unsigned int vec_sz = 4,
126-
unsigned int n_vecs = 2>
126+
unsigned int n_vecs = 2,
127+
bool enable_sg_loadstore = true>
127128
using AddContigFunctor =
128129
elementwise_common::BinaryContigFunctor<argT1,
129130
argT2,
130131
resT,
131132
AddFunctor<argT1, argT2, resT>,
132133
vec_sz,
133-
n_vecs>;
134+
n_vecs,
135+
enable_sg_loadstore>;
134136

135137
template <typename argT1, typename argT2, typename resT, typename IndexerT>
136138
using AddStridedFunctor =
@@ -425,13 +427,15 @@ template <typename argT, typename resT> struct AddInplaceFunctor
425427
template <typename argT,
426428
typename resT,
427429
unsigned int vec_sz = 4,
428-
unsigned int n_vecs = 2>
430+
unsigned int n_vecs = 2,
431+
bool enable_sg_loadstore = true>
429432
using AddInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor<
430433
argT,
431434
resT,
432435
AddInplaceFunctor<argT, resT>,
433436
vec_sz,
434-
n_vecs>;
437+
n_vecs,
438+
enable_sg_loadstore>;
435439

436440
template <typename argT, typename resT, typename IndexerT>
437441
using AddInplaceStridedFunctor =

dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,13 +80,15 @@ template <typename argT, typename resT> struct AngleFunctor
8080
template <typename argTy,
8181
typename resTy = argTy,
8282
unsigned int vec_sz = 4,
83-
unsigned int n_vecs = 2>
83+
unsigned int n_vecs = 2,
84+
bool enable_sg_loadstore = true>
8485
using AngleContigFunctor =
8586
elementwise_common::UnaryContigFunctor<argTy,
8687
resTy,
8788
AngleFunctor<argTy, resTy>,
8889
vec_sz,
89-
n_vecs>;
90+
n_vecs,
91+
enable_sg_loadstore>;
9092

9193
template <typename argTy, typename resTy, typename IndexerT>
9294
using AngleStridedFunctor = elementwise_common::

dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -169,9 +169,15 @@ template <typename argT, typename resT> struct AsinFunctor
169169
template <typename argTy,
170170
typename resTy = argTy,
171171
unsigned int vec_sz = 4,
172-
unsigned int n_vecs = 2>
173-
using AsinContigFunctor = elementwise_common::
174-
UnaryContigFunctor<argTy, resTy, AsinFunctor<argTy, resTy>, vec_sz, n_vecs>;
172+
unsigned int n_vecs = 2,
173+
bool enable_sg_loadstore = true>
174+
using AsinContigFunctor =
175+
elementwise_common::UnaryContigFunctor<argTy,
176+
resTy,
177+
AsinFunctor<argTy, resTy>,
178+
vec_sz,
179+
n_vecs,
180+
enable_sg_loadstore>;
175181

176182
template <typename argTy, typename resTy, typename IndexerT>
177183
using AsinStridedFunctor = elementwise_common::

dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -143,13 +143,15 @@ template <typename argT, typename resT> struct AsinhFunctor
143143
template <typename argTy,
144144
typename resTy = argTy,
145145
unsigned int vec_sz = 4,
146-
unsigned int n_vecs = 2>
146+
unsigned int n_vecs = 2,
147+
bool enable_sg_loadstore = true>
147148
using AsinhContigFunctor =
148149
elementwise_common::UnaryContigFunctor<argTy,
149150
resTy,
150151
AsinhFunctor<argTy, resTy>,
151152
vec_sz,
152-
n_vecs>;
153+
n_vecs,
154+
enable_sg_loadstore>;
153155

154156
template <typename argTy, typename resTy, typename IndexerT>
155157
using AsinhStridedFunctor = elementwise_common::

dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -145,9 +145,15 @@ template <typename argT, typename resT> struct AtanFunctor
145145
template <typename argTy,
146146
typename resTy = argTy,
147147
unsigned int vec_sz = 4,
148-
unsigned int n_vecs = 2>
149-
using AtanContigFunctor = elementwise_common::
150-
UnaryContigFunctor<argTy, resTy, AtanFunctor<argTy, resTy>, vec_sz, n_vecs>;
148+
unsigned int n_vecs = 2,
149+
bool enable_sg_loadstore = true>
150+
using AtanContigFunctor =
151+
elementwise_common::UnaryContigFunctor<argTy,
152+
resTy,
153+
AtanFunctor<argTy, resTy>,
154+
vec_sz,
155+
n_vecs,
156+
enable_sg_loadstore>;
151157

152158
template <typename argTy, typename resTy, typename IndexerT>
153159
using AtanStridedFunctor = elementwise_common::

dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,14 +70,16 @@ template <typename argT1,
7070
typename argT2,
7171
typename resT,
7272
unsigned int vec_sz = 4,
73-
unsigned int n_vecs = 2>
73+
unsigned int n_vecs = 2,
74+
bool enable_sg_loadstore = true>
7475
using Atan2ContigFunctor =
7576
elementwise_common::BinaryContigFunctor<argT1,
7677
argT2,
7778
resT,
7879
Atan2Functor<argT1, argT2, resT>,
7980
vec_sz,
80-
n_vecs>;
81+
n_vecs,
82+
enable_sg_loadstore>;
8183

8284
template <typename argT1, typename argT2, typename resT, typename IndexerT>
8385
using Atan2StridedFunctor =

0 commit comments

Comments
 (0)