Commit ea6ae0b

Merge pull request #1889 from IntelPython/elementwise-functions-tuning
Elementwise functions tuning
2 parents 691c225 + 88c3e1a commit ea6ae0b


83 files changed: +3097 −988 lines

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -35,6 +35,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 * Fix additional warnings when generating docs [gh-1861](https://github.com/IntelPython/dpctl/pull/1861)
 * Add missing include of SYCL header to "math_utils.hpp" [gh-1899](https://github.com/IntelPython/dpctl/pull/1899)
 * Add support of CV-qualifiers in `is_complex<T>` helper [gh-1900](https://github.com/IntelPython/dpctl/pull/1900)
+* Tuning work for elementwise functions with modest performance gains (under 10%) [gh-1889](https://github.com/IntelPython/dpctl/pull/1889)

 ## [0.18.1] - Oct. 11, 2024

dpctl/tensor/libtensor/include/kernels/alignment.hpp

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ namespace kernels
 namespace alignment_utils
 {

-static constexpr size_t required_alignment = 64;
+static constexpr size_t required_alignment = 64UL;

 template <std::uintptr_t alignment, typename Ptr> bool is_aligned(Ptr p)
 {
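The `UL` suffix gives the literal the same unsigned type as the `size_t` constant it initializes, avoiding an implicit conversion. For context, a minimal sketch of how these helpers are typically used to choose a kernel path; the `can_use_sg_loadstore` dispatch function is an illustrative assumption, not code from this commit (only `required_alignment` and `is_aligned` appear in the header):

    #include <cstddef>
    #include <cstdint>

    static constexpr std::size_t required_alignment = 64UL;

    // Same shape as the helper shown above: true when p is aligned to `alignment` bytes.
    template <std::uintptr_t alignment, typename Ptr> bool is_aligned(Ptr p)
    {
        return reinterpret_cast<std::uintptr_t>(p) % alignment == 0;
    }

    // Hypothetical dispatch: take the vectorized sub-group path only when
    // both pointers meet the alignment requirement.
    template <typename T> bool can_use_sg_loadstore(const T *src, T *dst)
    {
        return is_aligned<required_alignment>(src) &&
               is_aligned<required_alignment>(dst);
    }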

dpctl/tensor/libtensor/include/kernels/clip.hpp

Lines changed: 37 additions & 32 deletions
@@ -33,6 +33,7 @@
 #include "kernels/alignment.hpp"
 #include "utils/math_utils.hpp"
 #include "utils/offset_utils.hpp"
+#include "utils/sycl_utils.hpp"
 #include "utils/type_utils.hpp"

 namespace dpctl
@@ -51,6 +52,9 @@ using dpctl::tensor::kernels::alignment_utils::
 using dpctl::tensor::kernels::alignment_utils::is_aligned;
 using dpctl::tensor::kernels::alignment_utils::required_alignment;

+using dpctl::tensor::sycl_utils::sub_group_load;
+using dpctl::tensor::sycl_utils::sub_group_store;
+
 template <typename T> T clip(const T &x, const T &min, const T &max)
 {
     using dpctl::tensor::type_utils::is_complex;
@@ -75,8 +79,8 @@ template <typename T> T clip(const T &x, const T &min, const T &max)
 }

 template <typename T,
-          int vec_sz = 4,
-          int n_vecs = 2,
+          std::uint8_t vec_sz = 4,
+          std::uint8_t n_vecs = 2,
           bool enable_sg_loadstore = true>
 class ClipContigFunctor
 {
@@ -100,37 +104,36 @@ class ClipContigFunctor

     void operator()(sycl::nd_item<1> ndit) const
     {
+        constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz;
+
         using dpctl::tensor::type_utils::is_complex;
         if constexpr (is_complex<T>::value || !enable_sg_loadstore) {
-            std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0];
-            size_t base = ndit.get_global_linear_id();
-
-            base = (base / sgSize) * sgSize * n_vecs * vec_sz + (base % sgSize);
-            for (size_t offset = base;
-                 offset < std::min(nelems, base + sgSize * (n_vecs * vec_sz));
-                 offset += sgSize)
-            {
+            const std::uint16_t sgSize =
+                ndit.get_sub_group().get_local_range()[0];
+            const size_t gid = ndit.get_global_linear_id();
+            const uint16_t nelems_per_sg = sgSize * nelems_per_wi;
+
+            const size_t start =
+                (gid / sgSize) * (nelems_per_sg - sgSize) + gid;
+            const size_t end = std::min(nelems, start + nelems_per_sg);
+
+            for (size_t offset = start; offset < end; offset += sgSize) {
                 dst_p[offset] = clip(x_p[offset], min_p[offset], max_p[offset]);
             }
         }
         else {
             auto sg = ndit.get_sub_group();
-            std::uint8_t sgSize = sg.get_local_range()[0];
-            std::uint8_t max_sgSize = sg.get_max_local_range()[0];
-            size_t base = n_vecs * vec_sz *
-                          (ndit.get_group(0) * ndit.get_local_range(0) +
-                           sg.get_group_id()[0] * max_sgSize);
-
-            if (base + n_vecs * vec_sz * sgSize < nelems &&
-                sgSize == max_sgSize)
-            {
-                sycl::vec<T, vec_sz> x_vec;
-                sycl::vec<T, vec_sz> min_vec;
-                sycl::vec<T, vec_sz> max_vec;
+            const std::uint16_t sgSize = sg.get_max_local_range()[0];
+
+            const size_t base =
+                nelems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
+                                 sg.get_group_id()[0] * sgSize);
+
+            if (base + nelems_per_wi * sgSize < nelems) {
                 sycl::vec<T, vec_sz> dst_vec;
 #pragma unroll
                 for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
-                    auto idx = base + it * sgSize;
+                    const size_t idx = base + it * sgSize;
                     auto x_multi_ptr = sycl::address_space_cast<
                         sycl::access::address_space::global_space,
                         sycl::access::decorated::yes>(&x_p[idx]);
@@ -144,21 +147,23 @@ class ClipContigFunctor
                         sycl::access::address_space::global_space,
                         sycl::access::decorated::yes>(&dst_p[idx]);

-                    x_vec = sg.load<vec_sz>(x_multi_ptr);
-                    min_vec = sg.load<vec_sz>(min_multi_ptr);
-                    max_vec = sg.load<vec_sz>(max_multi_ptr);
+                    const sycl::vec<T, vec_sz> x_vec =
+                        sub_group_load<vec_sz>(sg, x_multi_ptr);
+                    const sycl::vec<T, vec_sz> min_vec =
+                        sub_group_load<vec_sz>(sg, min_multi_ptr);
+                    const sycl::vec<T, vec_sz> max_vec =
+                        sub_group_load<vec_sz>(sg, max_multi_ptr);
 #pragma unroll
                     for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) {
                         dst_vec[vec_id] = clip(x_vec[vec_id], min_vec[vec_id],
                                                max_vec[vec_id]);
                     }
-                    sg.store<vec_sz>(dst_multi_ptr, dst_vec);
+                    sub_group_store<vec_sz>(sg, dst_vec, dst_multi_ptr);
                 }
             }
             else {
-                for (size_t k = base + sg.get_local_id()[0]; k < nelems;
-                     k += sgSize)
-                {
+                const size_t lane_id = sg.get_local_id()[0];
+                for (size_t k = base + lane_id; k < nelems; k += sgSize) {
                     dst_p[k] = clip(x_p[k], min_p[k], max_p[k]);
                 }
             }
@@ -195,8 +200,8 @@ sycl::event clip_contig_impl(sycl::queue &q,
     cgh.depends_on(depends);

     size_t lws = 64;
-    constexpr unsigned int vec_sz = 4;
-    constexpr unsigned int n_vecs = 2;
+    constexpr std::uint8_t vec_sz = 4;
+    constexpr std::uint8_t n_vecs = 2;
     const size_t n_groups =
         ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
     const auto gws_range = sycl::range<1>(n_groups * lws);
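The rewritten scalar branch computes its start index as `(gid / sgSize) * (nelems_per_sg - sgSize) + gid`, which replaces the old modulo with a subtraction: since `gid = (gid / sgSize) * sgSize + (gid % sgSize)`, the new expression equals the previous `(gid / sgSize) * nelems_per_sg + (gid % sgSize)`. A standalone check of the identity (illustrative only; the constants are arbitrary assumptions, not values from the commit):

    #include <cassert>
    #include <cstddef>

    int main()
    {
        const std::size_t sgSize = 32;       // assumed sub-group size
        const std::size_t nelems_per_wi = 8; // n_vecs * vec_sz = 2 * 4
        const std::size_t nelems_per_sg = sgSize * nelems_per_wi;

        for (std::size_t gid = 0; gid < (1u << 20); ++gid) {
            // old form: explicit quotient-remainder decomposition
            const std::size_t old_start =
                (gid / sgSize) * nelems_per_sg + (gid % sgSize);
            // new form: reuses gid instead of recomputing gid % sgSize
            const std::size_t new_start =
                (gid / sgSize) * (nelems_per_sg - sgSize) + gid;
            assert(old_start == new_start);
        }
        return 0;
    }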

dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp

Lines changed: 34 additions & 33 deletions
@@ -31,6 +31,7 @@
 #include "dpctl_tensor_types.hpp"
 #include "kernels/alignment.hpp"
 #include "utils/offset_utils.hpp"
+#include "utils/sycl_utils.hpp"
 #include "utils/type_utils.hpp"

 namespace dpctl
@@ -49,13 +50,16 @@ using dpctl::tensor::kernels::alignment_utils::
 using dpctl::tensor::kernels::alignment_utils::is_aligned;
 using dpctl::tensor::kernels::alignment_utils::required_alignment;

+using dpctl::tensor::sycl_utils::sub_group_load;
+using dpctl::tensor::sycl_utils::sub_group_store;
+
 template <typename srcT, typename dstT, typename IndexerT>
 class copy_cast_generic_kernel;

 template <typename srcT,
           typename dstT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class copy_cast_contig_kernel;

 template <typename srcT, typename dstT, typename IndexerT>
@@ -207,8 +211,8 @@ template <typename fnT, typename D, typename S> struct CopyAndCastGenericFactory
 template <typename srcT,
           typename dstT,
           typename CastFnT,
-          int vec_sz = 4,
-          int n_vecs = 2,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 class ContigCopyFunctor
 {
@@ -227,58 +231,55 @@ class ContigCopyFunctor
     {
         CastFnT fn{};

+        constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz;
+
         using dpctl::tensor::type_utils::is_complex;
         if constexpr (!enable_sg_loadstore || is_complex<srcT>::value ||
                       is_complex<dstT>::value)
         {
-            std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0];
-            size_t base = ndit.get_global_linear_id();
-
-            base = (base / sgSize) * sgSize * n_vecs * vec_sz + (base % sgSize);
-            for (size_t offset = base;
-                 offset < std::min(nelems, base + sgSize * (n_vecs * vec_sz));
-                 offset += sgSize)
-            {
+            std::uint16_t sgSize = ndit.get_sub_group().get_local_range()[0];
+            const size_t gid = ndit.get_global_linear_id();
+
+            // start = (gid / sgSize) * elems_per_sg + (gid % sgSize)
+            const std::uint16_t elems_per_sg = sgSize * elems_per_wi;
+            const size_t start = (gid / sgSize) * (elems_per_sg - sgSize) + gid;
+            const size_t end = std::min(nelems, start + elems_per_sg);
+            for (size_t offset = start; offset < end; offset += sgSize) {
                 dst_p[offset] = fn(src_p[offset]);
             }
         }
         else {
             auto sg = ndit.get_sub_group();
-            std::uint8_t sgSize = sg.get_local_range()[0];
-            std::uint8_t max_sgSize = sg.get_max_local_range()[0];
-            size_t base = n_vecs * vec_sz *
-                          (ndit.get_group(0) * ndit.get_local_range(0) +
-                           sg.get_group_id()[0] * max_sgSize);
-
-            if (base + n_vecs * vec_sz * sgSize < nelems &&
-                sgSize == max_sgSize)
-            {
-                sycl::vec<srcT, vec_sz> src_vec;
+            const std::uint16_t sgSize = sg.get_max_local_range()[0];
+            const size_t base =
+                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
+                                sg.get_group_id()[0] * sgSize);
+
+            if (base + elems_per_wi * sgSize < nelems) {
                 sycl::vec<dstT, vec_sz> dst_vec;

 #pragma unroll
                 for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
+                    const size_t offset = base + it * sgSize;
                     auto src_multi_ptr = sycl::address_space_cast<
                         sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(
-                        &src_p[base + it * sgSize]);
+                        sycl::access::decorated::yes>(&src_p[offset]);
                     auto dst_multi_ptr = sycl::address_space_cast<
                         sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(
-                        &dst_p[base + it * sgSize]);
+                        sycl::access::decorated::yes>(&dst_p[offset]);

-                    src_vec = sg.load<vec_sz>(src_multi_ptr);
+                    const sycl::vec<srcT, vec_sz> src_vec =
+                        sub_group_load<vec_sz>(sg, src_multi_ptr);
 #pragma unroll
                     for (std::uint8_t k = 0; k < vec_sz; k++) {
                         dst_vec[k] = fn(src_vec[k]);
                     }
-                    sg.store<vec_sz>(dst_multi_ptr, dst_vec);
+                    sub_group_store<vec_sz>(sg, dst_vec, dst_multi_ptr);
                 }
             }
             else {
-                for (size_t k = base + sg.get_local_id()[0]; k < nelems;
-                     k += sgSize)
-                {
+                const size_t start = base + sg.get_local_id()[0];
+                for (size_t k = start; k < nelems; k += sgSize) {
                     dst_p[k] = fn(src_p[k]);
                 }
             }
@@ -332,8 +333,8 @@ sycl::event copy_and_cast_contig_impl(sycl::queue &q,
     dstTy *dst_tp = reinterpret_cast<dstTy *>(dst_cp);

     size_t lws = 64;
-    constexpr unsigned int vec_sz = 4;
-    constexpr unsigned int n_vecs = 2;
+    constexpr std::uint32_t vec_sz = 4;
+    constexpr std::uint32_t n_vecs = 2;
     const size_t n_groups =
         ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
     const auto gws_range = sycl::range<1>(n_groups * lws);
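The `sub_group_load` and `sub_group_store` calls come from `utils/sycl_utils.hpp`, which is not shown in this excerpt. A plausible minimal shape for such wrappers, assuming they simply forward to the `sg.load`/`sg.store` members the old code called directly; the real implementation may do more (e.g. fall back to scalar accesses for element types the extension does not cover):

    #include <sycl/sycl.hpp>

    // Sketch only: centralizing sub-group loads/stores behind free functions
    // lets every kernel swap the underlying mechanism in one place.
    template <int vec_sz, typename SubGroupT, typename MultiPtrT>
    auto sub_group_load(const SubGroupT &sg, MultiPtrT m_ptr)
    {
        return sg.template load<vec_sz>(m_ptr);
    }

    template <int vec_sz, typename T, typename SubGroupT, typename MultiPtrT>
    void sub_group_store(const SubGroupT &sg,
                         const sycl::vec<T, vec_sz> &val,
                         MultiPtrT m_ptr)
    {
        sg.template store<vec_sz>(m_ptr, val);
    }

Call sites in this diff, such as `sub_group_load<vec_sz>(sg, src_multi_ptr)` and `sub_group_store<vec_sz>(sg, dst_vec, dst_multi_ptr)`, match this shape.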
