From 20a1966a08fd4d129ac3307a734d14de5e519ac1 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 21 Oct 2024 05:26:18 -0500 Subject: [PATCH 1/4] Do not use sg.get_local_range Use sg.get_max_local_range instead. The `sg.get_local_range` call must perform extra checks to determine whether this is the last, trailing sub-group in the work-group, whose actual size may be smaller. We set the local work-group size to 128, which is a multiple of any sub-group size, and hence get_local_range() always equals get_max_local_range(). --- .../kernels/elementwise_functions/common.hpp | 58 +++++++------------ .../elementwise_functions/common_inplace.hpp | 18 +++--- 2 files changed, 29 insertions(+), 47 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp index ee955dcde5..526bbb7bde 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp @@ -72,7 +72,7 @@ struct UnaryContigFunctor { UnaryOperatorT op{}; /* Each work-item processes vec_sz elements, contiguous in memory */ - /* NOTE: vec_sz must divide sg.max_local_range()[0] */ + /* NOTE: work-group size must be divisible by sub-group size */ if constexpr (enable_sg_loadstore && UnaryOperatorT::is_constant::value) { @@ -80,14 +80,11 @@ struct UnaryContigFunctor constexpr resT const_val = UnaryOperatorT::constant_value; auto sg = ndit.get_sub_group(); - std::uint8_t sgSize = sg.get_local_range()[0]; - std::uint8_t max_sgSize = sg.get_max_local_range()[0]; + std::uint8_t sgSize = sg.get_max_local_range()[0]; size_t base = n_vecs * vec_sz * (ndit.get_group(0) * ndit.get_local_range(0) + sg.get_group_id()[0] * sgSize); - if (base + n_vecs * vec_sz * sgSize < nelems_ && - max_sgSize == sgSize) - { + if (base + n_vecs * vec_sz * sgSize < nelems_) { sycl::vec res_vec(const_val); #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { @@ -113,14 +110,11 @@ struct UnaryContigFunctor UnaryOperatorT::supports_vec::value) { auto sg = ndit.get_sub_group(); - std::uint16_t sgSize = sg.get_local_range()[0]; - std::uint16_t max_sgSize = sg.get_max_local_range()[0]; + std::uint16_t sgSize = sg.get_max_local_range()[0]; size_t base = n_vecs * vec_sz * (ndit.get_group(0) * ndit.get_local_range(0) + - sg.get_group_id()[0] * max_sgSize); + sg.get_group_id()[0] * sgSize); - if (base + n_vecs * vec_sz * sgSize < nelems_ && - sgSize == max_sgSize) - { + if (base + n_vecs * vec_sz * sgSize < nelems_) { sycl::vec x; #pragma unroll @@ -155,15 +149,12 @@ struct UnaryContigFunctor // default: use scalar-value function auto sg = ndit.get_sub_group(); - std::uint8_t sgSize = sg.get_local_range()[0]; - std::uint8_t maxsgSize = sg.get_max_local_range()[0]; + std::uint8_t sgSize = sg.get_max_local_range()[0]; size_t base = n_vecs * vec_sz * (ndit.get_group(0) * ndit.get_local_range(0) + - sg.get_group_id()[0] * maxsgSize); + sg.get_group_id()[0] * sgSize); - if ((base + n_vecs * vec_sz * sgSize < nelems_) && - (maxsgSize == sgSize)) - { + if (base + n_vecs * vec_sz * sgSize < nelems_) { sycl::vec arg_vec; #pragma unroll @@ -199,15 +190,12 @@ struct UnaryContigFunctor // default: use scalar-value function auto sg = ndit.get_sub_group(); - std::uint8_t sgSize = sg.get_local_range()[0]; - std::uint8_t maxsgSize = sg.get_max_local_range()[0]; + std::uint8_t sgSize = sg.get_max_local_range()[0]; size_t base = n_vecs * vec_sz *
(ndit.get_group(0) * ndit.get_local_range(0) + - sg.get_group_id()[0] * maxsgSize); + sg.get_group_id()[0] * sgSize); - if ((base + n_vecs * vec_sz * sgSize < nelems_) && - (maxsgSize == sgSize)) - { + if (base + n_vecs * vec_sz * sgSize < nelems_) { sycl::vec arg_vec; sycl::vec res_vec; @@ -406,22 +394,20 @@ struct BinaryContigFunctor { BinaryOperatorT op{}; /* Each work-item processes vec_sz elements, contiguous in memory */ + /* NOTE: work-group size must be divisible by sub-group size */ if constexpr (enable_sg_loadstore && BinaryOperatorT::supports_sg_loadstore::value && BinaryOperatorT::supports_vec::value) { auto sg = ndit.get_sub_group(); - std::uint8_t sgSize = sg.get_local_range()[0]; - std::uint8_t maxsgSize = sg.get_max_local_range()[0]; + std::uint8_t sgSize = sg.get_max_local_range()[0]; size_t base = n_vecs * vec_sz * (ndit.get_group(0) * ndit.get_local_range(0) + sg.get_group_id()[0] * sgSize); - if ((base + n_vecs * vec_sz * sgSize < nelems_) && - (sgSize == maxsgSize)) - { + if (base + n_vecs * vec_sz * sgSize < nelems_) { sycl::vec arg1_vec; sycl::vec arg2_vec; sycl::vec res_vec; @@ -458,16 +444,13 @@ struct BinaryContigFunctor BinaryOperatorT::supports_sg_loadstore::value) { auto sg = ndit.get_sub_group(); - std::uint8_t sgSize = sg.get_local_range()[0]; - std::uint8_t maxsgSize = sg.get_max_local_range()[0]; + std::uint8_t sgSize = sg.get_max_local_range()[0]; size_t base = n_vecs * vec_sz * (ndit.get_group(0) * ndit.get_local_range(0) + sg.get_group_id()[0] * sgSize); - if ((base + n_vecs * vec_sz * sgSize < nelems_) && - (sgSize == maxsgSize)) - { + if (base + n_vecs * vec_sz * sgSize < nelems_) { sycl::vec arg1_vec; sycl::vec arg2_vec; sycl::vec res_vec; @@ -582,13 +565,15 @@ struct BinaryContigMatrixContigRowBroadcastingFunctor void operator()(sycl::nd_item<1> ndit) const { + /* NOTE: work-group size must be divisible by sub-group size */ + BinaryOperatorT op{}; static_assert(BinaryOperatorT::supports_sg_loadstore::value); auto sg = ndit.get_sub_group(); size_t gid = ndit.get_global_linear_id(); - std::uint8_t sgSize = sg.get_local_range()[0]; + std::uint8_t sgSize = sg.get_max_local_range()[0]; size_t base = gid - sg.get_local_id()[0]; if (base + sgSize < n_elems) { @@ -647,13 +632,14 @@ struct BinaryContigRowContigMatrixBroadcastingFunctor void operator()(sycl::nd_item<1> ndit) const { + /* NOTE: work-group size must be divisible by sub-group size */ BinaryOperatorT op{}; static_assert(BinaryOperatorT::supports_sg_loadstore::value); auto sg = ndit.get_sub_group(); size_t gid = ndit.get_global_linear_id(); - std::uint8_t sgSize = sg.get_local_range()[0]; + std::uint8_t sgSize = sg.get_max_local_range()[0]; size_t base = gid - sg.get_local_id()[0]; if (base + sgSize < n_elems) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp index cbb079e3c5..8ece9ff910 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp @@ -73,22 +73,20 @@ struct BinaryInplaceContigFunctor { BinaryInplaceOperatorT op{}; /* Each work-item processes vec_sz elements, contiguous in memory */ + /* NB: Workgroup size must be divisible by sub-group size */ if constexpr (enable_sg_loadstore && BinaryInplaceOperatorT::supports_sg_loadstore::value && BinaryInplaceOperatorT::supports_vec::value) { auto sg = ndit.get_sub_group(); - std::uint8_t sgSize = 
sg.get_local_range()[0]; - std::uint8_t maxsgSize = sg.get_max_local_range()[0]; + std::uint8_t sgSize = sg.get_max_local_range()[0]; size_t base = n_vecs * vec_sz * (ndit.get_group(0) * ndit.get_local_range(0) + sg.get_group_id()[0] * sgSize); - if ((base + n_vecs * vec_sz * sgSize < nelems_) && - (sgSize == maxsgSize)) - { + if (base + n_vecs * vec_sz * sgSize < nelems_) { sycl::vec arg_vec; sycl::vec res_vec; @@ -121,16 +119,13 @@ struct BinaryInplaceContigFunctor BinaryInplaceOperatorT::supports_sg_loadstore::value) { auto sg = ndit.get_sub_group(); - std::uint8_t sgSize = sg.get_local_range()[0]; - std::uint8_t maxsgSize = sg.get_max_local_range()[0]; + std::uint8_t sgSize = sg.get_max_local_range()[0]; size_t base = n_vecs * vec_sz * (ndit.get_group(0) * ndit.get_local_range(0) + sg.get_group_id()[0] * sgSize); - if ((base + n_vecs * vec_sz * sgSize < nelems_) && - (sgSize == maxsgSize)) - { + if (base + n_vecs * vec_sz * sgSize < nelems_) { sycl::vec arg_vec; sycl::vec res_vec; @@ -228,13 +223,14 @@ struct BinaryInplaceRowMatrixBroadcastingFunctor void operator()(sycl::nd_item<1> ndit) const { + /* Workgroup size is expected to be a multiple of sub-group size */ BinaryOperatorT op{}; static_assert(BinaryOperatorT::supports_sg_loadstore::value); auto sg = ndit.get_sub_group(); size_t gid = ndit.get_global_linear_id(); - std::uint8_t sgSize = sg.get_local_range()[0]; + std::uint8_t sgSize = sg.get_max_local_range()[0]; size_t base = gid - sg.get_local_id()[0]; if (base + sgSize < n_elems) { From 9240d5be22b10e915dd2e921090b41b748b86423 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 21 Oct 2024 05:29:18 -0500 Subject: [PATCH 2/4] Add UL suffix literal integral value --- dpctl/tensor/libtensor/include/kernels/alignment.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpctl/tensor/libtensor/include/kernels/alignment.hpp b/dpctl/tensor/libtensor/include/kernels/alignment.hpp index ff4541af4d..9ec14dd027 100644 --- a/dpctl/tensor/libtensor/include/kernels/alignment.hpp +++ b/dpctl/tensor/libtensor/include/kernels/alignment.hpp @@ -30,7 +30,7 @@ namespace kernels namespace alignment_utils { -static constexpr size_t required_alignment = 64; +static constexpr size_t required_alignment = 64UL; template bool is_aligned(Ptr p) { From f623209115a4e201ffbd05c797db512643e61f6c Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 24 Oct 2024 09:38:10 -0500 Subject: [PATCH 3/4] Clean-ups in binary/unary contig call operator --- .../kernels/elementwise_functions/common.hpp | 100 +++++++++--------- 1 file changed, 50 insertions(+), 50 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp index 526bbb7bde..4c27fefa0f 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp @@ -70,6 +70,7 @@ struct UnaryContigFunctor void operator()(sycl::nd_item<1> ndit) const { + constexpr std::uint32_t elems_per_wi = n_vecs * vec_sz; UnaryOperatorT op{}; /* Each work-item processes vec_sz elements, contiguous in memory */ /* NOTE: work-group size must be divisible by sub-group size */ @@ -80,14 +81,15 @@ struct UnaryContigFunctor constexpr resT const_val = UnaryOperatorT::constant_value; auto sg = ndit.get_sub_group(); - std::uint8_t sgSize = sg.get_max_local_range()[0]; - size_t base = n_vecs * vec_sz * + std::uint32_t sgSize = sg.get_max_local_range()[0]; + 
+ size_t base = static_cast(elems_per_wi) * (ndit.get_group(0) * ndit.get_local_range(0) + sg.get_group_id()[0] * sgSize); - if (base + n_vecs * vec_sz * sgSize < nelems_) { + if (base + static_cast(elems_per_wi * sgSize) < nelems_) { sycl::vec res_vec(const_val); #pragma unroll - for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { + for (std::uint32_t it = 0; it < elems_per_wi; it += vec_sz) { size_t offset = base + static_cast(it) * static_cast(sgSize); auto out_multi_ptr = sycl::address_space_cast< @@ -98,9 +100,8 @@ struct UnaryContigFunctor } } else { - for (size_t k = base + sg.get_local_id()[0]; k < nelems_; - k += sgSize) - { + const size_t lane_id = sg.get_local_id()[0]; + for (size_t k = base + lane_id; k < nelems_; k += sgSize) { out[k] = const_val; } } @@ -110,15 +111,16 @@ struct UnaryContigFunctor UnaryOperatorT::supports_vec::value) { auto sg = ndit.get_sub_group(); - std::uint16_t sgSize = sg.get_max_local_range()[0]; - size_t base = n_vecs * vec_sz * + std::uint32_t sgSize = sg.get_max_local_range()[0]; + + size_t base = static_cast(elems_per_wi) * (ndit.get_group(0) * ndit.get_local_range(0) + sg.get_group_id()[0] * sgSize); - if (base + n_vecs * vec_sz * sgSize < nelems_) { + if (base + static_cast(elems_per_wi * sgSize) < nelems_) { sycl::vec x; #pragma unroll - for (std::uint16_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { + for (std::uint32_t it = 0; it < elems_per_wi; it += vec_sz) { size_t offset = base + static_cast(it) * static_cast(sgSize); auto in_multi_ptr = sycl::address_space_cast< @@ -134,9 +136,8 @@ struct UnaryContigFunctor } } else { - for (size_t k = base + sg.get_local_id()[0]; k < nelems_; - k += sgSize) - { + const size_t lane_id = sg.get_local_id()[0]; + for (size_t k = base + lane_id; k < nelems_; k += sgSize) { // scalar call out[k] = op(in[k]); } @@ -149,16 +150,16 @@ struct UnaryContigFunctor // default: use scalar-value function auto sg = ndit.get_sub_group(); - std::uint8_t sgSize = sg.get_max_local_range()[0]; - size_t base = n_vecs * vec_sz * + std::uint32_t sgSize = sg.get_max_local_range()[0]; + size_t base = static_cast(elems_per_wi) * (ndit.get_group(0) * ndit.get_local_range(0) + sg.get_group_id()[0] * sgSize); - if (base + n_vecs * vec_sz * sgSize < nelems_) { + if (base + static_cast(elems_per_wi * sgSize) < nelems_) { sycl::vec arg_vec; #pragma unroll - for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { + for (std::uint32_t it = 0; it < elems_per_wi; it += vec_sz) { size_t offset = base + static_cast(it) * static_cast(sgSize); auto in_multi_ptr = sycl::address_space_cast< @@ -170,16 +171,15 @@ struct UnaryContigFunctor arg_vec = sg.load(in_multi_ptr); #pragma unroll - for (std::uint8_t k = 0; k < vec_sz; ++k) { + for (std::uint32_t k = 0; k < vec_sz; ++k) { arg_vec[k] = op(arg_vec[k]); } sg.store(out_multi_ptr, arg_vec); } } else { - for (size_t k = base + sg.get_local_id()[0]; k < nelems_; - k += sgSize) - { + const size_t lane_id = sg.get_local_id()[0]; + for (size_t k = base + lane_id; k < nelems_; k += sgSize) { out[k] = op(in[k]); } } @@ -190,17 +190,17 @@ struct UnaryContigFunctor // default: use scalar-value function auto sg = ndit.get_sub_group(); - std::uint8_t sgSize = sg.get_max_local_range()[0]; - size_t base = n_vecs * vec_sz * + std::uint32_t sgSize = sg.get_max_local_range()[0]; + size_t base = static_cast(elems_per_wi) * (ndit.get_group(0) * ndit.get_local_range(0) + sg.get_group_id()[0] * sgSize); - if (base + n_vecs * vec_sz * sgSize < nelems_) { + if (base + static_cast(elems_per_wi * 
sgSize) < nelems_) { sycl::vec arg_vec; sycl::vec res_vec; #pragma unroll - for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { + for (std::uint32_t it = 0; it < elems_per_wi; it += vec_sz) { size_t offset = base + static_cast(it) * static_cast(sgSize); auto in_multi_ptr = sycl::address_space_cast< @@ -212,27 +212,27 @@ struct UnaryContigFunctor arg_vec = sg.load(in_multi_ptr); #pragma unroll - for (std::uint8_t k = 0; k < vec_sz; ++k) { + for (std::uint32_t k = 0; k < vec_sz; ++k) { res_vec[k] = op(arg_vec[k]); } sg.store(out_multi_ptr, res_vec); } } else { - for (size_t k = base + sg.get_local_id()[0]; k < nelems_; - k += sgSize) - { + const size_t lane_id = sg.get_local_id()[0]; + for (size_t k = base + lane_id; k < nelems_; k += sgSize) { out[k] = op(in[k]); } } } else { - std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0]; + size_t sgSize = ndit.get_sub_group().get_local_range()[0]; size_t base = ndit.get_global_linear_id(); + const size_t elems_per_sg = sgSize * elems_per_wi; - base = (base / sgSize) * sgSize * n_vecs * vec_sz + (base % sgSize); + base = (base / sgSize) * elems_per_sg + (base % sgSize); for (size_t offset = base; - offset < std::min(nelems_, base + sgSize * (n_vecs * vec_sz)); + offset < std::min(nelems_, base + elems_per_sg); offset += sgSize) { out[offset] = op(in[offset]); @@ -392,6 +392,7 @@ struct BinaryContigFunctor void operator()(sycl::nd_item<1> ndit) const { + constexpr std::uint32_t elems_per_wi = n_vecs * vec_sz; BinaryOperatorT op{}; /* Each work-item processes vec_sz elements, contiguous in memory */ /* NOTE: work-group size must be divisible by sub-group size */ @@ -401,19 +402,19 @@ struct BinaryContigFunctor BinaryOperatorT::supports_vec::value) { auto sg = ndit.get_sub_group(); - std::uint8_t sgSize = sg.get_max_local_range()[0]; + std::uint16_t sgSize = sg.get_max_local_range()[0]; - size_t base = n_vecs * vec_sz * + size_t base = static_cast(elems_per_wi) * (ndit.get_group(0) * ndit.get_local_range(0) + sg.get_group_id()[0] * sgSize); - if (base + n_vecs * vec_sz * sgSize < nelems_) { + if (base + static_cast(elems_per_wi * sgSize) < nelems_) { sycl::vec arg1_vec; sycl::vec arg2_vec; sycl::vec res_vec; #pragma unroll - for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { + for (std::uint32_t it = 0; it < elems_per_wi; it += vec_sz) { size_t offset = base + static_cast(it) * static_cast(sgSize); auto in1_multi_ptr = sycl::address_space_cast< @@ -433,9 +434,8 @@ struct BinaryContigFunctor } } else { - for (size_t k = base + sg.get_local_id()[0]; k < nelems_; - k += sgSize) - { + const std::size_t lane_id = sg.get_local_id()[0]; + for (size_t k = base + lane_id; k < nelems_; k += sgSize) { out[k] = op(in1[k], in2[k]); } } @@ -446,17 +446,17 @@ struct BinaryContigFunctor auto sg = ndit.get_sub_group(); std::uint8_t sgSize = sg.get_max_local_range()[0]; - size_t base = n_vecs * vec_sz * + size_t base = static_cast(elems_per_wi) * (ndit.get_group(0) * ndit.get_local_range(0) + sg.get_group_id()[0] * sgSize); - if (base + n_vecs * vec_sz * sgSize < nelems_) { + if (base + static_cast(elems_per_wi * sgSize) < nelems_) { sycl::vec arg1_vec; sycl::vec arg2_vec; sycl::vec res_vec; #pragma unroll - for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { + for (std::uint32_t it = 0; it < elems_per_wi; it += vec_sz) { size_t offset = base + static_cast(it) * static_cast(sgSize); auto in1_multi_ptr = sycl::address_space_cast< @@ -480,20 +480,20 @@ struct BinaryContigFunctor } } else { - for (size_t k = base + 
sg.get_local_id()[0]; k < nelems_; - k += sgSize) - { + const std::size_t lane_id = sg.get_local_id()[0]; + for (size_t k = base + lane_id; k < nelems_; k += sgSize) { out[k] = op(in1[k], in2[k]); } } } else { - std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0]; - size_t base = ndit.get_global_linear_id(); + const size_t sgSize = ndit.get_sub_group().get_local_range()[0]; + const size_t gid = ndit.get_global_linear_id(); + const size_t elems_per_sg = sgSize * elems_per_wi; - base = (base / sgSize) * sgSize * n_vecs * vec_sz + (base % sgSize); + const size_t base = (gid / sgSize) * elems_per_sg + (gid % sgSize); for (size_t offset = base; - offset < std::min(nelems_, base + sgSize * (n_vecs * vec_sz)); + offset < std::min(nelems_, base + elems_per_sg); offset += sgSize) { out[offset] = op(in1[offset], in2[offset]); From 123590255e987d29cbaf439da2e2841e5281fd6c Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 24 Oct 2024 15:44:34 -0500 Subject: [PATCH 4/4] Change vec_sz and n_vecs settings for contiguous kernels For short data types, each work-item may need to load several elements to ensure that it uses all the data from a cache line. For example, with simd32, we load 4 elements for 8-bit types (2 cache lines), 2 for 16-bit types, and 1 for 32-bit and wider types. n_vecs is set to 1 to avoid cache thrashing, which occurs when the second iteration of some work-items begins to access memory at higher addresses while other work-items are still working on the lower addresses, causing cache evictions. The size of the work-groups was increased from 128 to 256, chosen so that all 8 threads of a single vector core with simd32 are used. --- .../kernels/elementwise_functions/abs.hpp | 9 ++- .../kernels/elementwise_functions/acos.hpp | 9 ++- .../kernels/elementwise_functions/acosh.hpp | 9 ++- .../kernels/elementwise_functions/add.hpp | 12 ++-- .../kernels/elementwise_functions/angle.hpp | 9 ++- .../kernels/elementwise_functions/asin.hpp | 9 ++- .../kernels/elementwise_functions/asinh.hpp | 9 ++- .../kernels/elementwise_functions/atan.hpp | 9 ++- .../kernels/elementwise_functions/atan2.hpp | 8 ++- .../kernels/elementwise_functions/atanh.hpp | 9 ++- .../elementwise_functions/bitwise_and.hpp | 12 ++-- .../elementwise_functions/bitwise_invert.hpp | 7 ++- .../bitwise_left_shift.hpp | 12 ++-- .../elementwise_functions/bitwise_or.hpp | 12 ++-- .../bitwise_right_shift.hpp | 12 ++-- .../elementwise_functions/bitwise_xor.hpp | 12 ++-- .../kernels/elementwise_functions/cbrt.hpp | 8 ++- .../kernels/elementwise_functions/ceil.hpp | 7 ++- .../kernels/elementwise_functions/common.hpp | 56 +++++++--------- .../elementwise_functions/common_inplace.hpp | 50 ++++++++-------- .../kernels/elementwise_functions/conj.hpp | 9 ++- .../elementwise_functions/copysign.hpp | 8 ++- .../kernels/elementwise_functions/cos.hpp | 9 ++- .../kernels/elementwise_functions/cosh.hpp | 9 ++- .../kernels/elementwise_functions/equal.hpp | 8 ++- .../kernels/elementwise_functions/exp.hpp | 9 ++- .../kernels/elementwise_functions/exp2.hpp | 9 ++- .../kernels/elementwise_functions/expm1.hpp | 9 ++- .../kernels/elementwise_functions/floor.hpp | 9 ++- .../elementwise_functions/floor_divide.hpp | 12 ++-- .../kernels/elementwise_functions/greater.hpp | 8 ++- .../elementwise_functions/greater_equal.hpp | 8 ++- .../kernels/elementwise_functions/hypot.hpp | 8 ++- .../kernels/elementwise_functions/imag.hpp | 9 ++- .../elementwise_functions/isfinite.hpp | 7 ++- .../kernels/elementwise_functions/isinf.hpp | 8 ++- .../kernels/elementwise_functions/isnan.hpp | 8 ++-
.../kernels/elementwise_functions/less.hpp | 10 +++- .../elementwise_functions/less_equal.hpp | 8 ++- .../kernels/elementwise_functions/log.hpp | 9 ++- .../kernels/elementwise_functions/log10.hpp | 9 ++- .../kernels/elementwise_functions/log1p.hpp | 9 ++- .../kernels/elementwise_functions/log2.hpp | 9 ++- .../elementwise_functions/logaddexp.hpp | 9 ++- .../elementwise_functions/logical_and.hpp | 10 +++- .../elementwise_functions/logical_not.hpp | 9 ++- .../elementwise_functions/logical_or.hpp | 10 +++- .../elementwise_functions/logical_xor.hpp | 10 +++- .../kernels/elementwise_functions/maximum.hpp | 10 +++- .../kernels/elementwise_functions/minimum.hpp | 10 +++- .../elementwise_functions/multiply.hpp | 14 +++-- .../elementwise_functions/negative.hpp | 9 ++- .../elementwise_functions/nextafter.hpp | 8 ++- .../elementwise_functions/not_equal.hpp | 10 +++- .../elementwise_functions/positive.hpp | 9 ++- .../kernels/elementwise_functions/pow.hpp | 14 +++-- .../kernels/elementwise_functions/proj.hpp | 9 ++- .../kernels/elementwise_functions/real.hpp | 9 ++- .../elementwise_functions/reciprocal.hpp | 9 ++- .../elementwise_functions/remainder.hpp | 14 +++-- .../kernels/elementwise_functions/round.hpp | 9 ++- .../kernels/elementwise_functions/rsqrt.hpp | 10 +++- .../kernels/elementwise_functions/sign.hpp | 9 ++- .../kernels/elementwise_functions/signbit.hpp | 8 ++- .../kernels/elementwise_functions/sin.hpp | 9 ++- .../kernels/elementwise_functions/sinh.hpp | 9 ++- .../kernels/elementwise_functions/sqrt.hpp | 9 ++- .../kernels/elementwise_functions/square.hpp | 8 ++- .../elementwise_functions/subtract.hpp | 14 +++-- .../kernels/elementwise_functions/tan.hpp | 9 ++- .../kernels/elementwise_functions/tanh.hpp | 9 ++- .../elementwise_functions/true_divide.hpp | 14 +++-- .../kernels/elementwise_functions/trunc.hpp | 9 ++- .../elementwise_functions/vec_size_util.hpp | 58 +++++++++++++++++++ 74 files changed, 573 insertions(+), 265 deletions(-) create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp index 411040bada..220d31b687 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp @@ -32,9 +32,11 @@ #include #include "cabs_impl.hpp" -#include "kernels/elementwise_functions/common.hpp" +#include "vec_size_util.hpp" #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -50,6 +52,7 @@ namespace abs namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; template struct AbsFunctor @@ -89,8 +92,8 @@ template struct AbsFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using AbsContigFunctor = elementwise_common::UnaryContigFunctor #include -#include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "vec_size_util.hpp" #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -48,6 +50,7 @@ namespace acos namespace td_ns = dpctl::tensor::type_dispatch; +using 
dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; template struct AcosFunctor @@ -128,8 +131,8 @@ template struct AcosFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using AcosContigFunctor = elementwise_common::UnaryContigFunctor #include -#include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "vec_size_util.hpp" #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -48,6 +50,7 @@ namespace acosh namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; template struct AcoshFunctor @@ -155,8 +158,8 @@ template struct AcoshFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using AcoshContigFunctor = elementwise_common::UnaryContigFunctor #include "sycl_complex.hpp" +#include "vec_size_util.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -50,6 +52,8 @@ namespace add namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; + template struct AddFunctor { @@ -110,8 +114,8 @@ template struct AddFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using AddContigFunctor = elementwise_common::BinaryContigFunctor struct AddInplaceFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using AddInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor< argT, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp index 034b71438f..4f36ef595f 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp @@ -30,10 +30,12 @@ #include #include -#include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "vec_size_util.hpp" #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -49,6 +51,7 @@ namespace angle namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; template struct AngleFunctor @@ -74,8 +77,8 @@ template struct AngleFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using AngleContigFunctor = elementwise_common::UnaryContigFunctor #include -#include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "vec_size_util.hpp" #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -48,6 +50,7 @@ namespace asin namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; template struct AsinFunctor @@ -148,8 +151,8 @@ template struct AsinFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using 
AsinContigFunctor = elementwise_common::UnaryContigFunctor #include -#include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "vec_size_util.hpp" #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -48,6 +50,7 @@ namespace asinh namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; template struct AsinhFunctor @@ -131,8 +134,8 @@ template struct AsinhFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using AsinhContigFunctor = elementwise_common::UnaryContigFunctor #include -#include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "vec_size_util.hpp" #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -49,6 +51,7 @@ namespace atan namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; template struct AtanFunctor @@ -138,8 +141,8 @@ template struct AtanFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using AtanContigFunctor = elementwise_common::UnaryContigFunctor #include +#include "vec_size_util.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -48,6 +50,8 @@ namespace atan2 namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; + template struct Atan2Functor { @@ -68,8 +72,8 @@ template struct Atan2Functor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using Atan2ContigFunctor = elementwise_common::BinaryContigFunctor #include -#include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "vec_size_util.hpp" #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -49,6 +51,7 @@ namespace atanh namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; template struct AtanhFunctor @@ -132,8 +135,8 @@ template struct AtanhFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using AtanhContigFunctor = elementwise_common::UnaryContigFunctor #include +#include "vec_size_util.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -48,6 +50,8 @@ namespace bitwise_and namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; + template struct BitwiseAndFunctor { @@ -91,8 +95,8 @@ struct BitwiseAndFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using BitwiseAndContigFunctor = elementwise_common::BinaryContigFunctor< argT1, @@ -290,8 +294,8 @@ template struct BitwiseAndInplaceFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using BitwiseAndInplaceContigFunctor = 
elementwise_common::BinaryInplaceContigFunctor< diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp index d6c1bc72db..1d23e86ab7 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp @@ -30,6 +30,8 @@ #include #include +#include "vec_size_util.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -49,6 +51,7 @@ namespace bitwise_invert namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::vec_cast; template struct BitwiseInvertFunctor @@ -80,8 +83,8 @@ template struct BitwiseInvertFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using BitwiseInvertContigFunctor = elementwise_common::UnaryContigFunctor #include +#include "vec_size_util.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -49,6 +51,8 @@ namespace bitwise_left_shift namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; + template struct BitwiseLeftShiftFunctor { @@ -100,8 +104,8 @@ struct BitwiseLeftShiftFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using BitwiseLeftShiftContigFunctor = elementwise_common::BinaryContigFunctor< argT1, @@ -304,8 +308,8 @@ template struct BitwiseLeftShiftInplaceFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using BitwiseLeftShiftInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor< diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp index 71f3e809d9..959be48395 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp @@ -28,6 +28,8 @@ #include #include +#include "vec_size_util.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -48,6 +50,8 @@ namespace bitwise_or namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; + template struct BitwiseOrFunctor { static_assert(std::is_same_v); @@ -90,8 +94,8 @@ template struct BitwiseOrFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using BitwiseOrContigFunctor = elementwise_common::BinaryContigFunctor< argT1, @@ -286,8 +290,8 @@ template struct BitwiseOrInplaceFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using BitwiseOrInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor< diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp index e4dfee2ed6..8ecc5a5564 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp @@ -29,6 +29,8 
@@ #include #include +#include "vec_size_util.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -49,6 +51,8 @@ namespace bitwise_right_shift namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; + template struct BitwiseRightShiftFunctor { @@ -101,8 +105,8 @@ struct BitwiseRightShiftFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using BitwiseRightShiftContigFunctor = elementwise_common::BinaryContigFunctor< argT1, @@ -308,8 +312,8 @@ template struct BitwiseRightShiftInplaceFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using BitwiseRightShiftInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor< diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp index d035b31170..2356a9e470 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp @@ -28,6 +28,8 @@ #include #include +#include "vec_size_util.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -48,6 +50,8 @@ namespace bitwise_xor namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; + template struct BitwiseXorFunctor { @@ -91,8 +95,8 @@ struct BitwiseXorFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using BitwiseXorContigFunctor = elementwise_common::BinaryContigFunctor< argT1, @@ -290,8 +294,8 @@ template struct BitwiseXorInplaceFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using BitwiseXorInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor< diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp index 4f2634f17a..5892da7564 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp @@ -30,6 +30,8 @@ #include #include +#include "vec_size_util.hpp" + #include "kernels/elementwise_functions/common.hpp" #include "kernels/dpctl_tensor_types.hpp" @@ -48,6 +50,8 @@ namespace cbrt namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; + template struct CbrtFunctor { @@ -65,8 +69,8 @@ template struct CbrtFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using CbrtContigFunctor = elementwise_common::UnaryContigFunctor #include +#include "vec_size_util.hpp" + #include "kernels/elementwise_functions/common.hpp" #include "kernels/dpctl_tensor_types.hpp" @@ -47,6 +49,7 @@ namespace ceil namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; template struct CeilFunctor @@ -78,8 +81,8 @@ template struct CeilFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using CeilContigFunctor = elementwise_common::UnaryContigFunctor struct UnaryContigFunctor { @@ -108,7 +108,7 @@ struct UnaryContigFunctor } else if constexpr (enable_sg_loadstore && 
UnaryOperatorT::supports_sg_loadstore::value && - UnaryOperatorT::supports_vec::value) + UnaryOperatorT::supports_vec::value && (vec_sz > 1)) { auto sg = ndit.get_sub_group(); std::uint32_t sgSize = sg.get_max_local_range()[0]; @@ -280,8 +280,8 @@ template class kernel_name, - unsigned int vec_sz = 4, - unsigned int n_vecs = 2> + unsigned int vec_sz = 1, + unsigned int n_vecs = 1> sycl::event unary_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg_p, @@ -291,7 +291,9 @@ sycl::event unary_contig_impl(sycl::queue &exec_q, sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); - const size_t lws = 128; + // Choose work-group size to occupy all threads of since vector core + // busy (8 threads, simd32) + const size_t lws = 256; const size_t n_groups = ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); const auto gws_range = sycl::range<1>(n_groups * lws); @@ -370,8 +372,8 @@ template struct BinaryContigFunctor { @@ -399,7 +401,7 @@ struct BinaryContigFunctor if constexpr (enable_sg_loadstore && BinaryOperatorT::supports_sg_loadstore::value && - BinaryOperatorT::supports_vec::value) + BinaryOperatorT::supports_vec::value && (vec_sz > 1)) { auto sg = ndit.get_sub_group(); std::uint16_t sgSize = sg.get_max_local_range()[0]; @@ -570,11 +572,11 @@ struct BinaryContigMatrixContigRowBroadcastingFunctor BinaryOperatorT op{}; static_assert(BinaryOperatorT::supports_sg_loadstore::value); - auto sg = ndit.get_sub_group(); - size_t gid = ndit.get_global_linear_id(); + const auto &sg = ndit.get_sub_group(); + const size_t gid = ndit.get_global_linear_id(); - std::uint8_t sgSize = sg.get_max_local_range()[0]; - size_t base = gid - sg.get_local_id()[0]; + const size_t sgSize = sg.get_max_local_range()[0]; + const size_t base = gid - sg.get_local_id()[0]; if (base + sgSize < n_elems) { auto in1_multi_ptr = sycl::address_space_cast< @@ -597,9 +599,8 @@ struct BinaryContigMatrixContigRowBroadcastingFunctor sg.store(out_multi_ptr, res_el); } else { - for (size_t k = base + sg.get_local_id()[0]; k < n_elems; - k += sgSize) - { + const size_t lane_id = sg.get_local_id()[0]; + for (size_t k = base + lane_id; k < n_elems; k += sgSize) { res[k] = op(mat[k], padded_vec[k % n1]); } } @@ -636,11 +637,11 @@ struct BinaryContigRowContigMatrixBroadcastingFunctor BinaryOperatorT op{}; static_assert(BinaryOperatorT::supports_sg_loadstore::value); - auto sg = ndit.get_sub_group(); + const auto &sg = ndit.get_sub_group(); size_t gid = ndit.get_global_linear_id(); - std::uint8_t sgSize = sg.get_max_local_range()[0]; - size_t base = gid - sg.get_local_id()[0]; + const size_t sgSize = sg.get_max_local_range()[0]; + const size_t base = gid - sg.get_local_id()[0]; if (base + sgSize < n_elems) { auto in1_multi_ptr = sycl::address_space_cast< @@ -663,9 +664,8 @@ struct BinaryContigRowContigMatrixBroadcastingFunctor sg.store(out_multi_ptr, res_el); } else { - for (size_t k = base + sg.get_local_id()[0]; k < n_elems; - k += sgSize) - { + const size_t lane_id = sg.get_local_id()[0]; + for (size_t k = base + lane_id; k < n_elems; k += sgSize) { res[k] = op(padded_vec[k % n1], mat[k]); } } @@ -761,8 +761,8 @@ template class kernel_name, - unsigned int vec_sz = 4, - unsigned int n_vecs = 2> + unsigned int vec_sz = 1, + unsigned int n_vecs = 1> sycl::event binary_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg1_p, @@ -776,7 +776,9 @@ sycl::event binary_contig_impl(sycl::queue &exec_q, sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { 
cgh.depends_on(depends); - const size_t lws = 128; + // Choose work-group size to occupy all threads of since vector core + // busy (8 threads, simd32) + const size_t lws = 256; const size_t n_groups = ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); const auto gws_range = sycl::range<1>(n_groups * lws); @@ -916,7 +918,7 @@ sycl::event binary_contig_matrix_contig_row_broadcast_impl( // We read sg.load(&padded_vec[(base / n0)]). The vector is padded to // ensure that reads are accessible - const size_t lws = 128; + const size_t lws = 256; sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(make_padded_vec_ev); @@ -997,7 +999,7 @@ sycl::event binary_contig_row_contig_matrix_broadcast_impl( // We read sg.load(&padded_vec[(base / n0)]). The vector is padded to // ensure that reads are accessible - const size_t lws = 128; + const size_t lws = 256; sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(make_padded_vec_ev); diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp index 8ece9ff910..f01468247e 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp @@ -51,8 +51,8 @@ using dpctl::tensor::kernels::alignment_utils::required_alignment; template struct BinaryInplaceContigFunctor { @@ -72,27 +72,29 @@ struct BinaryInplaceContigFunctor void operator()(sycl::nd_item<1> ndit) const { BinaryInplaceOperatorT op{}; + constexpr std::uint32_t elems_per_wi = vec_sz * n_vecs; /* Each work-item processes vec_sz elements, contiguous in memory */ /* NB: Workgroup size must be divisible by sub-group size */ if constexpr (enable_sg_loadstore && BinaryInplaceOperatorT::supports_sg_loadstore::value && - BinaryInplaceOperatorT::supports_vec::value) + BinaryInplaceOperatorT::supports_vec::value && + (vec_sz > 1)) { auto sg = ndit.get_sub_group(); std::uint8_t sgSize = sg.get_max_local_range()[0]; - size_t base = n_vecs * vec_sz * + size_t base = static_cast(elems_per_wi) * (ndit.get_group(0) * ndit.get_local_range(0) + sg.get_group_id()[0] * sgSize); - if (base + n_vecs * vec_sz * sgSize < nelems_) { + if (base + static_cast(elems_per_wi * sgSize) < nelems_) { sycl::vec arg_vec; sycl::vec res_vec; #pragma unroll - for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { + for (std::uint32_t it = 0; it < elems_per_wi; it += vec_sz) { auto rhs_multi_ptr = sycl::address_space_cast< sycl::access::address_space::global_space, sycl::access::decorated::yes>(&rhs[base + it * sgSize]); @@ -108,9 +110,8 @@ struct BinaryInplaceContigFunctor } } else { - for (size_t k = base + sg.get_local_id()[0]; k < nelems_; - k += sgSize) - { + const size_t lane_id = sg.get_local_id()[0]; + for (size_t k = base + lane_id; k < nelems_; k += sgSize) { op(lhs[k], rhs[k]); } } @@ -119,18 +120,18 @@ struct BinaryInplaceContigFunctor BinaryInplaceOperatorT::supports_sg_loadstore::value) { auto sg = ndit.get_sub_group(); - std::uint8_t sgSize = sg.get_max_local_range()[0]; + std::uint32_t sgSize = sg.get_max_local_range()[0]; - size_t base = n_vecs * vec_sz * + size_t base = static_cast(elems_per_wi) * (ndit.get_group(0) * ndit.get_local_range(0) + sg.get_group_id()[0] * sgSize); - if (base + n_vecs * vec_sz * sgSize < nelems_) { + if (base + static_cast(elems_per_wi * sgSize) < nelems_) { sycl::vec arg_vec; sycl::vec res_vec; 
#pragma unroll - for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { + for (std::uint32_t it = 0; it < elems_per_wi; it += vec_sz) { auto rhs_multi_ptr = sycl::address_space_cast< sycl::access::address_space::global_space, sycl::access::decorated::yes>(&rhs[base + it * sgSize]); @@ -141,27 +142,27 @@ struct BinaryInplaceContigFunctor arg_vec = sg.load(rhs_multi_ptr); res_vec = sg.load(lhs_multi_ptr); #pragma unroll - for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) { + for (std::uint32_t vec_id = 0; vec_id < vec_sz; ++vec_id) { op(res_vec[vec_id], arg_vec[vec_id]); } sg.store(lhs_multi_ptr, res_vec); } } else { - for (size_t k = base + sg.get_local_id()[0]; k < nelems_; - k += sgSize) - { + const size_t lane_id = sg.get_local_id()[0]; + for (size_t k = base + lane_id; k < nelems_; k += sgSize) { op(lhs[k], rhs[k]); } } } else { - std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0]; + const size_t sgSize = ndit.get_sub_group().get_local_range()[0]; size_t base = ndit.get_global_linear_id(); + const size_t elems_per_sg = elems_per_wi * sgSize; - base = (base / sgSize) * sgSize * n_vecs * vec_sz + (base % sgSize); + base = (base / sgSize) * elems_per_sg + (base % sgSize); for (size_t offset = base; - offset < std::min(nelems_, base + sgSize * (n_vecs * vec_sz)); + offset < std::min(nelems_, base + elems_per_sg); offset += sgSize) { op(lhs[offset], rhs[offset]); @@ -303,8 +304,8 @@ template class kernel_name, - unsigned int vec_sz = 4, - unsigned int n_vecs = 2> + unsigned int vec_sz = 1, + unsigned int n_vecs = 1> sycl::event binary_inplace_contig_impl(sycl::queue &exec_q, size_t nelems, @@ -317,7 +318,8 @@ binary_inplace_contig_impl(sycl::queue &exec_q, sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); - const size_t lws = 128; + // choose WG as n_threads_per_core * simd_width = 8 * 32 + const size_t lws = 256; const size_t n_groups = ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); const auto gws_range = sycl::range<1>(n_groups * lws); @@ -438,7 +440,7 @@ sycl::event binary_inplace_row_matrix_broadcast_impl( // We read sg.load(&padded_vec[(base / n0)]). 
The vector is padded to // ensure that reads are accessible - const size_t lws = 128; + const size_t lws = 256; sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(make_padded_vec_ev); diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp index 4953feedb2..56f856f0c9 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp @@ -31,10 +31,12 @@ #include #include -#include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "vec_size_util.hpp" #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -50,6 +52,7 @@ namespace conj namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; template struct ConjFunctor @@ -82,8 +85,8 @@ template struct ConjFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using ConjContigFunctor = elementwise_common::UnaryContigFunctor #include +#include "vec_size_util.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -48,6 +50,8 @@ namespace copysign namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; + template struct CopysignFunctor { @@ -82,8 +86,8 @@ template struct CopysignFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using CopysignContigFunctor = elementwise_common::BinaryContigFunctor #include -#include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "vec_size_util.hpp" #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -48,6 +50,7 @@ namespace cos namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; template struct CosFunctor @@ -163,8 +166,8 @@ template struct CosFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using CosContigFunctor = elementwise_common::UnaryContigFunctor #include -#include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "vec_size_util.hpp" #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -48,6 +50,7 @@ namespace cosh namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; template struct CoshFunctor @@ -153,8 +156,8 @@ template struct CoshFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using CoshContigFunctor = elementwise_common::UnaryContigFunctor #include "sycl_complex.hpp" +#include "vec_size_util.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -49,6 +51,8 @@ namespace equal namespace td_ns = 
dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; + template struct EqualFunctor { static_assert(std::is_same_v); @@ -119,8 +123,8 @@ template struct EqualFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using EqualContigFunctor = elementwise_common::BinaryContigFunctor #include -#include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "vec_size_util.hpp" #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -48,6 +50,7 @@ namespace exp namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; template struct ExpFunctor @@ -122,8 +125,8 @@ template struct ExpFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using ExpContigFunctor = elementwise_common::UnaryContigFunctor #include -#include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "vec_size_util.hpp" #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -49,6 +51,7 @@ namespace exp2 namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; template struct Exp2Functor @@ -124,8 +127,8 @@ template struct Exp2Functor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using Exp2ContigFunctor = elementwise_common::UnaryContigFunctor #include -#include "kernels/elementwise_functions/common.hpp" +#include "vec_size_util.hpp" #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -49,6 +51,7 @@ namespace expm1 namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; template struct Expm1Functor @@ -136,8 +139,8 @@ template struct Expm1Functor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using Expm1ContigFunctor = elementwise_common::UnaryContigFunctor #include -#include "kernels/elementwise_functions/common.hpp" +#include "vec_size_util.hpp" #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -47,6 +49,7 @@ namespace floor namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; template struct FloorFunctor @@ -78,8 +81,8 @@ template struct FloorFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using FloorContigFunctor = elementwise_common::UnaryContigFunctor #include +#include "vec_size_util.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -49,6 +51,8 @@ namespace floor_divide namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; +using 
dpctl::tensor::kernels::vec_size_utils::VecSize_v; + template struct FloorDivideFunctor { @@ -126,8 +130,8 @@ struct FloorDivideFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using FloorDivideContigFunctor = elementwise_common::BinaryContigFunctor< argT1, @@ -367,8 +371,8 @@ template struct FloorDivideInplaceFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using FloorDivideInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor< diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp index 05c2a36b0c..e1814110be 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp @@ -30,6 +30,8 @@ #include #include +#include "vec_size_util.hpp" + #include "utils/math_utils.hpp" #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" @@ -50,6 +52,8 @@ namespace greater namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; + template struct GreaterFunctor { static_assert(std::is_same_v); @@ -120,8 +124,8 @@ template struct GreaterFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using GreaterContigFunctor = elementwise_common::BinaryContigFunctor #include +#include "vec_size_util.hpp" + #include "utils/math_utils.hpp" #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" @@ -50,6 +52,8 @@ namespace greater_equal namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; + template struct GreaterEqualFunctor { @@ -121,8 +125,8 @@ struct GreaterEqualFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using GreaterEqualContigFunctor = elementwise_common::BinaryContigFunctor< argT1, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp index c5b68644a9..09c2055697 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp @@ -29,6 +29,8 @@ #include #include +#include "vec_size_util.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -48,6 +50,8 @@ namespace hypot namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; + template struct HypotFunctor { @@ -84,8 +88,8 @@ template struct HypotFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using HypotContigFunctor = elementwise_common::BinaryContigFunctor #include -#include "kernels/elementwise_functions/common.hpp" +#include "vec_size_util.hpp" #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -49,6 +51,7 @@ namespace imag namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; template struct ImagFunctor @@ -78,8 +81,8 @@ template struct ImagFunctor 
template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using ImagContigFunctor = elementwise_common::UnaryContigFunctor #include +#include "vec_size_util.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -45,6 +47,7 @@ namespace isfinite namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; using dpctl::tensor::type_utils::vec_cast; @@ -97,8 +100,8 @@ template struct IsFiniteFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using IsFiniteContigFunctor = elementwise_common::UnaryContigFunctor #include +#include "vec_size_util.hpp" + #include "kernels/dpctl_tensor_types.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -46,6 +49,7 @@ namespace isinf namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; using dpctl::tensor::type_utils::vec_cast; @@ -96,8 +100,8 @@ template struct IsInfFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using IsInfContigFunctor = elementwise_common::UnaryContigFunctor #include +#include "vec_size_util.hpp" + #include "kernels/dpctl_tensor_types.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -45,6 +48,7 @@ namespace isnan namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; using dpctl::tensor::type_utils::vec_cast; @@ -94,8 +98,8 @@ template struct IsNanFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using IsNanContigFunctor = elementwise_common::UnaryContigFunctor #include -#include "kernels/dpctl_tensor_types.hpp" +#include "vec_size_util.hpp" + #include "utils/math_utils.hpp" #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "kernels/elementwise_functions/common.hpp" namespace dpctl @@ -49,6 +51,8 @@ namespace less namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; + template struct LessFunctor { static_assert(std::is_same_v); @@ -118,8 +122,8 @@ template struct LessFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using LessContigFunctor = elementwise_common::BinaryContigFunctor #include +#include "vec_size_util.hpp" + #include "utils/math_utils.hpp" #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" @@ -49,6 +51,8 @@ namespace less_equal namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; + template struct LessEqualFunctor { static_assert(std::is_same_v); @@ -119,8 +123,8 @@ template struct LessEqualFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using LessEqualContigFunctor = elementwise_common::BinaryContigFunctor< argT1, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp index a3e28ef5d7..bb32310cf6 100644 --- 
a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp @@ -30,10 +30,12 @@ #include #include -#include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "vec_size_util.hpp" #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -49,6 +51,7 @@ namespace log namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; template struct LogFunctor @@ -78,8 +81,8 @@ template struct LogFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using LogContigFunctor = elementwise_common::UnaryContigFunctor #include -#include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "vec_size_util.hpp" #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -50,6 +52,7 @@ namespace log10 namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; using dpctl::tensor::type_utils::vec_cast; @@ -97,8 +100,8 @@ template struct Log10Functor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using Log10ContigFunctor = elementwise_common::UnaryContigFunctor #include -#include "kernels/elementwise_functions/common.hpp" +#include "vec_size_util.hpp" #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -48,6 +50,7 @@ namespace log1p namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; // TODO: evaluate precision against alternatives @@ -102,8 +105,8 @@ template struct Log1pFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using Log1pContigFunctor = elementwise_common::UnaryContigFunctor #include -#include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "vec_size_util.hpp" #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -50,6 +52,7 @@ namespace log2 namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; using dpctl::tensor::type_utils::vec_cast; @@ -98,8 +101,8 @@ template struct Log2Functor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using Log2ContigFunctor = elementwise_common::UnaryContigFunctor #include -#include "kernels/dpctl_tensor_types.hpp" +#include "vec_size_util.hpp" + #include "utils/math_utils.hpp" #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "kernels/elementwise_functions/common.hpp" namespace dpctl @@ -51,6 +53,7 @@ namespace logaddexp namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = 
dpctl::tensor::type_utils; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; using dpctl::tensor::type_utils::vec_cast; @@ -99,8 +102,8 @@ template struct LogAddExpFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using LogAddExpContigFunctor = elementwise_common::BinaryContigFunctor< argT1, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp index f15caa02e6..fde8cde51d 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp @@ -30,11 +30,13 @@ #include #include -#include "kernels/dpctl_tensor_types.hpp" +#include "vec_size_util.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "kernels/elementwise_functions/common.hpp" namespace dpctl @@ -49,6 +51,8 @@ namespace logical_and namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; + template struct LogicalAndFunctor { @@ -93,8 +97,8 @@ struct LogicalAndFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using LogicalAndContigFunctor = elementwise_common::BinaryContigFunctor< argT1, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp index 7c83e07072..3268b0cc33 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp @@ -30,7 +30,10 @@ #include #include +#include "vec_size_util.hpp" + #include "kernels/dpctl_tensor_types.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -47,6 +50,8 @@ namespace logical_not namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; + template struct LogicalNotFunctor { static_assert(std::is_same_v); @@ -66,8 +71,8 @@ template struct LogicalNotFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using LogicalNotContigFunctor = elementwise_common::UnaryContigFunctor #include -#include "kernels/dpctl_tensor_types.hpp" +#include "vec_size_util.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "kernels/elementwise_functions/common.hpp" namespace dpctl @@ -49,6 +51,8 @@ namespace logical_or namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; + template struct LogicalOrFunctor { static_assert(std::is_same_v); @@ -92,8 +96,8 @@ template struct LogicalOrFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using LogicalOrContigFunctor = elementwise_common::BinaryContigFunctor< argT1, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp index dc41760985..e4e56b40cd 100644 --- 
a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp @@ -30,11 +30,13 @@ #include #include -#include "kernels/dpctl_tensor_types.hpp" +#include "vec_size_util.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "kernels/elementwise_functions/common.hpp" namespace dpctl @@ -49,6 +51,8 @@ namespace logical_xor namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; + template struct LogicalXorFunctor { @@ -94,8 +98,8 @@ struct LogicalXorFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using LogicalXorContigFunctor = elementwise_common::BinaryContigFunctor< argT1, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp index e73704bad8..5ca76925bf 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp @@ -29,12 +29,14 @@ #include #include -#include "kernels/dpctl_tensor_types.hpp" +#include "vec_size_util.hpp" + #include "utils/math_utils.hpp" #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "kernels/elementwise_functions/common.hpp" namespace dpctl @@ -49,6 +51,8 @@ namespace maximum namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; + template struct MaximumFunctor { @@ -96,8 +100,8 @@ template struct MaximumFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using MaximumContigFunctor = elementwise_common::BinaryContigFunctor #include -#include "kernels/dpctl_tensor_types.hpp" +#include "vec_size_util.hpp" + #include "utils/math_utils.hpp" #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "kernels/elementwise_functions/common.hpp" namespace dpctl @@ -49,6 +51,8 @@ namespace minimum namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; + template struct MinimumFunctor { @@ -96,8 +100,8 @@ template struct MinimumFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using MinimumContigFunctor = elementwise_common::BinaryContigFunctor #include -#include "kernels/dpctl_tensor_types.hpp" #include "sycl_complex.hpp" +#include "vec_size_util.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "kernels/elementwise_functions/common.hpp" #include "kernels/elementwise_functions/common_inplace.hpp" @@ -51,6 +53,8 @@ namespace multiply namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; + template struct MultiplyFunctor { @@ -98,8 +102,8 @@ template struct MultiplyFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using MultiplyContigFunctor = 
elementwise_common::BinaryContigFunctor struct MultiplyInplaceFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using MultiplyInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor< diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp index 83f17dd47b..7e78ce73f3 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp @@ -30,9 +30,11 @@ #include #include -#include "kernels/elementwise_functions/common.hpp" +#include "vec_size_util.hpp" #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -48,6 +50,7 @@ namespace negative namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; using dpctl::tensor::type_utils::vec_cast; @@ -65,8 +68,8 @@ template struct NegativeFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using NegativeContigFunctor = elementwise_common::UnaryContigFunctor #include +#include "vec_size_util.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -48,6 +50,8 @@ namespace nextafter namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; + template struct NextafterFunctor { @@ -82,8 +86,8 @@ template struct NextafterFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using NextafterContigFunctor = elementwise_common::BinaryContigFunctor< argT1, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp index c1b920193b..591d9e11a3 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp @@ -29,11 +29,13 @@ #include #include -#include "kernels/dpctl_tensor_types.hpp" +#include "vec_size_util.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "kernels/elementwise_functions/common.hpp" namespace dpctl @@ -48,6 +50,8 @@ namespace not_equal namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; + template struct NotEqualFunctor { static_assert(std::is_same_v); @@ -103,8 +107,8 @@ template struct NotEqualFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using NotEqualContigFunctor = elementwise_common::BinaryContigFunctor #include -#include "kernels/elementwise_functions/common.hpp" +#include "vec_size_util.hpp" #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -48,6 +50,7 @@ namespace positive namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; using 
dpctl::tensor::type_utils::vec_cast; @@ -80,8 +83,8 @@ template struct PositiveFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using PositiveContigFunctor = elementwise_common::UnaryContigFunctor #include -#include "kernels/dpctl_tensor_types.hpp" #include "sycl_complex.hpp" +#include "vec_size_util.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "kernels/elementwise_functions/common.hpp" #include "kernels/elementwise_functions/common_inplace.hpp" @@ -51,6 +53,8 @@ namespace pow namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; + template struct PowFunctor { @@ -151,8 +155,8 @@ template struct PowFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using PowContigFunctor = elementwise_common::BinaryContigFunctor struct PowInplaceFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using PowInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor< argT, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp index 2c3dce0c9c..a9812049fe 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp @@ -32,9 +32,11 @@ #include #include -#include "kernels/elementwise_functions/common.hpp" +#include "vec_size_util.hpp" #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -50,6 +52,7 @@ namespace proj namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; template struct ProjFunctor @@ -91,8 +94,8 @@ template struct ProjFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using ProjContigFunctor = elementwise_common::UnaryContigFunctor #include -#include "kernels/elementwise_functions/common.hpp" +#include "vec_size_util.hpp" #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -49,6 +51,7 @@ namespace real namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; template struct RealFunctor @@ -78,8 +81,8 @@ template struct RealFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using RealContigFunctor = elementwise_common::UnaryContigFunctor #include -#include "kernels/dpctl_tensor_types.hpp" #include "sycl_complex.hpp" +#include "vec_size_util.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "kernels/elementwise_functions/common.hpp" namespace dpctl @@ -51,6 +53,7 @@ namespace reciprocal namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; template struct ReciprocalFunctor @@ -81,8 +84,8 @@ template struct ReciprocalFunctor 
template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using ReciprocalContigFunctor = elementwise_common::UnaryContigFunctor #include -#include "kernels/dpctl_tensor_types.hpp" +#include "vec_size_util.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "kernels/elementwise_functions/common.hpp" #include "kernels/elementwise_functions/common_inplace.hpp" @@ -50,6 +52,8 @@ namespace remainder namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; + template struct RemainderFunctor { static_assert(std::is_same_v); @@ -144,8 +148,8 @@ template struct RemainderFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using RemainderContigFunctor = elementwise_common::BinaryContigFunctor< argT1, @@ -393,8 +397,8 @@ template struct RemainderInplaceFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using RemainderInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor< diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp index 241f75c1bb..af30711142 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp @@ -29,9 +29,11 @@ #include #include -#include "kernels/elementwise_functions/common.hpp" +#include "vec_size_util.hpp" #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -47,6 +49,7 @@ namespace round namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; template struct RoundFunctor @@ -87,8 +90,8 @@ template struct RoundFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using RoundContigFunctor = elementwise_common::UnaryContigFunctor #include -#include "kernels/elementwise_functions/common.hpp" +#include "vec_size_util.hpp" #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -51,6 +53,8 @@ namespace rsqrt namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; + template struct RsqrtFunctor { @@ -68,8 +72,8 @@ template struct RsqrtFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using RsqrtContigFunctor = elementwise_common::UnaryContigFunctor #include "cabs_impl.hpp" -#include "kernels/elementwise_functions/common.hpp" +#include "vec_size_util.hpp" #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -49,6 +51,7 @@ namespace sign namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; using dpctl::tensor::type_utils::vec_cast; @@ -103,8 +106,8 @@ template struct SignFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = 
true> using SignContigFunctor = elementwise_common::UnaryContigFunctor #include +#include "vec_size_util.hpp" + #include "kernels/dpctl_tensor_types.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -46,6 +49,7 @@ namespace signbit namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; using dpctl::tensor::type_utils::vec_cast; @@ -74,8 +78,8 @@ template struct SignbitFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using SignbitContigFunctor = elementwise_common::UnaryContigFunctor #include -#include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "vec_size_util.hpp" #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -48,6 +50,7 @@ namespace sin namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; template struct SinFunctor @@ -186,8 +189,8 @@ template struct SinFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using SinContigFunctor = elementwise_common::UnaryContigFunctor #include -#include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "vec_size_util.hpp" #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -48,6 +50,7 @@ namespace sinh namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; template struct SinhFunctor @@ -155,8 +158,8 @@ template struct SinhFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using SinhContigFunctor = elementwise_common::UnaryContigFunctor #include -#include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "vec_size_util.hpp" #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -51,6 +53,7 @@ namespace sqrt namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; template struct SqrtFunctor @@ -80,8 +83,8 @@ template struct SqrtFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using SqrtContigFunctor = elementwise_common::UnaryContigFunctor #include -#include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "vec_size_util.hpp" #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -97,8 +99,8 @@ template struct SquareFunctor template using SquareContigFunctor = elementwise_common::UnaryContigFunctor #include -#include "kernels/dpctl_tensor_types.hpp" +#include "vec_size_util.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include 
"kernels/elementwise_functions/common.hpp" #include "kernels/elementwise_functions/common_inplace.hpp" @@ -49,6 +51,8 @@ namespace subtract namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; + template struct SubtractFunctor { @@ -85,8 +89,8 @@ template struct SubtractFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using SubtractContigFunctor = elementwise_common::BinaryContigFunctor struct SubtractInplaceFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using SubtractInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor< diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp index 4364d81fb7..6ef7a2449f 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp @@ -30,10 +30,12 @@ #include #include -#include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "vec_size_util.hpp" #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -49,6 +51,7 @@ namespace tan namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; template struct TanFunctor @@ -130,8 +133,8 @@ template struct TanFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using TanContigFunctor = elementwise_common::UnaryContigFunctor #include -#include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "vec_size_util.hpp" #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" @@ -50,6 +52,7 @@ namespace tanh namespace td_ns = dpctl::tensor::type_dispatch; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; using dpctl::tensor::type_utils::is_complex; template struct TanhFunctor @@ -124,8 +127,8 @@ template struct TanhFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using TanhContigFunctor = elementwise_common::UnaryContigFunctor #include -#include "kernels/dpctl_tensor_types.hpp" #include "sycl_complex.hpp" +#include "vec_size_util.hpp" + #include "utils/offset_utils.hpp" #include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "kernels/elementwise_functions/common.hpp" #include "kernels/elementwise_functions/common_inplace.hpp" @@ -50,6 +52,8 @@ namespace true_divide namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; +using dpctl::tensor::kernels::vec_size_utils::VecSize_v; + template struct TrueDivideFunctor { @@ -112,8 +116,8 @@ struct TrueDivideFunctor template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using TrueDivideContigFunctor = elementwise_common::BinaryContigFunctor< argT1, @@ -473,8 +477,8 @@ struct TrueDivideInplaceTypeMapFactory template , + unsigned int n_vecs = 1, bool enable_sg_loadstore = true> using TrueDivideInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor< diff --git 
a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp
index 55c8493880..a0ca221837 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp
@@ -29,9 +29,11 @@
 #include
 #include

-#include "kernels/elementwise_functions/common.hpp"
+#include "vec_size_util.hpp"

 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -47,6 +49,7 @@ namespace trunc

 namespace td_ns = dpctl::tensor::type_dispatch;

+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;

 template struct TruncFunctor
@@ -75,8 +78,8 @@ template struct TruncFunctor

 template ,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using TruncContigFunctor = elementwise_common::UnaryContigFunctor
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace kernels
+{
+namespace vec_size_utils
+{
+
+template struct VecSize
+{
+    static constexpr unsigned int value =
+        std::max(VecSize::value, VecSize::value);
+};
+
+template struct VecSize
+{
+    static constexpr unsigned int value =
+        1 + ((sizeof(std::uint32_t) - 1) / (sizeof(T)));
+};
+
+template
+static constexpr unsigned int VecSize_v = VecSize::value;
+
+} // end of namespace vec_size_utils
+} // end of namespace kernels
+} // end of namespace tensor
+} // end of namespace dpctl
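
Note (not part of the patch): the template parameter lists of the new vec_size_util.hpp were lost in the hunks above, so the following standalone sketch only illustrates how the VecSize metafunction plausibly reads; the spellings VecSize<Ts...>, VecSize<T1, T2>, VecSize<T>, and the sketch namespace name are assumptions. The visible intent is to pick a per-work-item vector size large enough to cover at least 32 bits of data for a given element type, and, for binary kernels, to take the larger of the two per-type sizes.

// Sketch only: template parameter lists are assumed, since the angle-bracket
// contents of vec_size_util.hpp were stripped in the hunk above.
#include <algorithm>
#include <cstdint>
#include <iostream>

namespace vec_size_utils_sketch
{

// Assumed variadic primary template so that the one-type and two-type forms
// shown in the hunk can both be partial specializations.
template <typename... Ts> struct VecSize;

// One type: enough elements to fill a 32-bit lane,
// i.e. ceil(sizeof(std::uint32_t) / sizeof(T)).
template <typename T> struct VecSize<T>
{
    static constexpr unsigned int value =
        1 + ((sizeof(std::uint32_t) - 1) / (sizeof(T)));
};

// Two types (binary kernels): take the larger of the two per-type sizes.
template <typename T1, typename T2> struct VecSize<T1, T2>
{
    static constexpr unsigned int value =
        std::max(VecSize<T1>::value, VecSize<T2>::value);
};

template <typename... Ts>
constexpr unsigned int VecSize_v = VecSize<Ts...>::value;

} // namespace vec_size_utils_sketch

int main()
{
    using vec_size_utils_sketch::VecSize_v;

    // With the assumed definitions, the VecSize_v defaults resolve as:
    std::cout << VecSize_v<std::uint8_t, std::uint8_t> << "\n"; // 4
    std::cout << VecSize_v<std::int16_t, float> << "\n";        // 2
    std::cout << VecSize_v<double, double> << "\n";             // 1
}

Under these assumptions, the VecSize_v-based defaults visible in the hunks, combined with n_vecs = 1, would give 4 elements per work-item for 1-byte types, 2 for 2-byte types, and 1 for 4-byte and wider types, which appears intended to keep each work-item's contiguous load at 32 bits or more.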