From 870daa129eba9c8ad29fcc918a8a778e669187b7 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Mon, 21 Oct 2024 05:29:18 -0500
Subject: [PATCH 01/13] Add UL suffix to integral literal value

---
 dpctl/tensor/libtensor/include/kernels/alignment.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/alignment.hpp b/dpctl/tensor/libtensor/include/kernels/alignment.hpp
index ff4541af4d..9ec14dd027 100644
--- a/dpctl/tensor/libtensor/include/kernels/alignment.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/alignment.hpp
@@ -30,7 +30,7 @@ namespace kernels
 namespace alignment_utils
 {
 
-static constexpr size_t required_alignment = 64;
+static constexpr size_t required_alignment = 64UL;
 
 template <std::uintptr_t alignment, typename Ptr> bool is_aligned(Ptr p)
 {

From 02a7c5ab2b68f71c93107553d8b499ddd89cf2e8 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Mon, 21 Oct 2024 05:26:18 -0500
Subject: [PATCH 02/13] Do not use sg.get_local_range

Use sg.get_max_local_range instead. The `sg.get_local_range` must perform
lots of checks to determine if this is the last trailing sub-group in the
work-group, whose actual size may be smaller. We set the local work-group
size to be 128, which is a multiple of any sub-group size, and hence
get_local_range() always equals get_max_local_range().

The size of the work-groups was increased from 128 to 256, which is
chosen so that all 8 threads of a single vector with simd32 are used.

Set vec_sz and n_vecs in implementations of contig_impl for each support function

Make local work-groups size dependent on number of elements to process

Fixes for type dispatching utils

1. Add missing include <type_traits> needed for std::true_type, and
   std::disjunction, std::conjunction

2. Replace std::bool_constant<std::is_same_v<T1, T2>> with direct
   and simpler std::is_same<T1, T2> in a couple of instances

Hide hyperparameter selection struct in anonymous namespace
---
 .../kernels/elementwise_functions/abs.hpp     |  33 ++-
 .../kernels/elementwise_functions/acos.hpp    |  33 ++-
 .../kernels/elementwise_functions/acosh.hpp   |  34 ++-
 .../kernels/elementwise_functions/add.hpp     |  64 ++++-
 .../kernels/elementwise_functions/angle.hpp   |  34 ++-
 .../kernels/elementwise_functions/asin.hpp    |  34 ++-
 .../kernels/elementwise_functions/asinh.hpp   |  34 ++-
 .../kernels/elementwise_functions/atan.hpp    |  37 ++-
 .../kernels/elementwise_functions/atan2.hpp   |  35 ++-
 .../kernels/elementwise_functions/atanh.hpp   |  34 ++-
 .../elementwise_functions/bitwise_and.hpp     |  48 +++-
 .../elementwise_functions/bitwise_invert.hpp  |  38 ++-
 .../bitwise_left_shift.hpp                    |  48 +++-
 .../elementwise_functions/bitwise_or.hpp      |  49 +++-
 .../bitwise_right_shift.hpp                   |  49 +++-
 .../elementwise_functions/bitwise_xor.hpp     |  49 +++-
 .../kernels/elementwise_functions/cbrt.hpp    |  32 ++-
 .../kernels/elementwise_functions/ceil.hpp    |  32 ++-
 .../kernels/elementwise_functions/common.hpp  | 257 +++++++++---------
 .../elementwise_functions/common_inplace.hpp  |  57 ++--
 .../kernels/elementwise_functions/conj.hpp    |  34 ++-
 .../elementwise_functions/copysign.hpp        |  36 ++-
 .../kernels/elementwise_functions/cos.hpp     |  34 ++-
 .../kernels/elementwise_functions/cosh.hpp    |  34 ++-
 .../kernels/elementwise_functions/equal.hpp   |  35 ++-
 .../kernels/elementwise_functions/exp.hpp     |  34 ++-
 .../kernels/elementwise_functions/exp2.hpp    |  34 ++-
 .../kernels/elementwise_functions/expm1.hpp   |  34 ++-
 .../kernels/elementwise_functions/floor.hpp   |  34 ++-
 .../elementwise_functions/floor_divide.hpp    |  49 +++-
 .../kernels/elementwise_functions/greater.hpp |  36 ++-
 .../elementwise_functions/greater_equal.hpp   |  37 ++-
 .../kernels/elementwise_functions/hypot.hpp   |  35 ++-
 .../kernels/elementwise_functions/imag.hpp    |  34 ++-
 .../elementwise_functions/isfinite.hpp        |  38 ++-
 .../kernels/elementwise_functions/isinf.hpp   |  33 ++-
 .../kernels/elementwise_functions/isnan.hpp   |  33 ++-
 .../kernels/elementwise_functions/less.hpp    |  38 ++-
 .../elementwise_functions/less_equal.hpp      |  36 ++-
 .../kernels/elementwise_functions/log.hpp     |  34 ++-
 .../kernels/elementwise_functions/log10.hpp   |  34 ++-
 .../kernels/elementwise_functions/log1p.hpp   |  34 ++-
 .../kernels/elementwise_functions/log2.hpp    |  34 ++-
 .../elementwise_functions/logaddexp.hpp       |  38 ++-
 .../elementwise_functions/logical_and.hpp     |  38 ++-
 .../elementwise_functions/logical_not.hpp     |  39 ++-
 .../elementwise_functions/logical_or.hpp      |  38 ++-
 .../elementwise_functions/logical_xor.hpp     |  38 ++-
 .../kernels/elementwise_functions/maximum.hpp |  38 ++-
 .../kernels/elementwise_functions/minimum.hpp |  38 ++-
 .../elementwise_functions/multiply.hpp        |  51 +++-
 .../elementwise_functions/negative.hpp        |  40 ++-
 .../elementwise_functions/nextafter.hpp       |  36 ++-
 .../elementwise_functions/not_equal.hpp       |  38 ++-
 .../elementwise_functions/positive.hpp        |  40 ++-
 .../kernels/elementwise_functions/pow.hpp     |  52 +++-
 .../kernels/elementwise_functions/proj.hpp    |  34 ++-
 .../kernels/elementwise_functions/real.hpp    |  34 ++-
 .../elementwise_functions/reciprocal.hpp      |  40 ++-
 .../elementwise_functions/remainder.hpp       |  51 +++-
 .../kernels/elementwise_functions/round.hpp   |  34 ++-
 .../kernels/elementwise_functions/rsqrt.hpp   |  34 ++-
 .../kernels/elementwise_functions/sign.hpp    |  34 ++-
 .../kernels/elementwise_functions/signbit.hpp |  35 ++-
 .../kernels/elementwise_functions/sin.hpp     |  34 ++-
 .../kernels/elementwise_functions/sinh.hpp    |  34 ++-
 .../kernels/elementwise_functions/sqrt.hpp    |  34 ++-
 .../kernels/elementwise_functions/square.hpp  |  36 ++-
 .../elementwise_functions/subtract.hpp        |  51 +++-
 .../kernels/elementwise_functions/tan.hpp     |  34 ++-
 .../kernels/elementwise_functions/tanh.hpp    |  34 ++-
 .../elementwise_functions/true_divide.hpp     |  51 +++-
 .../kernels/elementwise_functions/trunc.hpp   |  34 ++-
 .../elementwise_functions/vec_size_util.hpp   |  74 +++++
 .../include/utils/type_dispatch_building.hpp  |  11 +-
 75 files changed, 2534 insertions(+), 558 deletions(-)
 create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
index 411040bada..c3c916d0c0 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
@@ -32,9 +32,11 @@
 #include <type_traits>
 
 #include "cabs_impl.hpp"
-#include "kernels/elementwise_functions/common.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -89,8 +91,8 @@ template <typename argT, typename resT> struct AbsFunctor
 
 template <typename argT,
           typename resT = argT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using AbsContigFunctor =
     elementwise_common::UnaryContigFunctor<argT,
@@ -122,6 +124,24 @@ template <typename T> struct AbsOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy> struct AbsContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class abs_contig_kernel;
 
@@ -132,9 +152,12 @@ sycl::event abs_contig_impl(sycl::queue &exec_q,
                             char *res_p,
                             const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = AbsContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vec = AbsContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, AbsOutputType, AbsContigFunctor, abs_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, AbsOutputType, AbsContigFunctor, abs_contig_kernel, vec_sz,
+        n_vec>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct AbsContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
index a90f4e699f..6cc686ff46 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
@@ -29,10 +29,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -128,8 +130,8 @@ template <typename argT, typename resT> struct AcosFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using AcosContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -156,6 +158,24 @@ template <typename T> struct AcosOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy> struct AcosContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class acos_contig_kernel;
 
@@ -166,9 +186,12 @@ sycl::event acos_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = AcosContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vec = AcosContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, AcosOutputType, AcosContigFunctor, acos_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, AcosOutputType, AcosContigFunctor, acos_contig_kernel, vec_sz,
+        n_vec>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct AcosContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
index 8af3708427..4d5d5118d7 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
@@ -29,10 +29,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -155,8 +157,8 @@ template <typename argT, typename resT> struct AcoshFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using AcoshContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -183,6 +185,25 @@ template <typename T> struct AcoshOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy> struct AcoshContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class acosh_contig_kernel;
 
@@ -193,9 +214,12 @@ sycl::event acosh_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = AcoshContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vec = AcoshContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, AcoshOutputType, AcoshContigFunctor, acosh_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, AcoshOutputType, AcoshContigFunctor, acosh_contig_kernel, vec_sz,
+        n_vec>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct AcoshContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
index c06e98f3e5..3b25736168 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
@@ -30,6 +30,8 @@
 #include <type_traits>
 
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -110,8 +112,8 @@ template <typename argT1, typename argT2, typename resT> struct AddFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using AddContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
@@ -196,6 +198,43 @@ template <typename T1, typename T2> struct AddOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2> struct AddContigHyperparameterSet
+{
+    using value_type = typename std::disjunction<
+        BinaryContigHyperparameterSetEntry<argTy1,
+                                           std::int64_t,
+                                           argTy2,
+                                           std::int64_t,
+                                           1u,
+                                           2u>,
+        BinaryContigHyperparameterSetEntry<argTy1,
+                                           std::uint64_t,
+                                           argTy2,
+                                           std::uint64_t,
+                                           1u,
+                                           2u>,
+        BinaryContigHyperparameterSetEntry<argTy1,
+                                           double,
+                                           argTy2,
+                                           double,
+                                           1u,
+                                           2u>,
+        ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename argT1,
           typename argT2,
           typename resT,
@@ -214,10 +253,13 @@ sycl::event add_contig_impl(sycl::queue &exec_q,
                             ssize_t res_offset,
                             const std::vector<sycl::event> &depends = {})
 {
+    constexpr auto vec_sz = AddContigHyperparameterSet<argTy1, argTy2>::vec_sz;
+    constexpr auto n_vecs = AddContigHyperparameterSet<argTy1, argTy2>::n_vecs;
+
     return elementwise_common::binary_contig_impl<
-        argTy1, argTy2, AddOutputType, AddContigFunctor, add_contig_kernel>(
-        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
-        res_offset, depends);
+        argTy1, argTy2, AddOutputType, AddContigFunctor, add_contig_kernel,
+        vec_sz, n_vecs>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
+                        arg2_offset, res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct AddContigFactory
@@ -410,8 +452,8 @@ template <typename argT, typename resT> struct AddInplaceFunctor
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using AddInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor<
     argT,
@@ -489,9 +531,13 @@ add_inplace_contig_impl(sycl::queue &exec_q,
                         ssize_t res_offset,
                         const std::vector<sycl::event> &depends = {})
 {
+    constexpr auto vec_sz = AddContigHyperparameterSet<resTy, argTy>::vec_sz;
+    constexpr auto n_vecs = AddContigHyperparameterSet<resTy, argTy>::n_vecs;
+
     return elementwise_common::binary_inplace_contig_impl<
-        argTy, resTy, AddInplaceContigFunctor, add_inplace_contig_kernel>(
-        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
+        argTy, resTy, AddInplaceContigFunctor, add_inplace_contig_kernel,
+        vec_sz, n_vecs>(exec_q, nelems, arg_p, arg_offset, res_p, res_offset,
+                        depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct AddInplaceContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp
index 034b71438f..dfe9fb5063 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp
@@ -30,10 +30,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -74,8 +76,8 @@ template <typename argT, typename resT> struct AngleFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using AngleContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -99,6 +101,25 @@ template <typename T> struct AngleOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy> struct AngleContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class angle_contig_kernel;
 
@@ -109,9 +130,12 @@ sycl::event angle_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = AngleContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vec = AngleContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, AngleOutputType, AngleContigFunctor, angle_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, AngleOutputType, AngleContigFunctor, angle_contig_kernel, vec_sz,
+        n_vec>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct AngleContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
index 35c381aa84..d1dd66b577 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
@@ -29,10 +29,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -148,8 +150,8 @@ template <typename argT, typename resT> struct AsinFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using AsinContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -176,6 +178,25 @@ template <typename T> struct AsinOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy> struct AsinContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class asin_contig_kernel;
 
@@ -186,9 +207,12 @@ sycl::event asin_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = AsinContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vec = AsinContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, AsinOutputType, AsinContigFunctor, asin_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, AsinOutputType, AsinContigFunctor, asin_contig_kernel, vec_sz,
+        n_vec>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct AsinContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
index 7373dc39d5..fb38911dde 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
@@ -29,10 +29,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -131,8 +133,8 @@ template <typename argT, typename resT> struct AsinhFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using AsinhContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -159,6 +161,25 @@ template <typename T> struct AsinhOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy> struct AsinhContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class asinh_contig_kernel;
 
@@ -169,9 +190,12 @@ sycl::event asinh_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = AsinhContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vec = AsinhContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, AsinhOutputType, AsinhContigFunctor, asinh_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, AsinhOutputType, AsinhContigFunctor, asinh_contig_kernel, vec_sz,
+        n_vec>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct AsinhContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp
index fbba3fc436..aa260ce530 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp
@@ -30,10 +30,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -49,6 +51,9 @@ namespace atan
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::ContigHyperparameterSetDefault;
+using dpctl::tensor::kernels::vec_size_utils::UnaryContigHyperparameterSetEntry;
+
 using dpctl::tensor::type_utils::is_complex;
 
 template <typename argT, typename resT> struct AtanFunctor
@@ -138,8 +143,8 @@ template <typename argT, typename resT> struct AtanFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using AtanContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -166,6 +171,25 @@ template <typename T> struct AtanOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy> struct AtanContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class atan_contig_kernel;
 
@@ -176,9 +200,12 @@ sycl::event atan_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = AtanContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vec = AtanContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, AtanOutputType, AtanContigFunctor, atan_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, AtanOutputType, AtanContigFunctor, atan_contig_kernel, vec_sz,
+        n_vec>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct AtanContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp
index 1a694527dd..76a07f07da 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp
@@ -29,6 +29,8 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -68,8 +70,8 @@ template <typename argT1, typename argT2, typename resT> struct Atan2Functor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using Atan2ContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
@@ -103,6 +105,25 @@ template <typename T1, typename T2> struct Atan2OutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2> struct Atan2ContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename argT1,
           typename argT2,
           typename resT,
@@ -121,10 +142,16 @@ sycl::event atan2_contig_impl(sycl::queue &exec_q,
                               ssize_t res_offset,
                               const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        Atan2ContigHyperparameterSet<argTy1, argTy2>::vec_sz;
+    constexpr unsigned int n_vecs =
+        Atan2ContigHyperparameterSet<argTy1, argTy2>::n_vecs;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, Atan2OutputType, Atan2ContigFunctor,
-        atan2_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                             arg2_offset, res_p, res_offset, depends);
+        atan2_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p,
+                                             arg1_offset, arg2_p, arg2_offset,
+                                             res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct Atan2ContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
index 340e72b11c..563644b613 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
@@ -30,10 +30,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -132,8 +134,8 @@ template <typename argT, typename resT> struct AtanhFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using AtanhContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -160,6 +162,25 @@ template <typename T> struct AtanhOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy> struct AtanhContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class atanh_contig_kernel;
 
@@ -170,9 +191,12 @@ sycl::event atanh_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = AtanhContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vec = AtanhContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, AtanhOutputType, AtanhContigFunctor, atanh_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, AtanhOutputType, AtanhContigFunctor, atanh_contig_kernel, vec_sz,
+        n_vec>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct AtanhContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
index da32b17183..0d8f9ad125 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
@@ -28,6 +28,8 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -91,8 +93,8 @@ struct BitwiseAndFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using BitwiseAndContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -160,6 +162,25 @@ template <typename T1, typename T2> struct BitwiseAndOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct BitwiseAndContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+} // end of anonymous namespace
+
 template <typename argT1,
           typename argT2,
           typename resT,
@@ -179,10 +200,16 @@ bitwise_and_contig_impl(sycl::queue &exec_q,
                         ssize_t res_offset,
                         const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        BitwiseAndContigHyperparameterSet<argTy1, argTy2>::vec_sz;
+    constexpr unsigned int n_vec =
+        BitwiseAndContigHyperparameterSet<argTy1, argTy2>::n_vecs;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, BitwiseAndOutputType, BitwiseAndContigFunctor,
-        bitwise_and_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                                   arg2_offset, res_p, res_offset, depends);
+        bitwise_and_contig_kernel, vec_sz, n_vec>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct BitwiseAndContigFactory
@@ -290,8 +317,8 @@ template <typename argT, typename resT> struct BitwiseAndInplaceFunctor
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using BitwiseAndInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
@@ -361,10 +388,15 @@ bitwise_and_inplace_contig_impl(sycl::queue &exec_q,
                                 ssize_t res_offset,
                                 const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        BitwiseAndContigHyperparameterSet<resTy, argTy>::vec_sz;
+    constexpr unsigned int n_vecs =
+        BitwiseAndContigHyperparameterSet<resTy, argTy>::n_vecs;
+
     return elementwise_common::binary_inplace_contig_impl<
         argTy, resTy, BitwiseAndInplaceContigFunctor,
-        bitwise_and_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset,
-                                           res_p, res_offset, depends);
+        bitwise_and_inplace_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2>
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp
index d6c1bc72db..86dadc9715 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp
@@ -30,6 +30,8 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -80,8 +82,8 @@ template <typename argT, typename resT> struct BitwiseInvertFunctor
 
 template <typename argT,
           typename resT = argT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using BitwiseInvertContigFunctor =
     elementwise_common::UnaryContigFunctor<argT,
@@ -115,6 +117,25 @@ template <typename argTy> struct BitwiseInvertOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy> struct BitwiseInvertContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class bitwise_invert_contig_kernel;
 
@@ -126,10 +147,15 @@ bitwise_invert_contig_impl(sycl::queue &exec_q,
                            char *res_p,
                            const std::vector<sycl::event> &depends = {})
 {
-    return elementwise_common::unary_contig_impl<argTy, BitwiseInvertOutputType,
-                                                 BitwiseInvertContigFunctor,
-                                                 bitwise_invert_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+    constexpr unsigned int vec_sz =
+        BitwiseInvertContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vec =
+        BitwiseInvertContigHyperparameterSet<argTy>::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, BitwiseInvertOutputType, BitwiseInvertContigFunctor,
+        bitwise_invert_contig_kernel, vec_sz, n_vec>(exec_q, nelems, arg_p,
+                                                     res_p, depends);
 }
 
 template <typename fnT, typename T> struct BitwiseInvertContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
index a987c8d604..67fe141484 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
@@ -29,6 +29,8 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -100,8 +102,8 @@ struct BitwiseLeftShiftFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using BitwiseLeftShiftContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -169,6 +171,26 @@ template <typename T1, typename T2> struct BitwiseLeftShiftOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct BitwiseLeftShiftContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename argT1,
           typename argT2,
           typename resT,
@@ -188,11 +210,16 @@ bitwise_left_shift_contig_impl(sycl::queue &exec_q,
                                ssize_t res_offset,
                                const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        BitwiseLeftShiftContigHyperparameterSet<argTy1, argTy2>::vec_sz;
+    constexpr unsigned int n_vecs =
+        BitwiseLeftShiftContigHyperparameterSet<argTy1, argTy2>::n_vecs;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, BitwiseLeftShiftOutputType,
-        BitwiseLeftShiftContigFunctor, bitwise_left_shift_contig_kernel>(
-        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
-        res_offset, depends);
+        BitwiseLeftShiftContigFunctor, bitwise_left_shift_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+                res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2>
@@ -304,8 +331,8 @@ template <typename argT, typename resT> struct BitwiseLeftShiftInplaceFunctor
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using BitwiseLeftShiftInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
@@ -375,9 +402,14 @@ sycl::event bitwise_left_shift_inplace_contig_impl(
     ssize_t res_offset,
     const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        BitwiseLeftShiftContigHyperparameterSet<resTy, argTy>::vec_sz;
+    constexpr unsigned int n_vecs =
+        BitwiseLeftShiftContigHyperparameterSet<resTy, argTy>::n_vecs;
+
     return elementwise_common::binary_inplace_contig_impl<
         argTy, resTy, BitwiseLeftShiftInplaceContigFunctor,
-        bitwise_left_shift_inplace_contig_kernel>(
+        bitwise_left_shift_inplace_contig_kernel, vec_sz, n_vecs>(
         exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
 }
 
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp
index 71f3e809d9..03e2064dd2 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp
@@ -28,6 +28,8 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -90,8 +92,8 @@ template <typename argT1, typename argT2, typename resT> struct BitwiseOrFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using BitwiseOrContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -159,6 +161,26 @@ template <typename T1, typename T2> struct BitwiseOrOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct BitwiseOrContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename argT1,
           typename argT2,
           typename resT,
@@ -177,10 +199,16 @@ sycl::event bitwise_or_contig_impl(sycl::queue &exec_q,
                                    ssize_t res_offset,
                                    const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        BitwiseOrContigHyperparameterSet<argTy1, argTy2>::vec_sz;
+    constexpr unsigned int n_vecs =
+        BitwiseOrContigHyperparameterSet<argTy1, argTy2>::n_vecs;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, BitwiseOrOutputType, BitwiseOrContigFunctor,
-        bitwise_or_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                                  arg2_offset, res_p, res_offset, depends);
+        bitwise_or_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct BitwiseOrContigFactory
@@ -286,8 +314,8 @@ template <typename argT, typename resT> struct BitwiseOrInplaceFunctor
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using BitwiseOrInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
@@ -355,10 +383,15 @@ bitwise_or_inplace_contig_impl(sycl::queue &exec_q,
                                ssize_t res_offset,
                                const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        BitwiseOrContigHyperparameterSet<resTy, argTy>::vec_sz;
+    constexpr unsigned int n_vecs =
+        BitwiseOrContigHyperparameterSet<resTy, argTy>::n_vecs;
+
     return elementwise_common::binary_inplace_contig_impl<
         argTy, resTy, BitwiseOrInplaceContigFunctor,
-        bitwise_or_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset,
-                                          res_p, res_offset, depends);
+        bitwise_or_inplace_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2>
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
index e4dfee2ed6..497505e4aa 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
@@ -29,6 +29,8 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -101,8 +103,8 @@ struct BitwiseRightShiftFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using BitwiseRightShiftContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -171,6 +173,26 @@ template <typename T1, typename T2> struct BitwiseRightShiftOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct BitwiseRightShiftContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename argT1,
           typename argT2,
           typename resT,
@@ -190,11 +212,16 @@ bitwise_right_shift_contig_impl(sycl::queue &exec_q,
                                 ssize_t res_offset,
                                 const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        BitwiseRightShiftContigHyperparameterSet<argTy1, argTy2>::vec_sz;
+    constexpr unsigned int n_vecs =
+        BitwiseRightShiftContigHyperparameterSet<argTy1, argTy2>::n_vecs;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, BitwiseRightShiftOutputType,
-        BitwiseRightShiftContigFunctor, bitwise_right_shift_contig_kernel>(
-        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
-        res_offset, depends);
+        BitwiseRightShiftContigFunctor, bitwise_right_shift_contig_kernel,
+        vec_sz, n_vecs>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
+                        arg2_offset, res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2>
@@ -308,8 +335,8 @@ template <typename argT, typename resT> struct BitwiseRightShiftInplaceFunctor
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using BitwiseRightShiftInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
@@ -379,9 +406,15 @@ sycl::event bitwise_right_shift_inplace_contig_impl(
     ssize_t res_offset,
     const std::vector<sycl::event> &depends = {})
 {
+    // res = OP(res, arg)
+    constexpr unsigned int vec_sz =
+        BitwiseRightShiftContigHyperparameterSet<resTy, argTy>::vec_sz;
+    constexpr unsigned int n_vecs =
+        BitwiseRightShiftContigHyperparameterSet<resTy, argTy>::n_vecs;
+
     return elementwise_common::binary_inplace_contig_impl<
         argTy, resTy, BitwiseRightShiftInplaceContigFunctor,
-        bitwise_right_shift_inplace_contig_kernel>(
+        bitwise_right_shift_inplace_contig_kernel, vec_sz, n_vecs>(
         exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
 }
 
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
index d035b31170..87aab0519d 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
@@ -28,6 +28,8 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -91,8 +93,8 @@ struct BitwiseXorFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using BitwiseXorContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -160,6 +162,26 @@ template <typename T1, typename T2> struct BitwiseXorOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct BitwiseXorContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename argT1,
           typename argT2,
           typename resT,
@@ -179,10 +201,16 @@ bitwise_xor_contig_impl(sycl::queue &exec_q,
                         ssize_t res_offset,
                         const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        BitwiseXorContigHyperparameterSet<argTy1, argTy2>::vec_sz;
+    constexpr unsigned int n_vecs =
+        BitwiseXorContigHyperparameterSet<argTy1, argTy2>::n_vecs;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, BitwiseXorOutputType, BitwiseXorContigFunctor,
-        bitwise_xor_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                                   arg2_offset, res_p, res_offset, depends);
+        bitwise_xor_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct BitwiseXorContigFactory
@@ -290,8 +318,8 @@ template <typename argT, typename resT> struct BitwiseXorInplaceFunctor
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using BitwiseXorInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
@@ -361,10 +389,15 @@ bitwise_xor_inplace_contig_impl(sycl::queue &exec_q,
                                 ssize_t res_offset,
                                 const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        BitwiseXorContigHyperparameterSet<resTy, argTy>::vec_sz;
+    constexpr unsigned int n_vecs =
+        BitwiseXorContigHyperparameterSet<resTy, argTy>::n_vecs;
+
     return elementwise_common::binary_inplace_contig_impl<
         argTy, resTy, BitwiseXorInplaceContigFunctor,
-        bitwise_xor_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset,
-                                           res_p, res_offset, depends);
+        bitwise_xor_inplace_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2>
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
index 4f2634f17a..eb2ebb388b 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
@@ -30,6 +30,8 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "kernels/elementwise_functions/common.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
@@ -65,8 +67,8 @@ template <typename argT, typename resT> struct CbrtFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using CbrtContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -91,6 +93,25 @@ template <typename T> struct CbrtOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy> struct CbrtContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class cbrt_contig_kernel;
 
@@ -101,9 +122,12 @@ sycl::event cbrt_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = CbrtContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vecs = CbrtContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, CbrtOutputType, CbrtContigFunctor, cbrt_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, CbrtOutputType, CbrtContigFunctor, cbrt_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct CbrtContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp
index 59bc630720..3edf0c3456 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp
@@ -29,6 +29,8 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "kernels/elementwise_functions/common.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
@@ -78,8 +80,8 @@ template <typename argT, typename resT> struct CeilFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using CeilContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -112,6 +114,25 @@ template <typename T> struct CeilOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy> struct CeilContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class ceil_contig_kernel;
 
@@ -122,9 +143,12 @@ sycl::event ceil_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = CeilContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vecs = CeilContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, CeilOutputType, CeilContigFunctor, ceil_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, CeilOutputType, CeilContigFunctor, ceil_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct CeilContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
index ee955dcde5..c2bb1db23b 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
@@ -52,8 +52,8 @@ using dpctl::tensor::kernels::alignment_utils::required_alignment;
 template <typename argT,
           typename resT,
           typename UnaryOperatorT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 struct UnaryContigFunctor
 {
@@ -70,9 +70,10 @@ struct UnaryContigFunctor
 
     void operator()(sycl::nd_item<1> ndit) const
     {
+        constexpr std::uint32_t elems_per_wi = n_vecs * vec_sz;
         UnaryOperatorT op{};
         /* Each work-item processes vec_sz elements, contiguous in memory */
-        /* NOTE: vec_sz must divide sg.max_local_range()[0] */
+        /* NOTE: work-group size must be divisible by sub-group size */
 
         if constexpr (enable_sg_loadstore && UnaryOperatorT::is_constant::value)
         {
@@ -80,17 +81,15 @@ struct UnaryContigFunctor
             constexpr resT const_val = UnaryOperatorT::constant_value;
 
             auto sg = ndit.get_sub_group();
-            std::uint8_t sgSize = sg.get_local_range()[0];
-            std::uint8_t max_sgSize = sg.get_max_local_range()[0];
-            size_t base = n_vecs * vec_sz *
+            std::uint32_t sgSize = sg.get_max_local_range()[0];
+
+            size_t base = static_cast<size_t>(elems_per_wi) *
                           (ndit.get_group(0) * ndit.get_local_range(0) +
                            sg.get_group_id()[0] * sgSize);
-            if (base + n_vecs * vec_sz * sgSize < nelems_ &&
-                max_sgSize == sgSize)
-            {
+            if (base + static_cast<size_t>(elems_per_wi * sgSize) < nelems_) {
                 sycl::vec<resT, vec_sz> res_vec(const_val);
 #pragma unroll
-                for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
+                for (std::uint32_t it = 0; it < elems_per_wi; it += vec_sz) {
                     size_t offset = base + static_cast<size_t>(it) *
                                                static_cast<size_t>(sgSize);
                     auto out_multi_ptr = sycl::address_space_cast<
@@ -101,30 +100,27 @@ struct UnaryContigFunctor
                 }
             }
             else {
-                for (size_t k = base + sg.get_local_id()[0]; k < nelems_;
-                     k += sgSize)
-                {
+                const size_t lane_id = sg.get_local_id()[0];
+                for (size_t k = base + lane_id; k < nelems_; k += sgSize) {
                     out[k] = const_val;
                 }
             }
         }
         else if constexpr (enable_sg_loadstore &&
                            UnaryOperatorT::supports_sg_loadstore::value &&
-                           UnaryOperatorT::supports_vec::value)
+                           UnaryOperatorT::supports_vec::value && (vec_sz > 1))
         {
             auto sg = ndit.get_sub_group();
-            std::uint16_t sgSize = sg.get_local_range()[0];
-            std::uint16_t max_sgSize = sg.get_max_local_range()[0];
-            size_t base = n_vecs * vec_sz *
+            std::uint32_t sgSize = sg.get_max_local_range()[0];
+
+            size_t base = static_cast<size_t>(elems_per_wi) *
                           (ndit.get_group(0) * ndit.get_local_range(0) +
-                           sg.get_group_id()[0] * max_sgSize);
-            if (base + n_vecs * vec_sz * sgSize < nelems_ &&
-                sgSize == max_sgSize)
-            {
+                           sg.get_group_id()[0] * sgSize);
+            if (base + static_cast<size_t>(elems_per_wi * sgSize) < nelems_) {
                 sycl::vec<argT, vec_sz> x;
 
 #pragma unroll
-                for (std::uint16_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
+                for (std::uint32_t it = 0; it < elems_per_wi; it += vec_sz) {
                     size_t offset = base + static_cast<size_t>(it) *
                                                static_cast<size_t>(sgSize);
                     auto in_multi_ptr = sycl::address_space_cast<
@@ -140,9 +136,8 @@ struct UnaryContigFunctor
                 }
             }
             else {
-                for (size_t k = base + sg.get_local_id()[0]; k < nelems_;
-                     k += sgSize)
-                {
+                const size_t lane_id = sg.get_local_id()[0];
+                for (size_t k = base + lane_id; k < nelems_; k += sgSize) {
                     // scalar call
                     out[k] = op(in[k]);
                 }
@@ -155,19 +150,16 @@ struct UnaryContigFunctor
             // default: use scalar-value function
 
             auto sg = ndit.get_sub_group();
-            std::uint8_t sgSize = sg.get_local_range()[0];
-            std::uint8_t maxsgSize = sg.get_max_local_range()[0];
-            size_t base = n_vecs * vec_sz *
+            std::uint32_t sgSize = sg.get_max_local_range()[0];
+            size_t base = static_cast<size_t>(elems_per_wi) *
                           (ndit.get_group(0) * ndit.get_local_range(0) +
-                           sg.get_group_id()[0] * maxsgSize);
+                           sg.get_group_id()[0] * sgSize);
 
-            if ((base + n_vecs * vec_sz * sgSize < nelems_) &&
-                (maxsgSize == sgSize))
-            {
+            if (base + static_cast<size_t>(elems_per_wi * sgSize) < nelems_) {
                 sycl::vec<argT, vec_sz> arg_vec;
 
 #pragma unroll
-                for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
+                for (std::uint32_t it = 0; it < elems_per_wi; it += vec_sz) {
                     size_t offset = base + static_cast<size_t>(it) *
                                                static_cast<size_t>(sgSize);
                     auto in_multi_ptr = sycl::address_space_cast<
@@ -179,16 +171,15 @@ struct UnaryContigFunctor
 
                     arg_vec = sg.load<vec_sz>(in_multi_ptr);
 #pragma unroll
-                    for (std::uint8_t k = 0; k < vec_sz; ++k) {
+                    for (std::uint32_t k = 0; k < vec_sz; ++k) {
                         arg_vec[k] = op(arg_vec[k]);
                     }
                     sg.store<vec_sz>(out_multi_ptr, arg_vec);
                 }
             }
             else {
-                for (size_t k = base + sg.get_local_id()[0]; k < nelems_;
-                     k += sgSize)
-                {
+                const size_t lane_id = sg.get_local_id()[0];
+                for (size_t k = base + lane_id; k < nelems_; k += sgSize) {
                     out[k] = op(in[k]);
                 }
             }
@@ -199,20 +190,17 @@ struct UnaryContigFunctor
             // default: use scalar-value function
 
             auto sg = ndit.get_sub_group();
-            std::uint8_t sgSize = sg.get_local_range()[0];
-            std::uint8_t maxsgSize = sg.get_max_local_range()[0];
-            size_t base = n_vecs * vec_sz *
+            std::uint32_t sgSize = sg.get_max_local_range()[0];
+            size_t base = static_cast<size_t>(elems_per_wi) *
                           (ndit.get_group(0) * ndit.get_local_range(0) +
-                           sg.get_group_id()[0] * maxsgSize);
+                           sg.get_group_id()[0] * sgSize);
 
-            if ((base + n_vecs * vec_sz * sgSize < nelems_) &&
-                (maxsgSize == sgSize))
-            {
+            if (base + static_cast<size_t>(elems_per_wi * sgSize) < nelems_) {
                 sycl::vec<argT, vec_sz> arg_vec;
                 sycl::vec<resT, vec_sz> res_vec;
 
 #pragma unroll
-                for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
+                for (std::uint32_t it = 0; it < elems_per_wi; it += vec_sz) {
                     size_t offset = base + static_cast<size_t>(it) *
                                                static_cast<size_t>(sgSize);
                     auto in_multi_ptr = sycl::address_space_cast<
@@ -224,27 +212,27 @@ struct UnaryContigFunctor
 
                     arg_vec = sg.load<vec_sz>(in_multi_ptr);
 #pragma unroll
-                    for (std::uint8_t k = 0; k < vec_sz; ++k) {
+                    for (std::uint32_t k = 0; k < vec_sz; ++k) {
                         res_vec[k] = op(arg_vec[k]);
                     }
                     sg.store<vec_sz>(out_multi_ptr, res_vec);
                 }
             }
             else {
-                for (size_t k = base + sg.get_local_id()[0]; k < nelems_;
-                     k += sgSize)
-                {
+                const size_t lane_id = sg.get_local_id()[0];
+                for (size_t k = base + lane_id; k < nelems_; k += sgSize) {
                     out[k] = op(in[k]);
                 }
             }
         }
         else {
-            std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0];
+            size_t sgSize = ndit.get_sub_group().get_local_range()[0];
             size_t base = ndit.get_global_linear_id();
+            const size_t elems_per_sg = sgSize * elems_per_wi;
 
-            base = (base / sgSize) * sgSize * n_vecs * vec_sz + (base % sgSize);
+            base = (base / sgSize) * elems_per_sg + (base % sgSize);
             for (size_t offset = base;
-                 offset < std::min(nelems_, base + sgSize * (n_vecs * vec_sz));
+                 offset < std::min(nelems_, base + elems_per_sg);
                  offset += sgSize)
             {
                 out[offset] = op(in[offset]);
@@ -281,6 +269,20 @@ struct UnaryStridedFunctor
     }
 };
 
+template <typename SizeT>
+SizeT select_lws(const sycl::device &, SizeT n_work_items_needed)
+{
+    // Returns the work-group size to use; the device argument is unused.
+    // TODO: make the decision based on device descriptors
+    // constexpr SizeT few_threshold = (SizeT(1) << 17);
+    constexpr SizeT med_threshold = (SizeT(1) << 21);
+    // At most 2^21 work-items needed -> 128 per group, otherwise 256.
+    const SizeT lws =
+        (n_work_items_needed <= med_threshold ? SizeT(128) : SizeT(256));
+
+    return lws;
+}
+
 template <typename argTy,
           template <typename T>
           class UnaryOutputType,
@@ -292,32 +294,36 @@ template <typename argTy,
           class ContigFunctorT,
           template <typename A, typename R, unsigned int vs, unsigned int nv>
           class kernel_name,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u>
 sycl::event unary_contig_impl(sycl::queue &exec_q,
                               size_t nelems,
                               const char *arg_p,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
-    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
+    const size_t n_work_items_needed = nelems / (n_vecs * vec_sz);
+    const size_t lws = select_lws(exec_q.get_device(), n_work_items_needed);
 
-        const size_t lws = 128;
-        const size_t n_groups =
-            ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
-        const auto gws_range = sycl::range<1>(n_groups * lws);
-        const auto lws_range = sycl::range<1>(lws);
+    const size_t n_groups =
+        ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
+    const auto gws_range = sycl::range<1>(n_groups * lws);
+    const auto lws_range = sycl::range<1>(lws);
 
-        using resTy = typename UnaryOutputType<argTy>::value_type;
-        const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_p);
-        resTy *res_tp = reinterpret_cast<resTy *>(res_p);
+    using resTy = typename UnaryOutputType<argTy>::value_type;
+    using BaseKernelName = kernel_name<argTy, resTy, vec_sz, n_vecs>;
+
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_p);
+    resTy *res_tp = reinterpret_cast<resTy *>(res_p);
+
+    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
 
         if (is_aligned<required_alignment>(arg_p) &&
             is_aligned<required_alignment>(res_p))
         {
             constexpr bool enable_sg_loadstore = true;
-            using KernelName = kernel_name<argTy, resTy, vec_sz, n_vecs>;
+            using KernelName = BaseKernelName;
 
             cgh.parallel_for<KernelName>(
                 sycl::nd_range<1>(gws_range, lws_range),
@@ -326,9 +332,8 @@ sycl::event unary_contig_impl(sycl::queue &exec_q,
         }
         else {
             constexpr bool disable_sg_loadstore = false;
-            using InnerKernelName = kernel_name<argTy, resTy, vec_sz, n_vecs>;
             using KernelName =
-                disabled_sg_loadstore_wrapper_krn<InnerKernelName>;
+                disabled_sg_loadstore_wrapper_krn<BaseKernelName>;
 
             cgh.parallel_for<KernelName>(
                 sycl::nd_range<1>(gws_range, lws_range),
@@ -336,6 +341,7 @@ sycl::event unary_contig_impl(sycl::queue &exec_q,
                                disable_sg_loadstore>(arg_tp, res_tp, nelems));
         }
     });
+
     return comp_ev;
 }
 
@@ -382,8 +388,8 @@ template <typename argT1,
           typename argT2,
           typename resT,
           typename BinaryOperatorT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 struct BinaryContigFunctor
 {
@@ -404,30 +410,29 @@ struct BinaryContigFunctor
 
     void operator()(sycl::nd_item<1> ndit) const
     {
+        constexpr std::uint32_t elems_per_wi = n_vecs * vec_sz;
         BinaryOperatorT op{};
         /* Each work-item processes vec_sz elements, contiguous in memory */
+        /* NOTE: work-group size must be divisible by sub-group size */
 
         if constexpr (enable_sg_loadstore &&
                       BinaryOperatorT::supports_sg_loadstore::value &&
-                      BinaryOperatorT::supports_vec::value)
+                      BinaryOperatorT::supports_vec::value && (vec_sz > 1))
         {
             auto sg = ndit.get_sub_group();
-            std::uint8_t sgSize = sg.get_local_range()[0];
-            std::uint8_t maxsgSize = sg.get_max_local_range()[0];
+            std::uint16_t sgSize = sg.get_max_local_range()[0];
 
-            size_t base = n_vecs * vec_sz *
+            size_t base = static_cast<size_t>(elems_per_wi) *
                           (ndit.get_group(0) * ndit.get_local_range(0) +
                            sg.get_group_id()[0] * sgSize);
 
-            if ((base + n_vecs * vec_sz * sgSize < nelems_) &&
-                (sgSize == maxsgSize))
-            {
+            if (base + static_cast<size_t>(elems_per_wi * sgSize) < nelems_) {
                 sycl::vec<argT1, vec_sz> arg1_vec;
                 sycl::vec<argT2, vec_sz> arg2_vec;
                 sycl::vec<resT, vec_sz> res_vec;
 
 #pragma unroll
-                for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
+                for (std::uint32_t it = 0; it < elems_per_wi; it += vec_sz) {
                     size_t offset = base + static_cast<size_t>(it) *
                                                static_cast<size_t>(sgSize);
                     auto in1_multi_ptr = sycl::address_space_cast<
@@ -447,9 +452,8 @@ struct BinaryContigFunctor
                 }
             }
             else {
-                for (size_t k = base + sg.get_local_id()[0]; k < nelems_;
-                     k += sgSize)
-                {
+                const std::size_t lane_id = sg.get_local_id()[0];
+                for (size_t k = base + lane_id; k < nelems_; k += sgSize) {
                     out[k] = op(in1[k], in2[k]);
                 }
             }
@@ -458,22 +462,19 @@ struct BinaryContigFunctor
                            BinaryOperatorT::supports_sg_loadstore::value)
         {
             auto sg = ndit.get_sub_group();
-            std::uint8_t sgSize = sg.get_local_range()[0];
-            std::uint8_t maxsgSize = sg.get_max_local_range()[0];
+            std::uint8_t sgSize = sg.get_max_local_range()[0];
 
-            size_t base = n_vecs * vec_sz *
+            size_t base = static_cast<size_t>(elems_per_wi) *
                           (ndit.get_group(0) * ndit.get_local_range(0) +
                            sg.get_group_id()[0] * sgSize);
 
-            if ((base + n_vecs * vec_sz * sgSize < nelems_) &&
-                (sgSize == maxsgSize))
-            {
+            if (base + static_cast<size_t>(elems_per_wi * sgSize) < nelems_) {
                 sycl::vec<argT1, vec_sz> arg1_vec;
                 sycl::vec<argT2, vec_sz> arg2_vec;
                 sycl::vec<resT, vec_sz> res_vec;
 
 #pragma unroll
-                for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
+                for (std::uint32_t it = 0; it < elems_per_wi; it += vec_sz) {
                     size_t offset = base + static_cast<size_t>(it) *
                                                static_cast<size_t>(sgSize);
                     auto in1_multi_ptr = sycl::address_space_cast<
@@ -497,20 +498,20 @@ struct BinaryContigFunctor
                 }
             }
             else {
-                for (size_t k = base + sg.get_local_id()[0]; k < nelems_;
-                     k += sgSize)
-                {
+                const std::size_t lane_id = sg.get_local_id()[0];
+                for (size_t k = base + lane_id; k < nelems_; k += sgSize) {
                     out[k] = op(in1[k], in2[k]);
                 }
             }
         }
         else {
-            std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0];
-            size_t base = ndit.get_global_linear_id();
+            const size_t sgSize = ndit.get_sub_group().get_local_range()[0];
+            const size_t gid = ndit.get_global_linear_id();
+            const size_t elems_per_sg = sgSize * elems_per_wi;
 
-            base = (base / sgSize) * sgSize * n_vecs * vec_sz + (base % sgSize);
+            const size_t base = (gid / sgSize) * elems_per_sg + (gid % sgSize);
             for (size_t offset = base;
-                 offset < std::min(nelems_, base + sgSize * (n_vecs * vec_sz));
+                 offset < std::min(nelems_, base + elems_per_sg);
                  offset += sgSize)
             {
                 out[offset] = op(in1[offset], in2[offset]);
@@ -582,14 +583,16 @@ struct BinaryContigMatrixContigRowBroadcastingFunctor
 
     void operator()(sycl::nd_item<1> ndit) const
     {
+        /* NOTE: work-group size must be divisible by sub-group size */
+
         BinaryOperatorT op{};
         static_assert(BinaryOperatorT::supports_sg_loadstore::value);
 
-        auto sg = ndit.get_sub_group();
-        size_t gid = ndit.get_global_linear_id();
+        const auto &sg = ndit.get_sub_group();
+        const size_t gid = ndit.get_global_linear_id();
 
-        std::uint8_t sgSize = sg.get_local_range()[0];
-        size_t base = gid - sg.get_local_id()[0];
+        const size_t sgSize = sg.get_max_local_range()[0];
+        const size_t base = gid - sg.get_local_id()[0];
 
         if (base + sgSize < n_elems) {
             auto in1_multi_ptr = sycl::address_space_cast<
@@ -612,9 +615,8 @@ struct BinaryContigMatrixContigRowBroadcastingFunctor
             sg.store(out_multi_ptr, res_el);
         }
         else {
-            for (size_t k = base + sg.get_local_id()[0]; k < n_elems;
-                 k += sgSize)
-            {
+            const size_t lane_id = sg.get_local_id()[0];
+            for (size_t k = base + lane_id; k < n_elems; k += sgSize) {
                 res[k] = op(mat[k], padded_vec[k % n1]);
             }
         }
@@ -647,14 +649,15 @@ struct BinaryContigRowContigMatrixBroadcastingFunctor
 
     void operator()(sycl::nd_item<1> ndit) const
     {
+        /* NOTE: work-group size must be divisible by sub-group size */
         BinaryOperatorT op{};
         static_assert(BinaryOperatorT::supports_sg_loadstore::value);
 
-        auto sg = ndit.get_sub_group();
+        const auto &sg = ndit.get_sub_group();
         size_t gid = ndit.get_global_linear_id();
 
-        std::uint8_t sgSize = sg.get_local_range()[0];
-        size_t base = gid - sg.get_local_id()[0];
+        const size_t sgSize = sg.get_max_local_range()[0];
+        const size_t base = gid - sg.get_local_id()[0];
 
         if (base + sgSize < n_elems) {
             auto in1_multi_ptr = sycl::address_space_cast<
@@ -677,9 +680,8 @@ struct BinaryContigRowContigMatrixBroadcastingFunctor
             sg.store(out_multi_ptr, res_el);
         }
         else {
-            for (size_t k = base + sg.get_local_id()[0]; k < n_elems;
-                 k += sgSize)
-            {
+            const size_t lane_id = sg.get_local_id()[0];
+            for (size_t k = base + lane_id; k < n_elems; k += sgSize) {
                 res[k] = op(padded_vec[k % n1], mat[k]);
             }
         }
@@ -775,8 +777,8 @@ template <typename argTy1,
                     unsigned int vs,
                     unsigned int nv>
           class kernel_name,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u>
 sycl::event binary_contig_impl(sycl::queue &exec_q,
                                size_t nelems,
                                const char *arg1_p,
@@ -787,30 +789,33 @@ sycl::event binary_contig_impl(sycl::queue &exec_q,
                                ssize_t res_offset,
                                const std::vector<sycl::event> &depends = {})
 {
-    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
+    const size_t n_work_items_needed = nelems / (n_vecs * vec_sz);
+    const size_t lws = select_lws(exec_q.get_device(), n_work_items_needed);
 
-        const size_t lws = 128;
-        const size_t n_groups =
-            ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
-        const auto gws_range = sycl::range<1>(n_groups * lws);
-        const auto lws_range = sycl::range<1>(lws);
+    const size_t n_groups =
+        ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
+    const auto gws_range = sycl::range<1>(n_groups * lws);
+    const auto lws_range = sycl::range<1>(lws);
 
-        using resTy = typename BinaryOutputType<argTy1, argTy2>::value_type;
+    using resTy = typename BinaryOutputType<argTy1, argTy2>::value_type;
+    using BaseKernelName = kernel_name<argTy1, argTy2, resTy, vec_sz, n_vecs>;
 
-        const argTy1 *arg1_tp =
-            reinterpret_cast<const argTy1 *>(arg1_p) + arg1_offset;
-        const argTy2 *arg2_tp =
-            reinterpret_cast<const argTy2 *>(arg2_p) + arg2_offset;
-        resTy *res_tp = reinterpret_cast<resTy *>(res_p) + res_offset;
+    const argTy1 *arg1_tp =
+        reinterpret_cast<const argTy1 *>(arg1_p) + arg1_offset;
+    const argTy2 *arg2_tp =
+        reinterpret_cast<const argTy2 *>(arg2_p) + arg2_offset;
+    resTy *res_tp = reinterpret_cast<resTy *>(res_p) + res_offset;
+
+    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
 
         if (is_aligned<required_alignment>(arg1_tp) &&
             is_aligned<required_alignment>(arg2_tp) &&
             is_aligned<required_alignment>(res_tp))
         {
             constexpr bool enable_sg_loadstore = true;
-            using KernelName =
-                kernel_name<argTy1, argTy2, resTy, vec_sz, n_vecs>;
+            using KernelName = BaseKernelName;
+
             cgh.parallel_for<KernelName>(
                 sycl::nd_range<1>(gws_range, lws_range),
                 BinaryContigFunctorT<argTy1, argTy2, resTy, vec_sz, n_vecs,
@@ -819,10 +824,8 @@ sycl::event binary_contig_impl(sycl::queue &exec_q,
         }
         else {
             constexpr bool disable_sg_loadstore = false;
-            using InnerKernelName =
-                kernel_name<argTy1, argTy2, resTy, vec_sz, n_vecs>;
             using KernelName =
-                disabled_sg_loadstore_wrapper_krn<InnerKernelName>;
+                disabled_sg_loadstore_wrapper_krn<BaseKernelName>;
             cgh.parallel_for<KernelName>(
                 sycl::nd_range<1>(gws_range, lws_range),
                 BinaryContigFunctorT<argTy1, argTy2, resTy, vec_sz, n_vecs,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
index cbb079e3c5..5fc0775c8d 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
@@ -51,8 +51,8 @@ using dpctl::tensor::kernels::alignment_utils::required_alignment;
 template <typename argT,
           typename resT,
           typename BinaryInplaceOperatorT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 struct BinaryInplaceContigFunctor
 {
@@ -72,29 +72,29 @@ struct BinaryInplaceContigFunctor
     void operator()(sycl::nd_item<1> ndit) const
     {
         BinaryInplaceOperatorT op{};
+        constexpr std::uint32_t elems_per_wi = vec_sz * n_vecs;
         /* Each work-item processes vec_sz elements, contiguous in memory */
+        /* NB: Workgroup size must be divisible by sub-group size */
 
         if constexpr (enable_sg_loadstore &&
                       BinaryInplaceOperatorT::supports_sg_loadstore::value &&
-                      BinaryInplaceOperatorT::supports_vec::value)
+                      BinaryInplaceOperatorT::supports_vec::value &&
+                      (vec_sz > 1))
         {
             auto sg = ndit.get_sub_group();
-            std::uint8_t sgSize = sg.get_local_range()[0];
-            std::uint8_t maxsgSize = sg.get_max_local_range()[0];
+            std::uint8_t sgSize = sg.get_max_local_range()[0];
 
-            size_t base = n_vecs * vec_sz *
+            size_t base = static_cast<size_t>(elems_per_wi) *
                           (ndit.get_group(0) * ndit.get_local_range(0) +
                            sg.get_group_id()[0] * sgSize);
 
-            if ((base + n_vecs * vec_sz * sgSize < nelems_) &&
-                (sgSize == maxsgSize))
-            {
+            if (base + static_cast<size_t>(elems_per_wi * sgSize) < nelems_) {
 
                 sycl::vec<argT, vec_sz> arg_vec;
                 sycl::vec<resT, vec_sz> res_vec;
 
 #pragma unroll
-                for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
+                for (std::uint32_t it = 0; it < elems_per_wi; it += vec_sz) {
                     auto rhs_multi_ptr = sycl::address_space_cast<
                         sycl::access::address_space::global_space,
                         sycl::access::decorated::yes>(&rhs[base + it * sgSize]);
@@ -110,9 +110,8 @@ struct BinaryInplaceContigFunctor
                 }
             }
             else {
-                for (size_t k = base + sg.get_local_id()[0]; k < nelems_;
-                     k += sgSize)
-                {
+                const size_t lane_id = sg.get_local_id()[0];
+                for (size_t k = base + lane_id; k < nelems_; k += sgSize) {
                     op(lhs[k], rhs[k]);
                 }
             }
@@ -121,21 +120,18 @@ struct BinaryInplaceContigFunctor
                            BinaryInplaceOperatorT::supports_sg_loadstore::value)
         {
             auto sg = ndit.get_sub_group();
-            std::uint8_t sgSize = sg.get_local_range()[0];
-            std::uint8_t maxsgSize = sg.get_max_local_range()[0];
+            std::uint32_t sgSize = sg.get_max_local_range()[0];
 
-            size_t base = n_vecs * vec_sz *
+            size_t base = static_cast<size_t>(elems_per_wi) *
                           (ndit.get_group(0) * ndit.get_local_range(0) +
                            sg.get_group_id()[0] * sgSize);
 
-            if ((base + n_vecs * vec_sz * sgSize < nelems_) &&
-                (sgSize == maxsgSize))
-            {
+            if (base + static_cast<size_t>(elems_per_wi * sgSize) < nelems_) {
                 sycl::vec<argT, vec_sz> arg_vec;
                 sycl::vec<resT, vec_sz> res_vec;
 
 #pragma unroll
-                for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
+                for (std::uint32_t it = 0; it < elems_per_wi; it += vec_sz) {
                     auto rhs_multi_ptr = sycl::address_space_cast<
                         sycl::access::address_space::global_space,
                         sycl::access::decorated::yes>(&rhs[base + it * sgSize]);
@@ -146,27 +142,27 @@ struct BinaryInplaceContigFunctor
                     arg_vec = sg.load<vec_sz>(rhs_multi_ptr);
                     res_vec = sg.load<vec_sz>(lhs_multi_ptr);
 #pragma unroll
-                    for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) {
+                    for (std::uint32_t vec_id = 0; vec_id < vec_sz; ++vec_id) {
                         op(res_vec[vec_id], arg_vec[vec_id]);
                     }
                     sg.store<vec_sz>(lhs_multi_ptr, res_vec);
                 }
             }
             else {
-                for (size_t k = base + sg.get_local_id()[0]; k < nelems_;
-                     k += sgSize)
-                {
+                const size_t lane_id = sg.get_local_id()[0];
+                for (size_t k = base + lane_id; k < nelems_; k += sgSize) {
                     op(lhs[k], rhs[k]);
                 }
             }
         }
         else {
-            std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0];
+            const size_t sgSize = ndit.get_sub_group().get_local_range()[0];
             size_t base = ndit.get_global_linear_id();
+            const size_t elems_per_sg = elems_per_wi * sgSize;
 
-            base = (base / sgSize) * sgSize * n_vecs * vec_sz + (base % sgSize);
+            base = (base / sgSize) * elems_per_sg + (base % sgSize);
             for (size_t offset = base;
-                 offset < std::min(nelems_, base + sgSize * (n_vecs * vec_sz));
+                 offset < std::min(nelems_, base + elems_per_sg);
                  offset += sgSize)
             {
                 op(lhs[offset], rhs[offset]);
@@ -228,13 +224,14 @@ struct BinaryInplaceRowMatrixBroadcastingFunctor
 
     void operator()(sycl::nd_item<1> ndit) const
     {
+        /* Workgroup size is expected to be a multiple of sub-group size */
         BinaryOperatorT op{};
         static_assert(BinaryOperatorT::supports_sg_loadstore::value);
 
         auto sg = ndit.get_sub_group();
         size_t gid = ndit.get_global_linear_id();
 
-        std::uint8_t sgSize = sg.get_local_range()[0];
+        std::uint8_t sgSize = sg.get_max_local_range()[0];
         size_t base = gid - sg.get_local_id()[0];
 
         if (base + sgSize < n_elems) {
@@ -307,8 +304,8 @@ template <typename argTy,
           class BinaryInplaceContigFunctorT,
           template <typename T1, typename T2, unsigned int vs, unsigned int nv>
           class kernel_name,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u>
 sycl::event
 binary_inplace_contig_impl(sycl::queue &exec_q,
                            size_t nelems,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp
index 4953feedb2..b39a606108 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp
@@ -31,10 +31,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -82,8 +84,8 @@ template <typename argT, typename resT> struct ConjFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using ConjContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -119,6 +121,25 @@ template <typename T> struct ConjOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy> struct ConjContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class conj_contig_kernel;
 
@@ -129,9 +150,12 @@ sycl::event conj_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = ConjContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vecs = ConjContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, ConjOutputType, ConjContigFunctor, conj_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, ConjOutputType, ConjContigFunctor, conj_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct ConjContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp
index 92997b572b..db469a41ca 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp
@@ -29,6 +29,8 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -82,8 +84,8 @@ template <typename argT1, typename argT2, typename resT> struct CopysignFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using CopysignContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
@@ -117,6 +119,26 @@ template <typename T1, typename T2> struct CopysignOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct CopysignContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename argT1,
           typename argT2,
           typename resT,
@@ -135,10 +157,16 @@ sycl::event copysign_contig_impl(sycl::queue &exec_q,
                                  ssize_t res_offset,
                                  const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        CopysignContigHyperparameterSet<argTy1, argTy2>::vec_sz;
+    constexpr unsigned int n_vecs =
+        CopysignContigHyperparameterSet<argTy1, argTy2>::n_vecs;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, CopysignOutputType, CopysignContigFunctor,
-        copysign_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                                arg2_offset, res_p, res_offset, depends);
+        copysign_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct CopysignContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp
index 8b6b0c5fbe..b98f177777 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp
@@ -29,10 +29,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -163,8 +165,8 @@ template <typename argT, typename resT> struct CosFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using CosContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -192,6 +194,25 @@ template <typename T> struct CosOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy> struct CosContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class cos_contig_kernel;
 
@@ -202,9 +223,12 @@ sycl::event cos_contig_impl(sycl::queue &exec_q,
                             char *res_p,
                             const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = CosContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vecs = CosContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, CosOutputType, CosContigFunctor, cos_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, CosOutputType, CosContigFunctor, cos_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct CosContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp
index cff1038ed9..41bc33084c 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp
@@ -29,10 +29,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -153,8 +155,8 @@ template <typename argT, typename resT> struct CoshFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using CoshContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -181,6 +183,25 @@ template <typename T> struct CoshOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy> struct CoshContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class cosh_contig_kernel;
 
@@ -191,9 +212,12 @@ sycl::event cosh_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = CoshContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vecs = CoshContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, CoshOutputType, CoshContigFunctor, cosh_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, CoshOutputType, CoshContigFunctor, cosh_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct CoshContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp
index d368658afc..044d6d00b3 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp
@@ -30,6 +30,8 @@
 #include <type_traits>
 
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -119,8 +121,8 @@ template <typename argT1, typename argT2, typename resT> struct EqualFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using EqualContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
@@ -189,6 +191,25 @@ template <typename T1, typename T2> struct EqualOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2> struct EqualContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename argT1,
           typename argT2,
           typename resT,
@@ -207,10 +228,16 @@ sycl::event equal_contig_impl(sycl::queue &exec_q,
                               ssize_t res_offset,
                               const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        EqualContigHyperparameterSet<argTy1, argTy2>::vec_sz;
+    constexpr unsigned int n_vecs =
+        EqualContigHyperparameterSet<argTy1, argTy2>::n_vecs;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, EqualOutputType, EqualContigFunctor,
-        equal_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                             arg2_offset, res_p, res_offset, depends);
+        equal_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p,
+                                             arg1_offset, arg2_p, arg2_offset,
+                                             res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct EqualContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp
index 7e613c9731..6eb8d13cf5 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp
@@ -29,10 +29,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -122,8 +124,8 @@ template <typename argT, typename resT> struct ExpFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using ExpContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -150,6 +152,25 @@ template <typename T> struct ExpOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy> struct ExpContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class exp_contig_kernel;
 
@@ -160,9 +181,12 @@ sycl::event exp_contig_impl(sycl::queue &exec_q,
                             char *res_p,
                             const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = ExpContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vecs = ExpContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, ExpOutputType, ExpContigFunctor, exp_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, ExpOutputType, ExpContigFunctor, exp_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct ExpContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
index b436bb3855..ed45ee45cd 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
@@ -30,10 +30,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -124,8 +126,8 @@ template <typename argT, typename resT> struct Exp2Functor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using Exp2ContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -152,6 +154,25 @@ template <typename T> struct Exp2OutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy> struct Exp2ContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class exp2_contig_kernel;
 
@@ -162,9 +183,12 @@ sycl::event exp2_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = Exp2ContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vecs = Exp2ContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, Exp2OutputType, Exp2ContigFunctor, exp2_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, Exp2OutputType, Exp2ContigFunctor, exp2_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct Exp2ContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp
index 9a9d0a1562..87cbb70860 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp
@@ -31,9 +31,11 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -136,8 +138,8 @@ template <typename argT, typename resT> struct Expm1Functor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using Expm1ContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -165,6 +167,25 @@ template <typename T> struct Expm1OutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy> struct Expm1ContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class expm1_contig_kernel;
 
@@ -175,9 +196,12 @@ sycl::event expm1_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = Expm1ContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vecs = Expm1ContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, Expm1OutputType, Expm1ContigFunctor, expm1_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, Expm1OutputType, Expm1ContigFunctor, expm1_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct Expm1ContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp
index 530dd3d9aa..918f21133e 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp
@@ -29,9 +29,11 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -78,8 +80,8 @@ template <typename argT, typename resT> struct FloorFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using FloorContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -112,6 +114,25 @@ template <typename T> struct FloorOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy> struct FloorContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class floor_contig_kernel;
 
@@ -122,9 +143,12 @@ sycl::event floor_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = FloorContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vecs = FloorContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, FloorOutputType, FloorContigFunctor, floor_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, FloorOutputType, FloorContigFunctor, floor_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct FloorContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
index 72ee3a789a..302631ff38 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
@@ -29,6 +29,8 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -126,8 +128,8 @@ struct FloorDivideFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using FloorDivideContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -201,6 +203,26 @@ template <typename T1, typename T2> struct FloorDivideOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct FloorDivideContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename argT1,
           typename argT2,
           typename resT,
@@ -220,10 +242,16 @@ floor_divide_contig_impl(sycl::queue &exec_q,
                          ssize_t res_offset,
                          const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        FloorDivideContigHyperparameterSet<argTy1, argTy2>::vec_sz;
+    constexpr unsigned int n_vecs =
+        FloorDivideContigHyperparameterSet<argTy1, argTy2>::n_vecs;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, FloorDivideOutputType, FloorDivideContigFunctor,
-        floor_divide_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                                    arg2_offset, res_p, res_offset, depends);
+        floor_divide_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2>
@@ -367,8 +395,8 @@ template <typename argT, typename resT> struct FloorDivideInplaceFunctor
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using FloorDivideInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
@@ -440,10 +468,15 @@ floor_divide_inplace_contig_impl(sycl::queue &exec_q,
                                  ssize_t res_offset,
                                  const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        FloorDivideContigHyperparameterSet<resTy, argTy>::vec_sz;
+    constexpr unsigned int n_vecs =
+        FloorDivideContigHyperparameterSet<resTy, argTy>::n_vecs;
+
     return elementwise_common::binary_inplace_contig_impl<
         argTy, resTy, FloorDivideInplaceContigFunctor,
-        floor_divide_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset,
-                                            res_p, res_offset, depends);
+        floor_divide_inplace_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2>
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp
index 05c2a36b0c..0191016988 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp
@@ -30,6 +30,8 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "utils/math_utils.hpp"
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
@@ -120,8 +122,8 @@ template <typename argT1, typename argT2, typename resT> struct GreaterFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using GreaterContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
@@ -190,6 +192,26 @@ template <typename T1, typename T2> struct GreaterOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct GreaterContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename argT1,
           typename argT2,
           typename resT,
@@ -208,10 +230,16 @@ sycl::event greater_contig_impl(sycl::queue &exec_q,
                                 ssize_t res_offset,
                                 const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        GreaterContigHyperparameterSet<argTy1, argTy2>::vec_sz;
+    constexpr unsigned int n_vecs =
+        GreaterContigHyperparameterSet<argTy1, argTy2>::n_vecs;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, GreaterOutputType, GreaterContigFunctor,
-        greater_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                               arg2_offset, res_p, res_offset, depends);
+        greater_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p,
+                                               arg1_offset, arg2_p, arg2_offset,
+                                               res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct GreaterContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp
index 43e4e98db1..1ef95f59fe 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp
@@ -30,6 +30,8 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "utils/math_utils.hpp"
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
@@ -121,8 +123,8 @@ struct GreaterEqualFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using GreaterEqualContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -191,6 +193,26 @@ template <typename T1, typename T2> struct GreaterEqualOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct GreaterEqualContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename argT1,
           typename argT2,
           typename resT,
@@ -210,11 +232,16 @@ greater_equal_contig_impl(sycl::queue &exec_q,
                           ssize_t res_offset,
                           const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        GreaterEqualContigHyperparameterSet<argTy1, argTy2>::vec_sz;
+    constexpr unsigned int n_vecs =
+        GreaterEqualContigHyperparameterSet<argTy1, argTy2>::n_vecs;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, GreaterEqualOutputType, GreaterEqualContigFunctor,
-        greater_equal_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset,
-                                     arg2_p, arg2_offset, res_p, res_offset,
-                                     depends);
+        greater_equal_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2>
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp
index c5b68644a9..0bb2b07151 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp
@@ -29,6 +29,8 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -84,8 +86,8 @@ template <typename argT1, typename argT2, typename resT> struct HypotFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using HypotContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
@@ -119,6 +121,25 @@ template <typename T1, typename T2> struct HypotOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+// Compile-time vec_sz/n_vecs selection used by hypot_contig_impl.
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+// NOTE(review): BinaryContigHyperparameterSetEntry is unused in this block.
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+// argTy1/argTy2 are unused: std::disjunction lists only the <4u, 2u> default.
+template <typename argTy1, typename argTy2> struct HypotContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename argT1,
           typename argT2,
           typename resT,
@@ -137,10 +158,16 @@ sycl::event hypot_contig_impl(sycl::queue &exec_q,
                               ssize_t res_offset,
                               const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        HypotContigHyperparameterSet<argTy1, argTy2>::vec_sz;
+    constexpr unsigned int n_vecs =
+        HypotContigHyperparameterSet<argTy1, argTy2>::n_vecs;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, HypotOutputType, HypotContigFunctor,
-        hypot_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                             arg2_offset, res_p, res_offset, depends);
+        hypot_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p,
+                                             arg1_offset, arg2_p, arg2_offset,
+                                             res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct HypotContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp
index e918bc0ac7..4e86278f27 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp
@@ -31,9 +31,11 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -78,8 +80,8 @@ template <typename argT, typename resT> struct ImagFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using ImagContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -115,6 +117,25 @@ template <typename T> struct ImagOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+// Compile-time vec_sz/n_vecs selection used by imag_contig_impl.
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+// NOTE(review): UnaryContigHyperparameterSetEntry is unused in this block.
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+// argTy is unused: std::disjunction lists only the <4u, 2u> default.
+template <typename argTy> struct ImagContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class imag_contig_kernel;
 
@@ -125,9 +146,12 @@ sycl::event imag_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = ImagContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vecs = ImagContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, ImagOutputType, ImagContigFunctor, imag_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, ImagOutputType, ImagContigFunctor, imag_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct ImagContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp
index df979eec76..2d444b3274 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp
@@ -30,6 +30,8 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -97,8 +99,8 @@ template <typename argT, typename resT> struct IsFiniteFunctor
 
 template <typename argT,
           typename resT = bool,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using IsFiniteContigFunctor =
     elementwise_common::UnaryContigFunctor<argT,
@@ -117,6 +119,25 @@ template <typename argTy> struct IsFiniteOutputType
     using value_type = bool;
 };
 
+namespace
+{
+// Compile-time vec_sz/n_vecs selection used by isfinite_contig_impl.
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+// NOTE(review): UnaryContigHyperparameterSetEntry is unused in this block.
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+// argTy is unused: std::disjunction lists only the <4u, 2u> default.
+template <typename argTy> struct IsFiniteContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class isfinite_contig_kernel;
 
@@ -127,10 +148,15 @@ sycl::event isfinite_contig_impl(sycl::queue &exec_q,
                                  char *res_p,
                                  const std::vector<sycl::event> &depends = {})
 {
-    return elementwise_common::unary_contig_impl<argTy, IsFiniteOutputType,
-                                                 IsFiniteContigFunctor,
-                                                 isfinite_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+    constexpr unsigned int vec_sz =
+        IsFiniteContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vecs =
+        IsFiniteContigHyperparameterSet<argTy>::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, IsFiniteOutputType, IsFiniteContigFunctor,
+        isfinite_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p,
+                                                depends);
 }
 
 template <typename fnT, typename T> struct IsFiniteContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp
index 24be019a44..8db812bbed 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp
@@ -30,7 +30,10 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "kernels/dpctl_tensor_types.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -96,8 +99,8 @@ template <typename argT, typename resT> struct IsInfFunctor
 
 template <typename argT,
           typename resT = bool,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using IsInfContigFunctor =
     elementwise_common::UnaryContigFunctor<argT,
@@ -116,6 +119,25 @@ template <typename argTy> struct IsInfOutputType
     using value_type = bool;
 };
 
+namespace
+{
+// Compile-time vec_sz/n_vecs selection used by isinf_contig_impl.
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+// NOTE(review): UnaryContigHyperparameterSetEntry is unused in this block.
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+// argTy is unused: std::disjunction lists only the <4u, 2u> default.
+template <typename argTy> struct IsInfContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class isinf_contig_kernel;
 
@@ -126,9 +148,12 @@ sycl::event isinf_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = IsInfContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vecs = IsInfContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, IsInfOutputType, IsInfContigFunctor, isinf_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, IsInfOutputType, IsInfContigFunctor, isinf_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct IsInfContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp
index cc452a25b1..512e1422a3 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp
@@ -29,7 +29,10 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "kernels/dpctl_tensor_types.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -94,8 +97,8 @@ template <typename argT, typename resT> struct IsNanFunctor
 
 template <typename argT,
           typename resT = bool,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using IsNanContigFunctor =
     elementwise_common::UnaryContigFunctor<argT,
@@ -114,6 +117,25 @@ template <typename argTy> struct IsNanOutputType
     using value_type = bool;
 };
 
+namespace
+{
+// Compile-time vec_sz/n_vecs selection used by isnan_contig_impl.
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+// NOTE(review): UnaryContigHyperparameterSetEntry is unused in this block.
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+// argTy is unused: std::disjunction lists only the <4u, 2u> default.
+template <typename argTy> struct IsNanContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class isnan_contig_kernel;
 
@@ -124,9 +146,12 @@ sycl::event isnan_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = IsNanContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vecs = IsNanContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, IsNanOutputType, IsNanContigFunctor, isnan_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, IsNanOutputType, IsNanContigFunctor, isnan_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct IsNanContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp
index 0b26342563..e69d4ec257 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp
@@ -29,12 +29,14 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/dpctl_tensor_types.hpp"
+#include "vec_size_util.hpp"
+
 #include "utils/math_utils.hpp"
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
 
+#include "kernels/dpctl_tensor_types.hpp"
 #include "kernels/elementwise_functions/common.hpp"
 
 namespace dpctl
@@ -118,8 +120,8 @@ template <typename argT1, typename argT2, typename resT> struct LessFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using LessContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
@@ -188,6 +190,25 @@ template <typename T1, typename T2> struct LessOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+// Compile-time vec_sz/n_vecs selection used by less_contig_impl.
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+// NOTE(review): BinaryContigHyperparameterSetEntry is unused in this block.
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+// argTy1/argTy2 are unused: std::disjunction lists only the <4u, 2u> default.
+template <typename argTy1, typename argTy2> struct LessContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename argT1,
           typename argT2,
           typename resT,
@@ -206,10 +227,15 @@ sycl::event less_contig_impl(sycl::queue &exec_q,
                              ssize_t res_offset,
                              const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        LessContigHyperparameterSet<argTy1, argTy2>::vec_sz;
+    constexpr unsigned int n_vecs =
+        LessContigHyperparameterSet<argTy1, argTy2>::n_vecs;
+
     return elementwise_common::binary_contig_impl<
-        argTy1, argTy2, LessOutputType, LessContigFunctor, less_contig_kernel>(
-        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
-        res_offset, depends);
+        argTy1, argTy2, LessOutputType, LessContigFunctor, less_contig_kernel,
+        vec_sz, n_vecs>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
+                        arg2_offset, res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct LessContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp
index 01289ae98f..08624e3a8e 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp
@@ -30,6 +30,8 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "utils/math_utils.hpp"
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
@@ -119,8 +121,8 @@ template <typename argT1, typename argT2, typename resT> struct LessEqualFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using LessEqualContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -189,6 +191,26 @@ template <typename T1, typename T2> struct LessEqualOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+// Compile-time vec_sz/n_vecs selection used by less_equal_contig_impl.
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+// NOTE(review): BinaryContigHyperparameterSetEntry is unused in this block.
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+// argTy1/argTy2 are unused: std::disjunction lists only the <4u, 2u> default.
+template <typename argTy1, typename argTy2>
+struct LessEqualContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename argT1,
           typename argT2,
           typename resT,
@@ -207,10 +229,16 @@ sycl::event less_equal_contig_impl(sycl::queue &exec_q,
                                    ssize_t res_offset,
                                    const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        LessEqualContigHyperparameterSet<argTy1, argTy2>::vec_sz;
+    constexpr unsigned int n_vecs =
+        LessEqualContigHyperparameterSet<argTy1, argTy2>::n_vecs;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, LessEqualOutputType, LessEqualContigFunctor,
-        less_equal_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                                  arg2_offset, res_p, res_offset, depends);
+        less_equal_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct LessEqualContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp
index a3e28ef5d7..7020603250 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp
@@ -30,10 +30,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -78,8 +80,8 @@ template <typename argT, typename resT> struct LogFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using LogContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -107,6 +109,25 @@ template <typename T> struct LogOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+// Compile-time vec_sz/n_vecs selection used by log_contig_impl.
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+// NOTE(review): UnaryContigHyperparameterSetEntry is unused in this block.
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+// argTy is unused: std::disjunction lists only the <4u, 2u> default.
+template <typename argTy> struct LogContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class log_contig_kernel;
 
@@ -117,9 +138,12 @@ sycl::event log_contig_impl(sycl::queue &exec_q,
                             char *res_p,
                             const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = LogContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vecs = LogContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, LogOutputType, LogContigFunctor, log_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, LogOutputType, LogContigFunctor, log_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct LogContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp
index 793b910f69..c7a4ac50bb 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp
@@ -31,10 +31,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -97,8 +99,8 @@ template <typename argT, typename resT> struct Log10Functor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using Log10ContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -126,6 +128,25 @@ template <typename T> struct Log10OutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+// Compile-time vec_sz/n_vecs selection used by log10_contig_impl.
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+// NOTE(review): UnaryContigHyperparameterSetEntry is unused in this block.
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+// argTy is unused: std::disjunction lists only the <4u, 2u> default.
+template <typename argTy> struct Log10ContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class log10_contig_kernel;
 
@@ -136,9 +157,12 @@ sycl::event log10_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = Log10ContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vecs = Log10ContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, Log10OutputType, Log10ContigFunctor, log10_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, Log10OutputType, Log10ContigFunctor, log10_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct Log10ContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp
index 19238e7e37..aa72ed5262 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp
@@ -30,9 +30,11 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -102,8 +104,8 @@ template <typename argT, typename resT> struct Log1pFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using Log1pContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -131,6 +133,25 @@ template <typename T> struct Log1pOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+// Compile-time vec_sz/n_vecs selection used by log1p_contig_impl.
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+// NOTE(review): UnaryContigHyperparameterSetEntry is unused in this block.
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+// argTy is unused: std::disjunction lists only the <4u, 2u> default.
+template <typename argTy> struct Log1pContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class log1p_contig_kernel;
 
@@ -141,9 +162,12 @@ sycl::event log1p_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = Log1pContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vecs = Log1pContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, Log1pOutputType, Log1pContigFunctor, log1p_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, Log1pOutputType, Log1pContigFunctor, log1p_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct Log1pContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp
index 69d0022c72..18e5e42954 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp
@@ -31,10 +31,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -98,8 +100,8 @@ template <typename argT, typename resT> struct Log2Functor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using Log2ContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -127,6 +129,25 @@ template <typename T> struct Log2OutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+// Compile-time vec_sz/n_vecs selection used by log2_contig_impl.
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+// NOTE(review): UnaryContigHyperparameterSetEntry is unused in this block.
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+// argTy is unused: std::disjunction lists only the <4u, 2u> default.
+template <typename argTy> struct Log2ContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class log2_contig_kernel;
 
@@ -137,9 +158,12 @@ sycl::event log2_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = Log2ContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vecs = Log2ContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, Log2OutputType, Log2ContigFunctor, log2_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, Log2OutputType, Log2ContigFunctor, log2_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct Log2ContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
index b0be45ea54..e191d2dd72 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
@@ -31,12 +31,14 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/dpctl_tensor_types.hpp"
+#include "vec_size_util.hpp"
+
 #include "utils/math_utils.hpp"
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
 
+#include "kernels/dpctl_tensor_types.hpp"
 #include "kernels/elementwise_functions/common.hpp"
 
 namespace dpctl
@@ -99,8 +101,8 @@ template <typename argT1, typename argT2, typename resT> struct LogAddExpFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using LogAddExpContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -134,6 +136,26 @@ template <typename T1, typename T2> struct LogAddExpOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+// Compile-time vec_sz/n_vecs selection used by logaddexp_contig_impl.
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+// NOTE(review): BinaryContigHyperparameterSetEntry is unused in this block.
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+// argTy1/argTy2 are unused: std::disjunction lists only the <4u, 2u> default.
+template <typename argTy1, typename argTy2>
+struct LogAddExpContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename argT1,
           typename argT2,
           typename resT,
@@ -152,10 +174,16 @@ sycl::event logaddexp_contig_impl(sycl::queue &exec_q,
                                   ssize_t res_offset,
                                   const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        LogAddExpContigHyperparameterSet<argTy1, argTy2>::vec_sz;
+    constexpr unsigned int n_vecs =
+        LogAddExpContigHyperparameterSet<argTy1, argTy2>::n_vecs;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, LogAddExpOutputType, LogAddExpContigFunctor,
-        logaddexp_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                                 arg2_offset, res_p, res_offset, depends);
+        logaddexp_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct LogAddExpContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp
index f15caa02e6..ef01dc8a53 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp
@@ -30,11 +30,13 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/dpctl_tensor_types.hpp"
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
 
+#include "kernels/dpctl_tensor_types.hpp"
 #include "kernels/elementwise_functions/common.hpp"
 
 namespace dpctl
@@ -93,8 +95,8 @@ struct LogicalAndFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using LogicalAndContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -159,6 +161,26 @@ template <typename T1, typename T2> struct LogicalAndOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct LogicalAndContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename argT1,
           typename argT2,
           typename resT,
@@ -178,10 +200,16 @@ logical_and_contig_impl(sycl::queue &exec_q,
                         ssize_t res_offset,
                         const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        LogicalAndContigHyperparameterSet<argTy1, argTy2>::vec_sz;
+    constexpr unsigned int n_vecs =
+        LogicalAndContigHyperparameterSet<argTy1, argTy2>::n_vecs;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, LogicalAndOutputType, LogicalAndContigFunctor,
-        logical_and_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                                   arg2_offset, res_p, res_offset, depends);
+        logical_and_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct LogicalAndContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp
index 7c83e07072..22e7aaa58a 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp
@@ -30,7 +30,10 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "kernels/dpctl_tensor_types.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -66,8 +69,8 @@ template <typename argT, typename resT> struct LogicalNotFunctor
 
 template <typename argT,
           typename resT = bool,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using LogicalNotContigFunctor =
     elementwise_common::UnaryContigFunctor<argT,
@@ -89,6 +92,25 @@ template <typename argTy> struct LogicalNotOutputType
     using value_type = bool;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy> struct LogicalNotContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class logical_not_contig_kernel;
 
@@ -100,10 +122,15 @@ logical_not_contig_impl(sycl::queue &exec_q,
                         char *res_p,
                         const std::vector<sycl::event> &depends = {})
 {
-    return elementwise_common::unary_contig_impl<argTy, LogicalNotOutputType,
-                                                 LogicalNotContigFunctor,
-                                                 logical_not_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+    constexpr unsigned int vec_sz =
+        LogicalNotContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vecs =
+        LogicalNotContigHyperparameterSet<argTy>::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, LogicalNotOutputType, LogicalNotContigFunctor,
+        logical_not_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p,
+                                                   depends);
 }
 
 template <typename fnT, typename T> struct LogicalNotContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp
index 43e02f2102..56baba9367 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp
@@ -30,11 +30,13 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/dpctl_tensor_types.hpp"
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
 
+#include "kernels/dpctl_tensor_types.hpp"
 #include "kernels/elementwise_functions/common.hpp"
 
 namespace dpctl
@@ -92,8 +94,8 @@ template <typename argT1, typename argT2, typename resT> struct LogicalOrFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using LogicalOrContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -158,6 +160,26 @@ template <typename T1, typename T2> struct LogicalOrOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct LogicalOrContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename argT1,
           typename argT2,
           typename resT,
@@ -176,10 +198,16 @@ sycl::event logical_or_contig_impl(sycl::queue &exec_q,
                                    ssize_t res_offset,
                                    const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        LogicalOrContigHyperparameterSet<argTy1, argTy2>::vec_sz;
+    constexpr unsigned int n_vecs =
+        LogicalOrContigHyperparameterSet<argTy1, argTy2>::n_vecs;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, LogicalOrOutputType, LogicalOrContigFunctor,
-        logical_or_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                                  arg2_offset, res_p, res_offset, depends);
+        logical_or_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct LogicalOrContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp
index dc41760985..2beb96777a 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp
@@ -30,11 +30,13 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/dpctl_tensor_types.hpp"
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
 
+#include "kernels/dpctl_tensor_types.hpp"
 #include "kernels/elementwise_functions/common.hpp"
 
 namespace dpctl
@@ -94,8 +96,8 @@ struct LogicalXorFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using LogicalXorContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -160,6 +162,26 @@ template <typename T1, typename T2> struct LogicalXorOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct LogicalXorContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename argT1,
           typename argT2,
           typename resT,
@@ -179,10 +201,16 @@ logical_xor_contig_impl(sycl::queue &exec_q,
                         ssize_t res_offset,
                         const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        LogicalXorContigHyperparameterSet<argTy1, argTy2>::vec_sz;
+    constexpr unsigned int n_vecs =
+        LogicalXorContigHyperparameterSet<argTy1, argTy2>::n_vecs;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, LogicalXorOutputType, LogicalXorContigFunctor,
-        logical_xor_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                                   arg2_offset, res_p, res_offset, depends);
+        logical_xor_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct LogicalXorContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
index e73704bad8..dbe82bcd92 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
@@ -29,12 +29,14 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/dpctl_tensor_types.hpp"
+#include "vec_size_util.hpp"
+
 #include "utils/math_utils.hpp"
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
 
+#include "kernels/dpctl_tensor_types.hpp"
 #include "kernels/elementwise_functions/common.hpp"
 
 namespace dpctl
@@ -96,8 +98,8 @@ template <typename argT1, typename argT2, typename resT> struct MaximumFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using MaximumContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
@@ -182,6 +184,26 @@ template <typename T1, typename T2> struct MaximumOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct MaximumContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename argT1,
           typename argT2,
           typename resT,
@@ -200,10 +222,16 @@ sycl::event maximum_contig_impl(sycl::queue &exec_q,
                                 ssize_t res_offset,
                                 const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        MaximumContigHyperparameterSet<argTy1, argTy2>::vec_sz;
+    constexpr unsigned int n_vecs =
+        MaximumContigHyperparameterSet<argTy1, argTy2>::n_vecs;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, MaximumOutputType, MaximumContigFunctor,
-        maximum_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                               arg2_offset, res_p, res_offset, depends);
+        maximum_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p,
+                                               arg1_offset, arg2_p, arg2_offset,
+                                               res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct MaximumContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
index 590c0b6486..109a995e1b 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
@@ -29,12 +29,14 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/dpctl_tensor_types.hpp"
+#include "vec_size_util.hpp"
+
 #include "utils/math_utils.hpp"
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
 
+#include "kernels/dpctl_tensor_types.hpp"
 #include "kernels/elementwise_functions/common.hpp"
 
 namespace dpctl
@@ -96,8 +98,8 @@ template <typename argT1, typename argT2, typename resT> struct MinimumFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using MinimumContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
@@ -182,6 +184,26 @@ template <typename T1, typename T2> struct MinimumOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct MinimumContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename argT1,
           typename argT2,
           typename resT,
@@ -200,10 +222,16 @@ sycl::event minimum_contig_impl(sycl::queue &exec_q,
                                 ssize_t res_offset,
                                 const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        MinimumContigHyperparameterSet<argTy1, argTy2>::vec_sz;
+    constexpr unsigned int n_vecs =
+        MinimumContigHyperparameterSet<argTy1, argTy2>::n_vecs;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, MinimumOutputType, MinimumContigFunctor,
-        minimum_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                               arg2_offset, res_p, res_offset, depends);
+        minimum_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p,
+                                               arg1_offset, arg2_p, arg2_offset,
+                                               res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct MinimumContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
index 1af284f55b..a684ba27fa 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
@@ -30,12 +30,14 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/dpctl_tensor_types.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
 
+#include "kernels/dpctl_tensor_types.hpp"
 #include "kernels/elementwise_functions/common.hpp"
 #include "kernels/elementwise_functions/common_inplace.hpp"
 
@@ -98,8 +100,8 @@ template <typename argT1, typename argT2, typename resT> struct MultiplyFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using MultiplyContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
@@ -184,6 +186,26 @@ template <typename T1, typename T2> struct MultiplyOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct MultiplyContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename argT1,
           typename argT2,
           typename resT,
@@ -202,10 +224,16 @@ sycl::event multiply_contig_impl(sycl::queue &exec_q,
                                  ssize_t res_offset,
                                  const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        MultiplyContigHyperparameterSet<argTy1, argTy2>::vec_sz;
+    constexpr unsigned int n_vecs =
+        MultiplyContigHyperparameterSet<argTy1, argTy2>::n_vecs;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, MultiplyOutputType, MultiplyContigFunctor,
-        multiply_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                                arg2_offset, res_p, res_offset, depends);
+        multiply_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct MultiplyContigFactory
@@ -402,8 +430,8 @@ template <typename argT, typename resT> struct MultiplyInplaceFunctor
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using MultiplyInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
@@ -482,10 +510,15 @@ multiply_inplace_contig_impl(sycl::queue &exec_q,
                              ssize_t res_offset,
                              const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        MultiplyContigHyperparameterSet<resTy, argTy>::vec_sz;
+    constexpr unsigned int n_vecs =
+        MultiplyContigHyperparameterSet<resTy, argTy>::n_vecs;
+
     return elementwise_common::binary_inplace_contig_impl<
         argTy, resTy, MultiplyInplaceContigFunctor,
-        multiply_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset,
-                                        res_p, res_offset, depends);
+        multiply_inplace_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2>
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
index 83f17dd47b..54ce47a54e 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
@@ -30,9 +30,11 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -65,8 +67,8 @@ template <typename argT, typename resT> struct NegativeFunctor
 
 template <typename argT,
           typename resT = argT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using NegativeContigFunctor =
     elementwise_common::UnaryContigFunctor<argT,
@@ -97,6 +99,25 @@ template <typename T> struct NegativeOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy> struct NegativeContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class negative_contig_kernel;
 
@@ -107,10 +128,15 @@ sycl::event negative_contig_impl(sycl::queue &exec_q,
                                  char *res_p,
                                  const std::vector<sycl::event> &depends = {})
 {
-    return elementwise_common::unary_contig_impl<argTy, NegativeOutputType,
-                                                 NegativeContigFunctor,
-                                                 negative_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+    constexpr unsigned int vec_sz =
+        NegativeContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vecs =
+        NegativeContigHyperparameterSet<argTy>::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, NegativeOutputType, NegativeContigFunctor,
+        negative_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p,
+                                                depends);
 }
 
 template <typename fnT, typename T> struct NegativeContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp
index 5dc9ea40b3..f91df44868 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp
@@ -29,6 +29,8 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -82,8 +84,8 @@ template <typename argT1, typename argT2, typename resT> struct NextafterFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using NextafterContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -117,6 +119,26 @@ template <typename T1, typename T2> struct NextafterOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct NextafterContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename argT1,
           typename argT2,
           typename resT,
@@ -135,10 +157,16 @@ sycl::event nextafter_contig_impl(sycl::queue &exec_q,
                                   ssize_t res_offset,
                                   const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        NextafterContigHyperparameterSet<argTy1, argTy2>::vec_sz;
+    constexpr unsigned int n_vecs =
+        NextafterContigHyperparameterSet<argTy1, argTy2>::n_vecs;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, NextafterOutputType, NextafterContigFunctor,
-        nextafter_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                                 arg2_offset, res_p, res_offset, depends);
+        nextafter_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct NextafterContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
index c1b920193b..f5c6fb1fe5 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
@@ -29,11 +29,13 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/dpctl_tensor_types.hpp"
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
 
+#include "kernels/dpctl_tensor_types.hpp"
 #include "kernels/elementwise_functions/common.hpp"
 
 namespace dpctl
@@ -103,8 +105,8 @@ template <typename argT1, typename argT2, typename resT> struct NotEqualFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using NotEqualContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
@@ -173,6 +175,26 @@ template <typename T1, typename T2> struct NotEqualOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct NotEqualContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename argT1,
           typename argT2,
           typename resT,
@@ -191,10 +213,16 @@ sycl::event not_equal_contig_impl(sycl::queue &exec_q,
                                   ssize_t res_offset,
                                   const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        NotEqualContigHyperparameterSet<argTy1, argTy2>::vec_sz;
+    constexpr unsigned int n_vecs =
+        NotEqualContigHyperparameterSet<argTy1, argTy2>::n_vecs;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, NotEqualOutputType, NotEqualContigFunctor,
-        not_equal_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                                 arg2_offset, res_p, res_offset, depends);
+        not_equal_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct NotEqualContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
index ae2711ed0e..3976792af8 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
@@ -30,9 +30,11 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -80,8 +82,8 @@ template <typename argT, typename resT> struct PositiveFunctor
 
 template <typename argT,
           typename resT = argT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using PositiveContigFunctor =
     elementwise_common::UnaryContigFunctor<argT,
@@ -112,6 +114,25 @@ template <typename T> struct PositiveOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy> struct PositiveContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class positive_contig_kernel;
 
@@ -122,10 +143,15 @@ sycl::event positive_contig_impl(sycl::queue &exec_q,
                                  char *res_p,
                                  const std::vector<sycl::event> &depends = {})
 {
-    return elementwise_common::unary_contig_impl<argTy, PositiveOutputType,
-                                                 PositiveContigFunctor,
-                                                 positive_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+    constexpr unsigned int vec_sz =
+        PositiveContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vecs =
+        PositiveContigHyperparameterSet<argTy>::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, PositiveOutputType, PositiveContigFunctor,
+        positive_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p,
+                                                depends);
 }
 
 template <typename fnT, typename T> struct PositiveContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
index bb462dceae..4f01215bd3 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
@@ -30,12 +30,14 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/dpctl_tensor_types.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
 
+#include "kernels/dpctl_tensor_types.hpp"
 #include "kernels/elementwise_functions/common.hpp"
 #include "kernels/elementwise_functions/common_inplace.hpp"
 
@@ -151,8 +153,8 @@ template <typename argT1, typename argT2, typename resT> struct PowFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using PowContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
@@ -236,6 +238,25 @@ template <typename T1, typename T2> struct PowOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2> struct PowContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename argT1,
           typename argT2,
           typename resT,
@@ -254,10 +275,15 @@ sycl::event pow_contig_impl(sycl::queue &exec_q,
                             ssize_t res_offset,
                             const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        PowContigHyperparameterSet<argTy1, argTy2>::vec_sz;
+    constexpr unsigned int n_vecs =
+        PowContigHyperparameterSet<argTy1, argTy2>::n_vecs;
+
     return elementwise_common::binary_contig_impl<
-        argTy1, argTy2, PowOutputType, PowContigFunctor, pow_contig_kernel>(
-        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
-        res_offset, depends);
+        argTy1, argTy2, PowOutputType, PowContigFunctor, pow_contig_kernel,
+        vec_sz, n_vecs>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
+                        arg2_offset, res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct PowContigFactory
@@ -417,8 +443,8 @@ template <typename argT, typename resT> struct PowInplaceFunctor
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using PowInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor<
     argT,
@@ -495,9 +521,15 @@ pow_inplace_contig_impl(sycl::queue &exec_q,
                         ssize_t res_offset,
                         const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        PowContigHyperparameterSet<resTy, argTy>::vec_sz;
+    constexpr unsigned int n_vecs =
+        PowContigHyperparameterSet<resTy, argTy>::n_vecs;
+
     return elementwise_common::binary_inplace_contig_impl<
-        argTy, resTy, PowInplaceContigFunctor, pow_inplace_contig_kernel>(
-        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
+        argTy, resTy, PowInplaceContigFunctor, pow_inplace_contig_kernel,
+        vec_sz, n_vecs>(exec_q, nelems, arg_p, arg_offset, res_p, res_offset,
+                        depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct PowInplaceContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp
index 2c3dce0c9c..8fc5226c55 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp
@@ -32,9 +32,11 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -91,8 +93,8 @@ template <typename argT, typename resT> struct ProjFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using ProjContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -116,6 +118,25 @@ template <typename T> struct ProjOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy> struct ProjContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class proj_contig_kernel;
 
@@ -126,9 +147,12 @@ sycl::event proj_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = ProjContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vecs = ProjContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, ProjOutputType, ProjContigFunctor, proj_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, ProjOutputType, ProjContigFunctor, proj_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct ProjContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp
index c66e4003cb..9af7fec9e4 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp
@@ -31,9 +31,11 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -78,8 +80,8 @@ template <typename argT, typename resT> struct RealFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using RealContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -115,6 +117,25 @@ template <typename T> struct RealOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy> struct RealContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class real_contig_kernel;
 
@@ -125,9 +146,12 @@ sycl::event real_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = RealContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vecs = RealContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, RealOutputType, RealContigFunctor, real_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, RealOutputType, RealContigFunctor, real_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct RealContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp
index 4d4b70fd4f..7e314cb71d 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp
@@ -32,12 +32,14 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/dpctl_tensor_types.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
 
+#include "kernels/dpctl_tensor_types.hpp"
 #include "kernels/elementwise_functions/common.hpp"
 
 namespace dpctl
@@ -81,8 +83,8 @@ template <typename argT, typename resT> struct ReciprocalFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using ReciprocalContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -112,6 +114,25 @@ template <typename T> struct ReciprocalOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy> struct ReciprocalContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class reciprocal_contig_kernel;
 
@@ -122,10 +143,15 @@ sycl::event reciprocal_contig_impl(sycl::queue &exec_q,
                                    char *res_p,
                                    const std::vector<sycl::event> &depends = {})
 {
-    return elementwise_common::unary_contig_impl<argTy, ReciprocalOutputType,
-                                                 ReciprocalContigFunctor,
-                                                 reciprocal_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+    constexpr unsigned int vec_sz =
+        ReciprocalContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vecs =
+        ReciprocalContigHyperparameterSet<argTy>::n_vecs;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, ReciprocalOutputType, ReciprocalContigFunctor,
+        reciprocal_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p,
+                                                  depends);
 }
 
 template <typename fnT, typename T> struct ReciprocalContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
index 7bb070cc00..68e5c6134a 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
@@ -30,11 +30,13 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/dpctl_tensor_types.hpp"
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
 
+#include "kernels/dpctl_tensor_types.hpp"
 #include "kernels/elementwise_functions/common.hpp"
 #include "kernels/elementwise_functions/common_inplace.hpp"
 
@@ -144,8 +146,8 @@ template <typename argT1, typename argT2, typename resT> struct RemainderFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using RemainderContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -219,6 +221,26 @@ template <typename T1, typename T2> struct RemainderOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct RemainderContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename argT1,
           typename argT2,
           typename resT,
@@ -237,10 +259,16 @@ sycl::event remainder_contig_impl(sycl::queue &exec_q,
                                   ssize_t res_offset,
                                   const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        RemainderContigHyperparameterSet<argTy1, argTy2>::vec_sz;
+    constexpr unsigned int n_vecs =
+        RemainderContigHyperparameterSet<argTy1, argTy2>::n_vecs;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, RemainderOutputType, RemainderContigFunctor,
-        remainder_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                                 arg2_offset, res_p, res_offset, depends);
+        remainder_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct RemainderContigFactory
@@ -393,8 +421,8 @@ template <typename argT, typename resT> struct RemainderInplaceFunctor
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using RemainderInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
@@ -464,10 +492,15 @@ remainder_inplace_contig_impl(sycl::queue &exec_q,
                               ssize_t res_offset,
                               const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        RemainderContigHyperparameterSet<resTy, argTy>::vec_sz;
+    constexpr unsigned int n_vecs =
+        RemainderContigHyperparameterSet<resTy, argTy>::n_vecs;
+
     return elementwise_common::binary_inplace_contig_impl<
         argTy, resTy, RemainderInplaceContigFunctor,
-        remainder_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset,
-                                         res_p, res_offset, depends);
+        remainder_inplace_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2>
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp
index 241f75c1bb..a0358dbbba 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp
@@ -29,9 +29,11 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -87,8 +89,8 @@ template <typename argT, typename resT> struct RoundFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using RoundContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -123,6 +125,25 @@ template <typename T> struct RoundOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy> struct RoundContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class round_contig_kernel;
 
@@ -133,9 +154,12 @@ sycl::event round_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = RoundContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vecs = RoundContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, RoundOutputType, RoundContigFunctor, round_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, RoundOutputType, RoundContigFunctor, round_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct RoundContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
index 61aafb13d9..ed8e6f16f7 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
@@ -33,9 +33,11 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -68,8 +70,8 @@ template <typename argT, typename resT> struct RsqrtFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using RsqrtContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -94,6 +96,25 @@ template <typename T> struct RsqrtOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy> struct RsqrtContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class rsqrt_contig_kernel;
 
@@ -104,9 +125,12 @@ sycl::event rsqrt_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = RsqrtContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vecs = RsqrtContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, RsqrtOutputType, RsqrtContigFunctor, rsqrt_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, RsqrtOutputType, RsqrtContigFunctor, rsqrt_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct RsqrtContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp
index 651f7d5d9a..c222b063a1 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp
@@ -31,9 +31,11 @@
 #include <type_traits>
 
 #include "cabs_impl.hpp"
-#include "kernels/elementwise_functions/common.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -103,8 +105,8 @@ template <typename argT, typename resT> struct SignFunctor
 
 template <typename argT,
           typename resT = argT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using SignContigFunctor =
     elementwise_common::UnaryContigFunctor<argT,
@@ -135,6 +137,25 @@ template <typename T> struct SignOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy> struct SignContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class sign_contig_kernel;
 
@@ -145,9 +166,12 @@ sycl::event sign_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = SignContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vecs = SignContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, SignOutputType, SignContigFunctor, sign_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, SignOutputType, SignContigFunctor, sign_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct SignContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp
index e8ac7709ad..69e5cff29e 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp
@@ -30,7 +30,10 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "kernels/dpctl_tensor_types.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -74,8 +77,8 @@ template <typename argT, typename resT> struct SignbitFunctor
 
 template <typename argT,
           typename resT = bool,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using SignbitContigFunctor =
     elementwise_common::UnaryContigFunctor<argT,
@@ -100,6 +103,25 @@ template <typename argTy> struct SignbitOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy> struct SignbitContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class signbit_contig_kernel;
 
@@ -110,9 +132,14 @@ sycl::event signbit_contig_impl(sycl::queue &exec_q,
                                 char *res_p,
                                 const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        SignbitContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vecs =
+        SignbitContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, SignbitOutputType, SignbitContigFunctor, signbit_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, SignbitOutputType, SignbitContigFunctor, signbit_contig_kernel,
+        vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct SignbitContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
index 8bc12097a8..f3fe1105b6 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
@@ -29,10 +29,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -186,8 +188,8 @@ template <typename argT, typename resT> struct SinFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using SinContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -214,6 +216,25 @@ template <typename T> struct SinOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy> struct SinContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class sin_contig_kernel;
 
@@ -224,9 +245,12 @@ sycl::event sin_contig_impl(sycl::queue &exec_q,
                             char *res_p,
                             const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = SinContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vecs = SinContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, SinOutputType, SinContigFunctor, sin_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, SinOutputType, SinContigFunctor, sin_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct SinContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
index e83626e56d..a7df2789de 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
@@ -29,10 +29,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -155,8 +157,8 @@ template <typename argT, typename resT> struct SinhFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using SinhContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -183,6 +185,25 @@ template <typename T> struct SinhOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy> struct SinhContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class sinh_contig_kernel;
 
@@ -193,9 +214,12 @@ sycl::event sinh_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = SinhContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vecs = SinhContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, SinhOutputType, SinhContigFunctor, sinh_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, SinhOutputType, SinhContigFunctor, sinh_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct SinhContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
index 5adb41b20d..1bc5b25662 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
@@ -32,10 +32,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -80,8 +82,8 @@ template <typename argT, typename resT> struct SqrtFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using SqrtContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -109,6 +111,25 @@ template <typename T> struct SqrtOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy> struct SqrtContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class sqrt_contig_kernel;
 
@@ -119,9 +140,12 @@ sycl::event sqrt_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = SqrtContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vecs = SqrtContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, SqrtOutputType, SqrtContigFunctor, sqrt_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, SqrtOutputType, SqrtContigFunctor, sqrt_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct SqrtContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp
index 4b096cc291..9135515091 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp
@@ -30,10 +30,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -97,8 +99,8 @@ template <typename argT, typename resT> struct SquareFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using SquareContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -134,6 +136,25 @@ template <typename T> struct SquareOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy> struct SquareContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class square_contig_kernel;
 
@@ -144,9 +165,14 @@ sycl::event square_contig_impl(sycl::queue &exec_q,
                                char *res_p,
                                const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        SquareContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vecs =
+        SquareContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, SquareOutputType, SquareContigFunctor, square_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, SquareOutputType, SquareContigFunctor, square_contig_kernel,
+        vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct SquareContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
index 4ee3ae089b..6bfa1ffab0 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
@@ -29,11 +29,13 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/dpctl_tensor_types.hpp"
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
 
+#include "kernels/dpctl_tensor_types.hpp"
 #include "kernels/elementwise_functions/common.hpp"
 #include "kernels/elementwise_functions/common_inplace.hpp"
 
@@ -85,8 +87,8 @@ template <typename argT1, typename argT2, typename resT> struct SubtractFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using SubtractContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
@@ -170,6 +172,26 @@ template <typename T1, typename T2> struct SubtractOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct SubtractContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename argT1,
           typename argT2,
           typename resT,
@@ -188,10 +210,16 @@ sycl::event subtract_contig_impl(sycl::queue &exec_q,
                                  ssize_t res_offset,
                                  const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        SubtractContigHyperparameterSet<argTy1, argTy2>::vec_sz;
+    constexpr unsigned int n_vecs =
+        SubtractContigHyperparameterSet<argTy1, argTy2>::n_vecs;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, SubtractOutputType, SubtractContigFunctor,
-        subtract_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                                arg2_offset, res_p, res_offset, depends);
+        subtract_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct SubtractContigFactory
@@ -401,8 +429,8 @@ template <typename argT, typename resT> struct SubtractInplaceFunctor
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using SubtractInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
@@ -480,10 +508,15 @@ subtract_inplace_contig_impl(sycl::queue &exec_q,
                              ssize_t res_offset,
                              const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        SubtractContigHyperparameterSet<resTy, argTy>::vec_sz;
+    constexpr unsigned int n_vecs =
+        SubtractContigHyperparameterSet<resTy, argTy>::n_vecs;
+
     return elementwise_common::binary_inplace_contig_impl<
         argTy, resTy, SubtractInplaceContigFunctor,
-        subtract_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset,
-                                        res_p, res_offset, depends);
+        subtract_inplace_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2>
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp
index 4364d81fb7..17d423466e 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp
@@ -30,10 +30,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -130,8 +132,8 @@ template <typename argT, typename resT> struct TanFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using TanContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -158,6 +160,25 @@ template <typename T> struct TanOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy> struct TanContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class tan_contig_kernel;
 
@@ -168,9 +189,12 @@ sycl::event tan_contig_impl(sycl::queue &exec_q,
                             char *res_p,
                             const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = TanContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vecs = TanContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, TanOutputType, TanContigFunctor, tan_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, TanOutputType, TanContigFunctor, tan_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct TanContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp
index 0af4e4e628..a64313c0ca 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp
@@ -31,10 +31,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -124,8 +126,8 @@ template <typename argT, typename resT> struct TanhFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using TanhContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -152,6 +154,25 @@ template <typename T> struct TanhOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy> struct TanhContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class tanh_contig_kernel;
 
@@ -162,9 +183,12 @@ sycl::event tanh_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = TanhContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vecs = TanhContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, TanhOutputType, TanhContigFunctor, tanh_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, TanhOutputType, TanhContigFunctor, tanh_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct TanhContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
index 53db1e163c..0f429f662b 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
@@ -29,12 +29,14 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/dpctl_tensor_types.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
 
+#include "kernels/dpctl_tensor_types.hpp"
 #include "kernels/elementwise_functions/common.hpp"
 #include "kernels/elementwise_functions/common_inplace.hpp"
 
@@ -112,8 +114,8 @@ struct TrueDivideFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using TrueDivideContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -177,6 +179,26 @@ template <typename T1, typename T2> struct TrueDivideOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct TrueDivideContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename argT1,
           typename argT2,
           typename resT,
@@ -196,10 +218,16 @@ true_divide_contig_impl(sycl::queue &exec_q,
                         ssize_t res_offset,
                         const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        TrueDivideContigHyperparameterSet<argTy1, argTy2>::vec_sz;
+    constexpr unsigned int n_vecs =
+        TrueDivideContigHyperparameterSet<argTy1, argTy2>::n_vecs;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, TrueDivideOutputType, TrueDivideContigFunctor,
-        true_divide_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                                   arg2_offset, res_p, res_offset, depends);
+        true_divide_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct TrueDivideContigFactory
@@ -473,8 +501,8 @@ struct TrueDivideInplaceTypeMapFactory
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using TrueDivideInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
@@ -509,10 +537,15 @@ true_divide_inplace_contig_impl(sycl::queue &exec_q,
                                 ssize_t res_offset,
                                 const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz =
+        TrueDivideContigHyperparameterSet<resTy, argTy>::vec_sz;
+    constexpr unsigned int n_vecs =
+        TrueDivideContigHyperparameterSet<resTy, argTy>::n_vecs;
+
     return elementwise_common::binary_inplace_contig_impl<
         argTy, resTy, TrueDivideInplaceContigFunctor,
-        true_divide_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset,
-                                           res_p, res_offset, depends);
+        true_divide_inplace_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2>
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp
index 55c8493880..b014959f9f 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp
@@ -29,9 +29,11 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -75,8 +77,8 @@ template <typename argT, typename resT> struct TruncFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 4u,
+          unsigned int n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using TruncContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -109,6 +111,25 @@ template <typename T> struct TruncOutputType
     static constexpr bool is_defined = !std::is_same_v<value_type, void>;
 };
 
+namespace
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::ContigHyperparameterSetDefault;
+using vsu_ns::UnaryContigHyperparameterSetEntry;
+
+template <typename argTy> struct TruncContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of anonymous namespace
+
 template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
 class trunc_contig_kernel;
 
@@ -119,9 +140,12 @@ sycl::event trunc_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
+    constexpr unsigned int vec_sz = TruncContigHyperparameterSet<argTy>::vec_sz;
+    constexpr unsigned int n_vecs = TruncContigHyperparameterSet<argTy>::n_vecs;
+
     return elementwise_common::unary_contig_impl<
-        argTy, TruncOutputType, TruncContigFunctor, trunc_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, TruncOutputType, TruncContigFunctor, trunc_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct TruncContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp
new file mode 100644
index 0000000000..66eb437f43
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp
@@ -0,0 +1,74 @@
+//=== vec_size_util.hpp - Hyperparameters of contig kernels ------
+//*-C++-*--/===//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2024 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines utilities for selecting the vec_sz and n_vecs
+/// hyperparameters of dedicated elementwise kernels for contiguous inputs.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cstdint>
+#include <type_traits>
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace kernels
+{
+namespace vec_size_utils
+{
+
+template <typename Ty1,
+          typename ArgTy1,
+          typename Ty2,
+          typename ArgTy2,
+          unsigned int vec_sz_v,
+          unsigned int n_vecs_v>
+struct BinaryContigHyperparameterSetEntry
+    : std::conjunction<std::is_same<Ty1, ArgTy1>, std::is_same<Ty2, ArgTy2>>
+{
+    static constexpr unsigned int vec_sz = vec_sz_v;
+    static constexpr unsigned int n_vecs = n_vecs_v;
+};
+
+template <typename Ty,
+          typename ArgTy,
+          unsigned int vec_sz_v,
+          unsigned int n_vecs_v>
+struct UnaryContigHyperparameterSetEntry : std::is_same<Ty, ArgTy>
+{
+    static constexpr unsigned int vec_sz = vec_sz_v;
+    static constexpr unsigned int n_vecs = n_vecs_v;
+};
+
+template <unsigned int vec_sz_v, unsigned int n_vecs_v>
+struct ContigHyperparameterSetDefault : std::true_type
+{
+    static constexpr unsigned int vec_sz = vec_sz_v;
+    static constexpr unsigned int n_vecs = n_vecs_v;
+};
+
+} // end of namespace vec_size_utils
+} // end of namespace kernels
+} // end of namespace tensor
+} // end of namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp b/dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp
index a9f3a0c876..1cd378f83e 100644
--- a/dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp
+++ b/dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp
@@ -26,6 +26,8 @@
 #pragma once
 
 #include <complex>
+#include <type_traits>
+
 #include <sycl/sycl.hpp>
 
 namespace dpctl
@@ -161,7 +163,7 @@ class DispatchVectorBuilder
 
 /*! @brief struct to define result_type typename for Ty == ArgTy */
 template <typename Ty, typename ArgTy, typename ResTy = ArgTy>
-struct TypeMapResultEntry : std::bool_constant<std::is_same_v<Ty, ArgTy>>
+struct TypeMapResultEntry : std::is_same<Ty, ArgTy>
 {
     using result_type = ResTy;
 };
@@ -174,8 +176,7 @@ template <typename Ty1,
           typename ArgTy2,
           typename ResTy>
 struct BinaryTypeMapResultEntry
-    : std::bool_constant<std::conjunction_v<std::is_same<Ty1, ArgTy1>,
-                                            std::is_same<Ty2, ArgTy2>>>
+    : std::conjunction<std::is_same<Ty1, ArgTy1>, std::is_same<Ty2, ArgTy2>>
 {
     using result_type = ResTy;
 };
@@ -272,8 +273,8 @@ template <typename FunPtrT> struct NullPtrTable
 };
 
 template <typename Ty1, typename ArgTy, typename Ty2, typename outTy>
-struct TypePairDefinedEntry : std::bool_constant<std::is_same_v<Ty1, ArgTy> &&
-                                                 std::is_same_v<Ty2, outTy>>
+struct TypePairDefinedEntry
+    : std::conjunction<std::is_same<Ty1, ArgTy>, std::is_same<Ty2, outTy>>
 {
     static constexpr bool is_defined = true;
 };

From 19837bf5a6e9ece1d3a2233b0a85da1b303153fb Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Sun, 27 Oct 2024 21:58:20 -0500
Subject: [PATCH 03/13] Fix for test failure on AMD CPU.

vec operator should also apply isnan for sycl::half
---
 .../kernels/elementwise_functions/maximum.hpp | 24 +++++++++++++------
 .../kernels/elementwise_functions/minimum.hpp | 24 +++++++++++++------
 2 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
index dbe82bcd92..374a2bbceb 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
@@ -72,9 +72,13 @@ template <typename argT1, typename argT2, typename resT> struct MaximumFunctor
         }
         else if constexpr (std::is_floating_point_v<argT1> ||
                            std::is_same_v<argT1, sycl::half>)
-            return (std::isnan(in1) || in1 > in2) ? in1 : in2;
-        else
+        {
+            const bool choose_first = (sycl::isnan(in1) || (in1 > in2));
+            return (choose_first) ? in1 : in2;
+        }
+        else {
             return (in1 > in2) ? in1 : in2;
+        }
     }
 
     template <int vec_sz>
@@ -85,11 +89,17 @@ template <typename argT1, typename argT2, typename resT> struct MaximumFunctor
         sycl::vec<resT, vec_sz> res;
 #pragma unroll
         for (int i = 0; i < vec_sz; ++i) {
-            if constexpr (std::is_floating_point_v<argT1>)
-                res[i] =
-                    (sycl::isnan(in1[i]) || in1[i] > in2[i]) ? in1[i] : in2[i];
-            else
-                res[i] = (in1[i] > in2[i]) ? in1[i] : in2[i];
+            const auto &v1 = in1[i];
+            const auto &v2 = in2[i];
+            if constexpr (std::is_floating_point_v<argT1> ||
+                          std::is_same_v<argT1, sycl::half>)
+            {
+                const bool choose_first = (sycl::isnan(v1) || (v1 > v2));
+                res[i] = (choose_first) ? v1 : v2;
+            }
+            else {
+                res[i] = (v1 > v2) ? v1 : v2;
+            }
         }
         return res;
     }
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
index 109a995e1b..274f1b8f89 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
@@ -72,9 +72,13 @@ template <typename argT1, typename argT2, typename resT> struct MinimumFunctor
         }
         else if constexpr (std::is_floating_point_v<argT1> ||
                            std::is_same_v<argT1, sycl::half>)
-            return (std::isnan(in1) || in1 < in2) ? in1 : in2;
-        else
+        {
+            const bool choose_first = sycl::isnan(in1) || (in1 < in2);
+            return (choose_first) ? in1 : in2;
+        }
+        else {
             return (in1 < in2) ? in1 : in2;
+        }
     }
 
     template <int vec_sz>
@@ -85,11 +89,17 @@ template <typename argT1, typename argT2, typename resT> struct MinimumFunctor
         sycl::vec<resT, vec_sz> res;
 #pragma unroll
         for (int i = 0; i < vec_sz; ++i) {
-            if constexpr (std::is_floating_point_v<argT1>)
-                res[i] =
-                    (sycl::isnan(in1[i]) || in1[i] < in2[i]) ? in1[i] : in2[i];
-            else
-                res[i] = (in1[i] < in2[i]) ? in1[i] : in2[i];
+            const auto &v1 = in1[i];
+            const auto &v2 = in2[i];
+            if constexpr (std::is_floating_point_v<argT1> ||
+                          std::is_same_v<argT1, sycl::half>)
+            {
+                const bool choose_first = sycl::isnan(v1) || (v1 < v2);
+                res[i] = (choose_first) ? v1 : v2;
+            }
+            else {
+                res[i] = (v1 < v2) ? v1 : v2;
+            }
         }
         return res;
     }

From 6748a18493857465f3f34842ae607113f2f02d46 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Tue, 12 Nov 2024 13:32:42 -0800
Subject: [PATCH 04/13] More specializations for hyperparameters of
 add_contig_impl

---
 .../kernels/elementwise_functions/add.hpp     | 22 +++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
index 3b25736168..9e434bd1b9 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
@@ -209,17 +209,35 @@ using vsu_ns::ContigHyperparameterSetDefault;
 template <typename argTy1, typename argTy2> struct AddContigHyperparameterSet
 {
     using value_type = typename std::disjunction<
+        BinaryContigHyperparameterSetEntry<argTy1,
+                                           std::int32_t,
+                                           argTy2,
+                                           std::int32_t,
+                                           2u,
+                                           2u>,
+        BinaryContigHyperparameterSetEntry<argTy1,
+                                           std::uint32_t,
+                                           argTy2,
+                                           std::uint32_t,
+                                           2u,
+                                           2u>,
         BinaryContigHyperparameterSetEntry<argTy1,
                                            std::int64_t,
                                            argTy2,
                                            std::int64_t,
-                                           1u,
+                                           2u,
                                            2u>,
         BinaryContigHyperparameterSetEntry<argTy1,
                                            std::uint64_t,
                                            argTy2,
                                            std::uint64_t,
-                                           1u,
+                                           2u,
+                                           2u>,
+        BinaryContigHyperparameterSetEntry<argTy1,
+                                           float,
+                                           argTy2,
+                                           float,
+                                           2u,
                                            2u>,
         BinaryContigHyperparameterSetEntry<argTy1,
                                            double,

From 040af7aa80c1897dd6f6b74c02f3e6318570adeb Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Tue, 12 Nov 2024 15:13:54 -0600
Subject: [PATCH 05/13] cabs_impl function changed to use select over branching

---
 .../elementwise_functions/cabs_impl.hpp       | 25 ++++++++-----------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cabs_impl.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cabs_impl.hpp
index e61304bed8..fc42d2d4ba 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cabs_impl.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cabs_impl.hpp
@@ -57,21 +57,16 @@ template <typename realT> realT cabs(std::complex<realT> const &z)
     constexpr realT q_nan = std::numeric_limits<realT>::quiet_NaN();
     constexpr realT p_inf = std::numeric_limits<realT>::infinity();
 
-    if (std::isinf(x)) {
-        return p_inf;
-    }
-    else if (std::isinf(y)) {
-        return p_inf;
-    }
-    else if (std::isnan(x)) {
-        return q_nan;
-    }
-    else if (std::isnan(y)) {
-        return q_nan;
-    }
-    else {
-        return exprm_ns::abs(exprm_ns::complex<realT>(z));
-    }
+    const realT res =
+        std::isinf(x)
+            ? p_inf
+            : ((std::isinf(y)
+                    ? p_inf
+                    : ((std::isnan(x)
+                            ? q_nan
+                            : exprm_ns::abs(exprm_ns::complex<realT>(z))))));
+
+    return res;
 }
 
 } // namespace detail

From 840ac6932fe1c8efb8517552695e4d4326a0b7a9 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Tue, 12 Nov 2024 15:53:33 -0600
Subject: [PATCH 06/13] Add changelog entry for tuning of elementwise functions

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5523c684ca..ec4b346c74 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -35,6 +35,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 * Fix additional warnings when generating docs [gh-1861](https://github.com/IntelPython/dpctl/pull/1861)
 * Add missing include of SYCL header to "math_utils.hpp" [gh-1899](https://github.com/IntelPython/dpctl/pull/1899)
 * Add support of CV-qualifiers in `is_complex<T>` helper [gh-1900](https://github.com/IntelPython/dpctl/pull/1900)
+* Tuning work for elementwise functions with modest performance gains (under 10%) [gh-1889](https://github.com/IntelPython/dpctl/pull/1889)
 
 ## [0.18.1] - Oct. 11, 2024
 

From 281cb7b9fc5c8a84213f07adc8903c6302fac0de Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Tue, 12 Nov 2024 14:46:14 -0800
Subject: [PATCH 07/13] Corrected text in the license/description header

---
 .../kernels/elementwise_functions/vec_size_util.hpp        | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp
index 66eb437f43..b827db28b9 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp
@@ -1,5 +1,4 @@
-//=== tanh.hpp -   Unary function TANH                     ------
-//*-C++-*--/===//
+//=== vec_size_util.hpp -                                 ------*-C++-*--/===//
 //
 //                      Data Parallel Control (dpctl)
 //
@@ -20,8 +19,8 @@
 //===---------------------------------------------------------------------===//
 ///
 /// \file
-/// This file defines utilities for dispatching elementwise dedicated kernels
-//  for contiguous inputs.
+/// This file defines utilities for selection of hyperparameters for kernels
+/// implementing unary and binary elementwise functions for contiguous inputs
 //===---------------------------------------------------------------------===//
 
 #pragma once

From 041f528095c25e1bdc2733f04a734e29618d567e Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Mon, 18 Nov 2024 10:06:00 -0600
Subject: [PATCH 08/13] Introduce SYCL utilities sub_group_load/sub_group_store

This resolves compiler warnings about the deprecated sub_group::load and
sub_group::store methods (emitted when building with the nightly SYCLOS
DPC++ bundle).

Additionally, replace unsigned int/int/std::uint32_t template parameters
vec_sz and n_vecs with std::uint8_t (std::uint16_t for the tile_size and
n_lines parameters in copy_as_contiguous.hpp).
---
 .../include/kernels/copy_and_cast.hpp         |  67 ++---
 .../include/kernels/copy_as_contiguous.hpp    | 112 +++++----
 .../kernels/elementwise_functions/abs.hpp     |  10 +-
 .../kernels/elementwise_functions/acos.hpp    |  10 +-
 .../kernels/elementwise_functions/acosh.hpp   |  10 +-
 .../kernels/elementwise_functions/add.hpp     |  16 +-
 .../kernels/elementwise_functions/angle.hpp   |  10 +-
 .../kernels/elementwise_functions/asin.hpp    |  10 +-
 .../kernels/elementwise_functions/asinh.hpp   |  10 +-
 .../kernels/elementwise_functions/atan.hpp    |  10 +-
 .../kernels/elementwise_functions/atan2.hpp   |  12 +-
 .../kernels/elementwise_functions/atanh.hpp   |  10 +-
 .../elementwise_functions/bitwise_and.hpp     |  24 +-
 .../elementwise_functions/bitwise_invert.hpp  |  10 +-
 .../bitwise_left_shift.hpp                    |  24 +-
 .../elementwise_functions/bitwise_or.hpp      |  24 +-
 .../bitwise_right_shift.hpp                   |  24 +-
 .../elementwise_functions/bitwise_xor.hpp     |  24 +-
 .../kernels/elementwise_functions/cbrt.hpp    |  10 +-
 .../kernels/elementwise_functions/ceil.hpp    |  10 +-
 .../kernels/elementwise_functions/common.hpp  | 233 +++++++++---------
 .../elementwise_functions/common_inplace.hpp  | 107 ++++----
 .../kernels/elementwise_functions/conj.hpp    |  10 +-
 .../elementwise_functions/copysign.hpp        |  12 +-
 .../kernels/elementwise_functions/cos.hpp     |  10 +-
 .../kernels/elementwise_functions/cosh.hpp    |  10 +-
 .../kernels/elementwise_functions/equal.hpp   |  12 +-
 .../kernels/elementwise_functions/exp.hpp     |  10 +-
 .../kernels/elementwise_functions/exp2.hpp    |  10 +-
 .../kernels/elementwise_functions/expm1.hpp   |  10 +-
 .../kernels/elementwise_functions/floor.hpp   |  10 +-
 .../elementwise_functions/floor_divide.hpp    |  24 +-
 .../kernels/elementwise_functions/greater.hpp |  12 +-
 .../elementwise_functions/greater_equal.hpp   |  12 +-
 .../kernels/elementwise_functions/hypot.hpp   |  12 +-
 .../kernels/elementwise_functions/imag.hpp    |  10 +-
 .../elementwise_functions/isfinite.hpp        |  10 +-
 .../kernels/elementwise_functions/isinf.hpp   |  10 +-
 .../kernels/elementwise_functions/isnan.hpp   |  10 +-
 .../kernels/elementwise_functions/less.hpp    |  12 +-
 .../elementwise_functions/less_equal.hpp      |  12 +-
 .../kernels/elementwise_functions/log.hpp     |  10 +-
 .../kernels/elementwise_functions/log10.hpp   |  10 +-
 .../kernels/elementwise_functions/log1p.hpp   |  10 +-
 .../kernels/elementwise_functions/log2.hpp    |  10 +-
 .../elementwise_functions/logaddexp.hpp       |  12 +-
 .../elementwise_functions/logical_and.hpp     |  12 +-
 .../elementwise_functions/logical_not.hpp     |  10 +-
 .../elementwise_functions/logical_or.hpp      |  12 +-
 .../elementwise_functions/logical_xor.hpp     |  12 +-
 .../kernels/elementwise_functions/maximum.hpp |  12 +-
 .../kernels/elementwise_functions/minimum.hpp |  12 +-
 .../elementwise_functions/multiply.hpp        |  24 +-
 .../elementwise_functions/negative.hpp        |  10 +-
 .../elementwise_functions/nextafter.hpp       |  12 +-
 .../elementwise_functions/not_equal.hpp       |  12 +-
 .../elementwise_functions/positive.hpp        |  10 +-
 .../kernels/elementwise_functions/pow.hpp     |  24 +-
 .../kernels/elementwise_functions/proj.hpp    |  10 +-
 .../kernels/elementwise_functions/real.hpp    |  10 +-
 .../elementwise_functions/reciprocal.hpp      |  10 +-
 .../elementwise_functions/remainder.hpp       |  24 +-
 .../kernels/elementwise_functions/round.hpp   |  10 +-
 .../kernels/elementwise_functions/rsqrt.hpp   |  10 +-
 .../kernels/elementwise_functions/sign.hpp    |  10 +-
 .../kernels/elementwise_functions/signbit.hpp |  10 +-
 .../kernels/elementwise_functions/sin.hpp     |  10 +-
 .../kernels/elementwise_functions/sinh.hpp    |  10 +-
 .../kernels/elementwise_functions/sqrt.hpp    |  10 +-
 .../kernels/elementwise_functions/square.hpp  |  10 +-
 .../elementwise_functions/subtract.hpp        |  24 +-
 .../kernels/elementwise_functions/tan.hpp     |  10 +-
 .../kernels/elementwise_functions/tanh.hpp    |  10 +-
 .../elementwise_functions/true_divide.hpp     |  24 +-
 .../kernels/elementwise_functions/trunc.hpp   |  10 +-
 .../elementwise_functions/vec_size_util.hpp   |  22 +-
 .../libtensor/include/utils/sycl_utils.hpp    |  66 +++++
 77 files changed, 784 insertions(+), 725 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp
index f48a5a287e..a4e7fceca1 100644
--- a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp
@@ -31,6 +31,7 @@
 #include "dpctl_tensor_types.hpp"
 #include "kernels/alignment.hpp"
 #include "utils/offset_utils.hpp"
+#include "utils/sycl_utils.hpp"
 #include "utils/type_utils.hpp"
 
 namespace dpctl
@@ -49,13 +50,16 @@ using dpctl::tensor::kernels::alignment_utils::
 using dpctl::tensor::kernels::alignment_utils::is_aligned;
 using dpctl::tensor::kernels::alignment_utils::required_alignment;
 
+using dpctl::tensor::sycl_utils::sub_group_load;
+using dpctl::tensor::sycl_utils::sub_group_store;
+
 template <typename srcT, typename dstT, typename IndexerT>
 class copy_cast_generic_kernel;
 
 template <typename srcT,
           typename dstT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class copy_cast_contig_kernel;
 
 template <typename srcT, typename dstT, typename IndexerT>
@@ -207,8 +211,8 @@ template <typename fnT, typename D, typename S> struct CopyAndCastGenericFactory
 template <typename srcT,
           typename dstT,
           typename CastFnT,
-          int vec_sz = 4,
-          int n_vecs = 2,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 class ContigCopyFunctor
 {
@@ -227,58 +231,55 @@ class ContigCopyFunctor
     {
         CastFnT fn{};
 
+        constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz;
+
         using dpctl::tensor::type_utils::is_complex;
         if constexpr (!enable_sg_loadstore || is_complex<srcT>::value ||
                       is_complex<dstT>::value)
         {
-            std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0];
-            size_t base = ndit.get_global_linear_id();
-
-            base = (base / sgSize) * sgSize * n_vecs * vec_sz + (base % sgSize);
-            for (size_t offset = base;
-                 offset < std::min(nelems, base + sgSize * (n_vecs * vec_sz));
-                 offset += sgSize)
-            {
+            std::uint16_t sgSize = ndit.get_sub_group().get_local_range()[0];
+            const size_t gid = ndit.get_global_linear_id();
+
+            // start = (gid / sgSize) * elems_per_sg + (gid % sgSize)
+            const std::uint16_t elems_per_sg = sgSize * elems_per_wi;
+            const size_t start = (gid / sgSize) * (elems_per_sg - sgSize) + gid;
+            const size_t end = std::min(nelems, start + elems_per_sg);
+            for (size_t offset = start; offset < end; offset += sgSize) {
                 dst_p[offset] = fn(src_p[offset]);
             }
         }
         else {
             auto sg = ndit.get_sub_group();
-            std::uint8_t sgSize = sg.get_local_range()[0];
-            std::uint8_t max_sgSize = sg.get_max_local_range()[0];
-            size_t base = n_vecs * vec_sz *
-                          (ndit.get_group(0) * ndit.get_local_range(0) +
-                           sg.get_group_id()[0] * max_sgSize);
-
-            if (base + n_vecs * vec_sz * sgSize < nelems &&
-                sgSize == max_sgSize)
-            {
-                sycl::vec<srcT, vec_sz> src_vec;
+            const std::uint16_t sgSize = sg.get_max_local_range()[0];
+            const size_t base =
+                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
+                                sg.get_group_id()[0] * sgSize);
+
+            if (base + elems_per_wi * sgSize < nelems) {
                 sycl::vec<dstT, vec_sz> dst_vec;
 
 #pragma unroll
                 for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
+                    const size_t offset = base + it * sgSize;
                     auto src_multi_ptr = sycl::address_space_cast<
                         sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(
-                        &src_p[base + it * sgSize]);
+                        sycl::access::decorated::yes>(&src_p[offset]);
                     auto dst_multi_ptr = sycl::address_space_cast<
                         sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(
-                        &dst_p[base + it * sgSize]);
+                        sycl::access::decorated::yes>(&dst_p[offset]);
 
-                    src_vec = sg.load<vec_sz>(src_multi_ptr);
+                    const sycl::vec<srcT, vec_sz> src_vec =
+                        sub_group_load<vec_sz>(sg, src_multi_ptr);
 #pragma unroll
                     for (std::uint8_t k = 0; k < vec_sz; k++) {
                         dst_vec[k] = fn(src_vec[k]);
                     }
-                    sg.store<vec_sz>(dst_multi_ptr, dst_vec);
+                    sub_group_store<vec_sz>(sg, dst_vec, dst_multi_ptr);
                 }
             }
             else {
-                for (size_t k = base + sg.get_local_id()[0]; k < nelems;
-                     k += sgSize)
-                {
+                const size_t start = base + sg.get_local_id()[0];
+                for (size_t k = start; k < nelems; k += sgSize) {
                     dst_p[k] = fn(src_p[k]);
                 }
             }
@@ -332,8 +333,8 @@ sycl::event copy_and_cast_contig_impl(sycl::queue &q,
         dstTy *dst_tp = reinterpret_cast<dstTy *>(dst_cp);
 
         size_t lws = 64;
-        constexpr unsigned int vec_sz = 4;
-        constexpr unsigned int n_vecs = 2;
+        constexpr std::uint32_t vec_sz = 4;
+        constexpr std::uint32_t n_vecs = 2;
         const size_t n_groups =
             ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
         const auto gws_range = sycl::range<1>(n_groups * lws);
diff --git a/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp b/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp
index c71e487572..7b95016db3 100644
--- a/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp
@@ -44,8 +44,8 @@ namespace copy_as_contig
 
 template <typename T,
           typename IndexerT,
-          std::uint32_t vec_sz = 4u,
-          std::uint32_t n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 class CopyAsCContigFunctor
 {
@@ -68,10 +68,8 @@ class CopyAsCContigFunctor
     {
         static_assert(vec_sz > 0);
         static_assert(n_vecs > 0);
-        static_assert(vec_sz * n_vecs < (std::uint32_t(1) << 8));
 
-        constexpr std::uint8_t elems_per_wi =
-            static_cast<std::uint8_t>(vec_sz * n_vecs);
+        constexpr std::uint8_t elems_per_wi = vec_sz * n_vecs;
 
         using dpctl::tensor::type_utils::is_complex;
         if constexpr (!enable_sg_loadstore || is_complex<T>::value) {
@@ -79,14 +77,14 @@ class CopyAsCContigFunctor
                 ndit.get_sub_group().get_local_range()[0];
             const std::size_t gid = ndit.get_global_linear_id();
 
-            // base = (gid / sgSize) * sgSize * elems_per_wi + (gid % sgSize)
+            // start = (gid / sgSize) * sgSize * elems_per_wi + (gid % sgSize)
             // gid % sgSize == gid - (gid / sgSize) * sgSize
-            const std::size_t elems_per_sg = sgSize * (elems_per_wi - 1);
-            const std::size_t base = (gid / sgSize) * elems_per_sg + gid;
-            const std::size_t offset_max =
-                std::min(nelems, base + sgSize * elems_per_wi);
+            const std::size_t elems_per_sg = sgSize * elems_per_wi;
+            const std::size_t start =
+                (gid / sgSize) * (elems_per_sg - sgSize) + gid;
+            const std::size_t end = std::min(nelems, start + elems_per_sg);
 
-            for (size_t offset = base; offset < offset_max; offset += sgSize) {
+            for (size_t offset = start; offset < end; offset += sgSize) {
                 auto src_offset = src_indexer(offset);
                 dst_p[offset] = src_p[src_offset];
             }
@@ -132,8 +130,8 @@ class CopyAsCContigFunctor
 
 template <typename T,
           typename IndexerT,
-          std::uint32_t vec_sz,
-          std::uint32_t n_vecs,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs,
           bool enable_sg_load,
           typename KernelName>
 sycl::event submit_c_contiguous_copy(sycl::queue &exec_q,
@@ -145,7 +143,6 @@ sycl::event submit_c_contiguous_copy(sycl::queue &exec_q,
 {
     static_assert(vec_sz > 0);
     static_assert(n_vecs > 0);
-    static_assert(vec_sz * n_vecs < (std::uint32_t(1) << 8));
 
     constexpr std::size_t preferred_lws = 256;
 
@@ -187,8 +184,8 @@ sycl::event submit_c_contiguous_copy(sycl::queue &exec_q,
 
 template <typename T,
           typename IndexT,
-          std::uint32_t vec_sz,
-          std::uint32_t n_vecs,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs,
           bool enable_sgload>
 class as_contig_krn;
 
@@ -210,8 +207,8 @@ as_c_contiguous_array_generic_impl(sycl::queue &exec_q,
     using IndexerT = dpctl::tensor::offset_utils::StridedIndexer;
     const IndexerT src_indexer(nd, ssize_t(0), shape_and_strides);
 
-    constexpr std::uint32_t vec_sz = 4u;
-    constexpr std::uint32_t n_vecs = 2u;
+    constexpr std::uint8_t vec_sz = 4u;
+    constexpr std::uint8_t n_vecs = 2u;
 
     using dpctl::tensor::kernels::alignment_utils::
         disabled_sg_loadstore_wrapper_krn;
@@ -256,8 +253,8 @@ template <typename fnT, typename T> struct AsCContigFactory
 
 template <typename T,
           typename IndexerT,
-          std::uint32_t tile_size,
-          std::uint32_t n_lines>
+          std::uint16_t tile_size,
+          std::uint16_t n_lines>
 class as_contig_batch_of_square_matrices_krn;
 
 namespace detail
@@ -283,14 +280,14 @@ sycl::event as_c_contiguous_batch_of_square_matrices_impl(
     const T *src_tp = reinterpret_cast<const T *>(src_p);
     T *dst_tp = reinterpret_cast<T *>(dst_p);
 
-    constexpr std::uint32_t private_tile_size = 4;
-    constexpr std::uint32_t n_lines = 2;
-    constexpr std::uint32_t block_size =
+    constexpr std::uint16_t private_tile_size = 4;
+    constexpr std::uint16_t n_lines = 2;
+    constexpr std::uint16_t block_size =
         n_lines * private_tile_size * private_tile_size;
 
-    constexpr std::uint32_t lws0 = block_size;
-    constexpr std::uint32_t lws1 = n_lines;
-    constexpr std::uint32_t nelems_per_wi = (block_size / lws1);
+    constexpr std::uint16_t lws0 = block_size;
+    constexpr std::uint16_t lws1 = n_lines;
+    constexpr std::uint16_t nelems_per_wi = (block_size / lws1);
 
     static_assert(nelems_per_wi * lws1 == block_size);
     static_assert(nelems_per_wi == private_tile_size * private_tile_size);
@@ -377,40 +374,41 @@ sycl::event as_c_contiguous_batch_of_square_matrices_impl(
             std::array<T, nelems_per_wi> private_block_01 = {T(0)};
             std::array<T, nelems_per_wi> private_block_10 = {T(0)};
 
-            // 0 <= lid_lin < lws0 * lws1 == (block_size * block_size /
-            // nelems_per_wi) == (block_size/private_tile_size)**2
-            constexpr std::uint32_t n_private_tiles_per_axis =
+            // 0 <= lid_lin < lws0 * lws1 ==
+            //       (block_size * block_size / nelems_per_wi) ==
+            //       (block_size/private_tile_size)**2
+            constexpr std::uint16_t n_private_tiles_per_axis =
                 block_size / private_tile_size;
-            const std::uint32_t local_tile_id0 =
+            const std::uint16_t local_tile_id0 =
                 lid_lin / n_private_tiles_per_axis;
-            const std::uint32_t local_tile_id1 =
+            const std::uint16_t local_tile_id1 =
                 lid_lin - local_tile_id0 * n_private_tiles_per_axis;
 
             if (local_tile_id0 <= local_tile_id1) {
-                for (std::uint32_t pr_i0 = 0; pr_i0 < private_tile_size;
+                for (std::uint16_t pr_i0 = 0; pr_i0 < private_tile_size;
                      ++pr_i0)
                 {
-                    for (std::uint32_t pr_i1 = 0; pr_i1 < private_tile_size;
+                    for (std::uint16_t pr_i1 = 0; pr_i1 < private_tile_size;
                          ++pr_i1)
                     {
-                        const std::uint32_t t0_offset =
+                        const std::uint16_t t0_offset =
                             local_tile_id0 * private_tile_size;
-                        const std::uint32_t t1_offset =
+                        const std::uint16_t t1_offset =
                             local_tile_id1 * private_tile_size;
 
-                        const std::uint32_t pr_offset =
+                        const std::uint16_t pr_offset =
                             pr_i1 * private_tile_size + pr_i0;
-                        const std::uint32_t rel_offset =
+                        const std::uint16_t rel_offset =
                             pr_i0 + pr_i1 * block_size;
 
                         // read (local_tile_id0, local_tile_id1)
-                        const std::uint32_t local_01_offset =
+                        const std::uint16_t local_01_offset =
                             (t0_offset + t1_offset * block_size) + rel_offset;
                         private_block_01[pr_offset] =
                             local_block[local_01_offset];
 
                         // read (local_tile_id1, local_tile_id0)
-                        const std::uint32_t local_10_offset =
+                        const std::uint16_t local_10_offset =
                             (t1_offset + t0_offset * block_size) + rel_offset;
                         private_block_10[pr_offset] =
                             local_block[local_10_offset];
@@ -422,20 +420,20 @@ sycl::event as_c_contiguous_batch_of_square_matrices_impl(
                                 sycl::memory_scope::work_group);
 
             if (local_tile_id0 <= local_tile_id1) {
-                for (std::uint32_t pr_i0 = 0; pr_i0 < private_tile_size;
+                for (std::uint16_t pr_i0 = 0; pr_i0 < private_tile_size;
                      ++pr_i0)
                 {
-                    for (std::uint32_t pr_i1 = 0; pr_i1 < private_tile_size;
+                    for (std::uint16_t pr_i1 = 0; pr_i1 < private_tile_size;
                          ++pr_i1)
                     {
-                        const std::uint32_t t0_offset =
+                        const std::uint16_t t0_offset =
                             local_tile_id0 * private_tile_size;
-                        const std::uint32_t t1_offset =
+                        const std::uint16_t t1_offset =
                             local_tile_id1 * private_tile_size;
-                        const std::uint32_t pr_offset =
+                        const std::uint16_t pr_offset =
                             pr_i0 * private_tile_size + pr_i1;
 
-                        const std::uint32_t rel_offset =
+                        const std::uint16_t rel_offset =
                             pr_i0 + pr_i1 * block_size;
 
                         // write back permuted private blocks
@@ -444,7 +442,7 @@ sycl::event as_c_contiguous_batch_of_square_matrices_impl(
                         local_block[local_01_offset] =
                             private_block_10[pr_offset];
 
-                        const std::uint32_t local_10_offset =
+                        const std::uint16_t local_10_offset =
                             (t1_offset + t0_offset * block_size) + rel_offset;
                         local_block[local_10_offset] =
                             private_block_01[pr_offset];
@@ -461,8 +459,8 @@ sycl::event as_c_contiguous_batch_of_square_matrices_impl(
             const std::size_t dst_tile_start1 = src_tile_start1;
 
             if (local_dim0 == block_size && local_dim1 == block_size) {
-                const std::uint32_t dst_i0 = src_i1;
-                const std::uint32_t dst_i1 = src_i0;
+                const std::uint16_t dst_i0 = src_i1;
+                const std::uint16_t dst_i1 = src_i0;
 
                 const std::size_t dst_gid0 = (dst_tile_start0 + dst_i0);
                 const std::size_t dst_gid1 = (dst_tile_start1 + dst_i1);
@@ -471,11 +469,11 @@ sycl::event as_c_contiguous_batch_of_square_matrices_impl(
                     dst_batch_offset + dst_gid0 * dst_stride + dst_gid1 * 1;
                 const std::size_t pr_step_dst = lws1 * dst_stride;
 
-                const std::uint32_t _local_offset0 =
+                const std::uint16_t _local_offset0 =
                     dst_i0 * block_size + dst_i1;
-                const std::uint32_t _pr_step_local = lws1 * block_size;
+                const std::uint16_t _pr_step_local = lws1 * block_size;
 
-                for (std::uint32_t pr_id = 0; pr_id < nelems_per_wi; ++pr_id) {
+                for (std::uint16_t pr_id = 0; pr_id < nelems_per_wi; ++pr_id) {
                     if ((dst_gid1 < n) && ((dst_gid0 + pr_id * lws1) < n)) {
                         dst_tp[dst_offset0 + pr_step_dst * pr_id] =
                             local_block[_local_offset0 +
@@ -485,24 +483,24 @@ sycl::event as_c_contiguous_batch_of_square_matrices_impl(
             }
             else {
                 // map local_linear_id into (local_dim0, local_dim1)
-                for (std::uint32_t el_id = lid_lin;
+                for (std::uint16_t el_id = lid_lin;
                      el_id < local_dim0 * local_dim1; el_id += lws0 * lws1)
                 {
 
                     // 0 <= local_i0 < local_dim0
-                    const std::uint32_t loc_i0 = el_id / local_dim1;
+                    const std::uint16_t loc_i0 = el_id / local_dim1;
                     // 0 <= local_i1 < local_dim1
-                    const std::uint32_t loc_i1 = el_id - loc_i0 * local_dim1;
+                    const std::uint16_t loc_i1 = el_id - loc_i0 * local_dim1;
 
-                    const std::uint32_t dst_i0 = loc_i0;
-                    const std::uint32_t dst_i1 = loc_i1;
+                    const std::uint16_t dst_i0 = loc_i0;
+                    const std::uint16_t dst_i1 = loc_i1;
 
                     const std::size_t dst_gid0 = (dst_tile_start0 + dst_i0);
                     const std::size_t dst_gid1 = (dst_tile_start1 + dst_i1);
 
                     const std::size_t dst_offset =
                         dst_batch_offset + dst_gid0 * dst_stride + dst_gid1 * 1;
-                    const std::uint32_t local_offset =
+                    const std::uint16_t local_offset =
                         loc_i0 * block_size + loc_i1;
 
                     if ((dst_gid1 < n) && (dst_gid0 < n)) {
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
index c3c916d0c0..0dd315fc9d 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
@@ -91,8 +91,8 @@ template <typename argT, typename resT> struct AbsFunctor
 
 template <typename argT,
           typename resT = argT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using AbsContigFunctor =
     elementwise_common::UnaryContigFunctor<argT,
@@ -142,7 +142,7 @@ template <typename argTy> struct AbsContigHyperparameterSet
 
 } // namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class abs_contig_kernel;
 
 template <typename argTy>
@@ -152,8 +152,8 @@ sycl::event abs_contig_impl(sycl::queue &exec_q,
                             char *res_p,
                             const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = AbsContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vec = AbsContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = AbsContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vec = AbsContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, AbsOutputType, AbsContigFunctor, abs_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
index 6cc686ff46..47c69d5190 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
@@ -130,8 +130,8 @@ template <typename argT, typename resT> struct AcosFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using AcosContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -176,7 +176,7 @@ template <typename argTy> struct AcosContigHyperparameterSet
 
 } // namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class acos_contig_kernel;
 
 template <typename argTy>
@@ -186,8 +186,8 @@ sycl::event acos_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = AcosContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vec = AcosContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = AcosContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vec = AcosContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, AcosOutputType, AcosContigFunctor, acos_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
index 4d5d5118d7..f199be5a7e 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
@@ -157,8 +157,8 @@ template <typename argT, typename resT> struct AcoshFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using AcoshContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -204,7 +204,7 @@ template <typename argTy> struct AcoshContigHyperparameterSet
 
 } // namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class acosh_contig_kernel;
 
 template <typename argTy>
@@ -214,8 +214,8 @@ sycl::event acosh_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = AcoshContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vec = AcoshContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = AcoshContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vec = AcoshContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, AcoshOutputType, AcoshContigFunctor, acosh_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
index 9e434bd1b9..69f63b53c0 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
@@ -112,8 +112,8 @@ template <typename argT1, typename argT2, typename resT> struct AddFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using AddContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
@@ -256,8 +256,8 @@ template <typename argTy1, typename argTy2> struct AddContigHyperparameterSet
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class add_contig_kernel;
 
 template <typename argTy1, typename argTy2>
@@ -470,8 +470,8 @@ template <typename argT, typename resT> struct AddInplaceFunctor
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using AddInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor<
     argT,
@@ -491,8 +491,8 @@ using AddInplaceStridedFunctor =
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class add_inplace_contig_kernel;
 
 /* @brief Types supported by in-place add */
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp
index dfe9fb5063..670b9c10f8 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp
@@ -76,8 +76,8 @@ template <typename argT, typename resT> struct AngleFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using AngleContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -120,7 +120,7 @@ template <typename argTy> struct AngleContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class angle_contig_kernel;
 
 template <typename argTy>
@@ -130,8 +130,8 @@ sycl::event angle_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = AngleContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vec = AngleContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = AngleContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vec = AngleContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, AngleOutputType, AngleContigFunctor, angle_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
index d1dd66b577..db7ec5723e 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
@@ -150,8 +150,8 @@ template <typename argT, typename resT> struct AsinFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using AsinContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -197,7 +197,7 @@ template <typename argTy> struct AsinContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class asin_contig_kernel;
 
 template <typename argTy>
@@ -207,8 +207,8 @@ sycl::event asin_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = AsinContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vec = AsinContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = AsinContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vec = AsinContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, AsinOutputType, AsinContigFunctor, asin_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
index fb38911dde..9b58d7ad19 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
@@ -133,8 +133,8 @@ template <typename argT, typename resT> struct AsinhFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using AsinhContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -180,7 +180,7 @@ template <typename argTy> struct AsinhContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class asinh_contig_kernel;
 
 template <typename argTy>
@@ -190,8 +190,8 @@ sycl::event asinh_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = AsinhContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vec = AsinhContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = AsinhContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vec = AsinhContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, AsinhOutputType, AsinhContigFunctor, asinh_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp
index aa260ce530..3f96f95526 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp
@@ -143,8 +143,8 @@ template <typename argT, typename resT> struct AtanFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using AtanContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -190,7 +190,7 @@ template <typename argTy> struct AtanContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class atan_contig_kernel;
 
 template <typename argTy>
@@ -200,8 +200,8 @@ sycl::event atan_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = AtanContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vec = AtanContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = AtanContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vec = AtanContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, AtanOutputType, AtanContigFunctor, atan_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp
index 76a07f07da..37bd66fb54 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp
@@ -70,8 +70,8 @@ template <typename argT1, typename argT2, typename resT> struct Atan2Functor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using Atan2ContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
@@ -127,8 +127,8 @@ template <typename argTy1, typename argTy2> struct Atan2ContigHyperparameterSet
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class atan2_contig_kernel;
 
 template <typename argTy1, typename argTy2>
@@ -142,9 +142,9 @@ sycl::event atan2_contig_impl(sycl::queue &exec_q,
                               ssize_t res_offset,
                               const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         Atan2ContigHyperparameterSet<argTy1, argTy2>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         Atan2ContigHyperparameterSet<argTy1, argTy2>::n_vecs;
 
     return elementwise_common::binary_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
index 563644b613..25c15ef614 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
@@ -134,8 +134,8 @@ template <typename argT, typename resT> struct AtanhFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using AtanhContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -181,7 +181,7 @@ template <typename argTy> struct AtanhContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class atanh_contig_kernel;
 
 template <typename argTy>
@@ -191,8 +191,8 @@ sycl::event atanh_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = AtanhContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vec = AtanhContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = AtanhContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vec = AtanhContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, AtanhOutputType, AtanhContigFunctor, atanh_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
index 0d8f9ad125..45a03c913d 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
@@ -93,8 +93,8 @@ struct BitwiseAndFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using BitwiseAndContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -184,8 +184,8 @@ struct BitwiseAndContigHyperparameterSet
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class bitwise_and_contig_kernel;
 
 template <typename argTy1, typename argTy2>
@@ -200,9 +200,9 @@ bitwise_and_contig_impl(sycl::queue &exec_q,
                         ssize_t res_offset,
                         const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         BitwiseAndContigHyperparameterSet<argTy1, argTy2>::vec_sz;
-    constexpr unsigned int n_vec =
+    constexpr std::uint8_t n_vec =
         BitwiseAndContigHyperparameterSet<argTy1, argTy2>::n_vecs;
 
     return elementwise_common::binary_contig_impl<
@@ -317,8 +317,8 @@ template <typename argT, typename resT> struct BitwiseAndInplaceFunctor
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using BitwiseAndInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
@@ -339,8 +339,8 @@ using BitwiseAndInplaceStridedFunctor =
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class bitwise_and_inplace_contig_kernel;
 
 /* @brief Types supported by in-place bitwise AND */
@@ -388,9 +388,9 @@ bitwise_and_inplace_contig_impl(sycl::queue &exec_q,
                                 ssize_t res_offset,
                                 const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         BitwiseAndContigHyperparameterSet<resTy, argTy>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         BitwiseAndContigHyperparameterSet<resTy, argTy>::n_vecs;
 
     return elementwise_common::binary_inplace_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp
index 86dadc9715..582da57c29 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp
@@ -82,8 +82,8 @@ template <typename argT, typename resT> struct BitwiseInvertFunctor
 
 template <typename argT,
           typename resT = argT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using BitwiseInvertContigFunctor =
     elementwise_common::UnaryContigFunctor<argT,
@@ -136,7 +136,7 @@ template <typename argTy> struct BitwiseInvertContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class bitwise_invert_contig_kernel;
 
 template <typename argTy>
@@ -147,9 +147,9 @@ bitwise_invert_contig_impl(sycl::queue &exec_q,
                            char *res_p,
                            const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         BitwiseInvertContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vec =
+    constexpr std::uint8_t n_vec =
         BitwiseInvertContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
index 67fe141484..8cb0dcc9d0 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
@@ -102,8 +102,8 @@ struct BitwiseLeftShiftFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using BitwiseLeftShiftContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -194,8 +194,8 @@ struct BitwiseLeftShiftContigHyperparameterSet
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class bitwise_left_shift_contig_kernel;
 
 template <typename argTy1, typename argTy2>
@@ -210,9 +210,9 @@ bitwise_left_shift_contig_impl(sycl::queue &exec_q,
                                ssize_t res_offset,
                                const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         BitwiseLeftShiftContigHyperparameterSet<argTy1, argTy2>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         BitwiseLeftShiftContigHyperparameterSet<argTy1, argTy2>::n_vecs;
 
     return elementwise_common::binary_contig_impl<
@@ -331,8 +331,8 @@ template <typename argT, typename resT> struct BitwiseLeftShiftInplaceFunctor
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using BitwiseLeftShiftInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
@@ -353,8 +353,8 @@ using BitwiseLeftShiftInplaceStridedFunctor =
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class bitwise_left_shift_inplace_contig_kernel;
 
 /* @brief Types supported by in-place bitwise left shift */
@@ -402,9 +402,9 @@ sycl::event bitwise_left_shift_inplace_contig_impl(
     ssize_t res_offset,
     const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         BitwiseLeftShiftContigHyperparameterSet<resTy, argTy>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         BitwiseLeftShiftContigHyperparameterSet<resTy, argTy>::n_vecs;
 
     return elementwise_common::binary_inplace_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp
index 03e2064dd2..e1de5be474 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp
@@ -92,8 +92,8 @@ template <typename argT1, typename argT2, typename resT> struct BitwiseOrFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using BitwiseOrContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -184,8 +184,8 @@ struct BitwiseOrContigHyperparameterSet
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class bitwise_or_contig_kernel;
 
 template <typename argTy1, typename argTy2>
@@ -199,9 +199,9 @@ sycl::event bitwise_or_contig_impl(sycl::queue &exec_q,
                                    ssize_t res_offset,
                                    const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         BitwiseOrContigHyperparameterSet<argTy1, argTy2>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         BitwiseOrContigHyperparameterSet<argTy1, argTy2>::n_vecs;
 
     return elementwise_common::binary_contig_impl<
@@ -314,8 +314,8 @@ template <typename argT, typename resT> struct BitwiseOrInplaceFunctor
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using BitwiseOrInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
@@ -336,8 +336,8 @@ using BitwiseOrInplaceStridedFunctor =
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class bitwise_or_inplace_contig_kernel;
 
 /* @brief Types supported by in-place bitwise OR */
@@ -383,9 +383,9 @@ bitwise_or_inplace_contig_impl(sycl::queue &exec_q,
                                ssize_t res_offset,
                                const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         BitwiseOrContigHyperparameterSet<resTy, argTy>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         BitwiseOrContigHyperparameterSet<resTy, argTy>::n_vecs;
 
     return elementwise_common::binary_inplace_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
index 497505e4aa..35d3352c41 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
@@ -103,8 +103,8 @@ struct BitwiseRightShiftFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using BitwiseRightShiftContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -196,8 +196,8 @@ struct BitwiseRightShiftContigHyperparameterSet
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class bitwise_right_shift_contig_kernel;
 
 template <typename argTy1, typename argTy2>
@@ -212,9 +212,9 @@ bitwise_right_shift_contig_impl(sycl::queue &exec_q,
                                 ssize_t res_offset,
                                 const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         BitwiseRightShiftContigHyperparameterSet<argTy1, argTy2>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         BitwiseRightShiftContigHyperparameterSet<argTy1, argTy2>::n_vecs;
 
     return elementwise_common::binary_contig_impl<
@@ -335,8 +335,8 @@ template <typename argT, typename resT> struct BitwiseRightShiftInplaceFunctor
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using BitwiseRightShiftInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
@@ -357,8 +357,8 @@ using BitwiseRightShiftInplaceStridedFunctor =
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class bitwise_right_shift_inplace_contig_kernel;
 
 /* @brief Types supported by in-place bitwise right shift */
@@ -407,9 +407,9 @@ sycl::event bitwise_right_shift_inplace_contig_impl(
     const std::vector<sycl::event> &depends = {})
 {
     // res = OP(res, arg)
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         BitwiseRightShiftContigHyperparameterSet<resTy, argTy>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         BitwiseRightShiftContigHyperparameterSet<resTy, argTy>::n_vecs;
 
     return elementwise_common::binary_inplace_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
index 87aab0519d..fb18128cc1 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
@@ -93,8 +93,8 @@ struct BitwiseXorFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using BitwiseXorContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -185,8 +185,8 @@ struct BitwiseXorContigHyperparameterSet
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class bitwise_xor_contig_kernel;
 
 template <typename argTy1, typename argTy2>
@@ -201,9 +201,9 @@ bitwise_xor_contig_impl(sycl::queue &exec_q,
                         ssize_t res_offset,
                         const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         BitwiseXorContigHyperparameterSet<argTy1, argTy2>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         BitwiseXorContigHyperparameterSet<argTy1, argTy2>::n_vecs;
 
     return elementwise_common::binary_contig_impl<
@@ -318,8 +318,8 @@ template <typename argT, typename resT> struct BitwiseXorInplaceFunctor
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using BitwiseXorInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
@@ -340,8 +340,8 @@ using BitwiseXorInplaceStridedFunctor =
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class bitwise_xor_inplace_contig_kernel;
 
 /* @brief Types supported by in-place bitwise XOR */
@@ -389,9 +389,9 @@ bitwise_xor_inplace_contig_impl(sycl::queue &exec_q,
                                 ssize_t res_offset,
                                 const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         BitwiseXorContigHyperparameterSet<resTy, argTy>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         BitwiseXorContigHyperparameterSet<resTy, argTy>::n_vecs;
 
     return elementwise_common::binary_inplace_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
index eb2ebb388b..a071558a5f 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
@@ -67,8 +67,8 @@ template <typename argT, typename resT> struct CbrtFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using CbrtContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -112,7 +112,7 @@ template <typename argTy> struct CbrtContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class cbrt_contig_kernel;
 
 template <typename argTy>
@@ -122,8 +122,8 @@ sycl::event cbrt_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = CbrtContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vecs = CbrtContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = CbrtContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vecs = CbrtContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, CbrtOutputType, CbrtContigFunctor, cbrt_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp
index 3edf0c3456..ab7610088f 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp
@@ -80,8 +80,8 @@ template <typename argT, typename resT> struct CeilFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using CeilContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -133,7 +133,7 @@ template <typename argTy> struct CeilContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class ceil_contig_kernel;
 
 template <typename argTy>
@@ -143,8 +143,8 @@ sycl::event ceil_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = CeilContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vecs = CeilContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = CeilContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vecs = CeilContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, CeilOutputType, CeilContigFunctor, ceil_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
index c2bb1db23b..7efd4b02ee 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
@@ -33,6 +33,7 @@
 #include "kernels/dpctl_tensor_types.hpp"
 #include "utils/offset_utils.hpp"
 #include "utils/sycl_alloc_utils.hpp"
+#include "utils/sycl_utils.hpp"
 
 namespace dpctl
 {
@@ -48,12 +49,15 @@ using dpctl::tensor::kernels::alignment_utils::
 using dpctl::tensor::kernels::alignment_utils::is_aligned;
 using dpctl::tensor::kernels::alignment_utils::required_alignment;
 
+using dpctl::tensor::sycl_utils::sub_group_load;
+using dpctl::tensor::sycl_utils::sub_group_store;
+
 /*! @brief Functor for unary function evaluation on contiguous array */
 template <typename argT,
           typename resT,
           typename UnaryOperatorT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 struct UnaryContigFunctor
 {
@@ -70,7 +74,7 @@ struct UnaryContigFunctor
 
     void operator()(sycl::nd_item<1> ndit) const
     {
-        constexpr std::uint32_t elems_per_wi = n_vecs * vec_sz;
+        constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz;
         UnaryOperatorT op{};
         /* Each work-item processes vec_sz elements, contiguous in memory */
         /* NOTE: work-group size must be divisible by sub-group size */
@@ -81,22 +85,21 @@ struct UnaryContigFunctor
             constexpr resT const_val = UnaryOperatorT::constant_value;
 
             auto sg = ndit.get_sub_group();
-            std::uint32_t sgSize = sg.get_max_local_range()[0];
+            const std::uint16_t sgSize = sg.get_max_local_range()[0];
 
-            size_t base = static_cast<size_t>(elems_per_wi) *
-                          (ndit.get_group(0) * ndit.get_local_range(0) +
-                           sg.get_group_id()[0] * sgSize);
-            if (base + static_cast<size_t>(elems_per_wi * sgSize) < nelems_) {
-                sycl::vec<resT, vec_sz> res_vec(const_val);
+            const size_t base =
+                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
+                                sg.get_group_id()[0] * sgSize);
+            if (base + elems_per_wi * sgSize < nelems_) {
+                constexpr sycl::vec<resT, vec_sz> res_vec(const_val);
 #pragma unroll
-                for (std::uint32_t it = 0; it < elems_per_wi; it += vec_sz) {
-                    size_t offset = base + static_cast<size_t>(it) *
-                                               static_cast<size_t>(sgSize);
+                for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
+                    const size_t offset = base + it * sgSize;
                     auto out_multi_ptr = sycl::address_space_cast<
                         sycl::access::address_space::global_space,
                         sycl::access::decorated::yes>(&out[offset]);
 
-                    sg.store<vec_sz>(out_multi_ptr, res_vec);
+                    sub_group_store<vec_sz>(sg, res_vec, out_multi_ptr);
                 }
             }
             else {
@@ -111,18 +114,15 @@ struct UnaryContigFunctor
                            UnaryOperatorT::supports_vec::value && (vec_sz > 1))
         {
             auto sg = ndit.get_sub_group();
-            std::uint32_t sgSize = sg.get_max_local_range()[0];
-
-            size_t base = static_cast<size_t>(elems_per_wi) *
-                          (ndit.get_group(0) * ndit.get_local_range(0) +
-                           sg.get_group_id()[0] * sgSize);
-            if (base + static_cast<size_t>(elems_per_wi * sgSize) < nelems_) {
-                sycl::vec<argT, vec_sz> x;
+            const std::uint16_t sgSize = sg.get_max_local_range()[0];
 
+            const size_t base =
+                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
+                                sg.get_group_id()[0] * sgSize);
+            if (base + elems_per_wi * sgSize < nelems_) {
 #pragma unroll
-                for (std::uint32_t it = 0; it < elems_per_wi; it += vec_sz) {
-                    size_t offset = base + static_cast<size_t>(it) *
-                                               static_cast<size_t>(sgSize);
+                for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
+                    const size_t offset = base + it * sgSize;
                     auto in_multi_ptr = sycl::address_space_cast<
                         sycl::access::address_space::global_space,
                         sycl::access::decorated::yes>(&in[offset]);
@@ -130,9 +130,10 @@ struct UnaryContigFunctor
                         sycl::access::address_space::global_space,
                         sycl::access::decorated::yes>(&out[offset]);
 
-                    x = sg.load<vec_sz>(in_multi_ptr);
-                    sycl::vec<resT, vec_sz> res_vec = op(x);
-                    sg.store<vec_sz>(out_multi_ptr, res_vec);
+                    const sycl::vec<argT, vec_sz> x =
+                        sub_group_load<vec_sz>(sg, in_multi_ptr);
+                    const sycl::vec<resT, vec_sz> res_vec = op(x);
+                    sub_group_store<vec_sz>(sg, res_vec, out_multi_ptr);
                 }
             }
             else {
@@ -150,18 +151,15 @@ struct UnaryContigFunctor
             // default: use scalar-value function
 
             auto sg = ndit.get_sub_group();
-            std::uint32_t sgSize = sg.get_max_local_range()[0];
-            size_t base = static_cast<size_t>(elems_per_wi) *
-                          (ndit.get_group(0) * ndit.get_local_range(0) +
-                           sg.get_group_id()[0] * sgSize);
-
-            if (base + static_cast<size_t>(elems_per_wi * sgSize) < nelems_) {
-                sycl::vec<argT, vec_sz> arg_vec;
+            const std::uint16_t sgSize = sg.get_max_local_range()[0];
+            const size_t base =
+                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
+                                sg.get_group_id()[0] * sgSize);
 
+            if (base + elems_per_wi * sgSize < nelems_) {
 #pragma unroll
-                for (std::uint32_t it = 0; it < elems_per_wi; it += vec_sz) {
-                    size_t offset = base + static_cast<size_t>(it) *
-                                               static_cast<size_t>(sgSize);
+                for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
+                    const size_t offset = base + it * sgSize;
                     auto in_multi_ptr = sycl::address_space_cast<
                         sycl::access::address_space::global_space,
                         sycl::access::decorated::yes>(&in[offset]);
@@ -169,12 +167,13 @@ struct UnaryContigFunctor
                         sycl::access::address_space::global_space,
                         sycl::access::decorated::yes>(&out[offset]);
 
-                    arg_vec = sg.load<vec_sz>(in_multi_ptr);
+                    sycl::vec<argT, vec_sz> arg_vec =
+                        sub_group_load<vec_sz>(sg, in_multi_ptr);
 #pragma unroll
                     for (std::uint32_t k = 0; k < vec_sz; ++k) {
                         arg_vec[k] = op(arg_vec[k]);
                     }
-                    sg.store<vec_sz>(out_multi_ptr, arg_vec);
+                    sub_group_store<vec_sz>(sg, arg_vec, out_multi_ptr);
                 }
             }
             else {
@@ -190,19 +189,15 @@ struct UnaryContigFunctor
             // default: use scalar-value function
 
             auto sg = ndit.get_sub_group();
-            std::uint32_t sgSize = sg.get_max_local_range()[0];
-            size_t base = static_cast<size_t>(elems_per_wi) *
-                          (ndit.get_group(0) * ndit.get_local_range(0) +
-                           sg.get_group_id()[0] * sgSize);
-
-            if (base + static_cast<size_t>(elems_per_wi * sgSize) < nelems_) {
-                sycl::vec<argT, vec_sz> arg_vec;
-                sycl::vec<resT, vec_sz> res_vec;
+            const std::uint16_t sgSize = sg.get_max_local_range()[0];
+            const size_t base =
+                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
+                                sg.get_group_id()[0] * sgSize);
 
+            if (base + elems_per_wi * sgSize < nelems_) {
 #pragma unroll
-                for (std::uint32_t it = 0; it < elems_per_wi; it += vec_sz) {
-                    size_t offset = base + static_cast<size_t>(it) *
-                                               static_cast<size_t>(sgSize);
+                for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
+                    const size_t offset = base + it * sgSize;
                     auto in_multi_ptr = sycl::address_space_cast<
                         sycl::access::address_space::global_space,
                         sycl::access::decorated::yes>(&in[offset]);
@@ -210,12 +205,14 @@ struct UnaryContigFunctor
                         sycl::access::address_space::global_space,
                         sycl::access::decorated::yes>(&out[offset]);
 
-                    arg_vec = sg.load<vec_sz>(in_multi_ptr);
+                    const sycl::vec<argT, vec_sz> arg_vec =
+                        sub_group_load<vec_sz>(sg, in_multi_ptr);
+                    sycl::vec<resT, vec_sz> res_vec;
 #pragma unroll
-                    for (std::uint32_t k = 0; k < vec_sz; ++k) {
+                    for (std::uint8_t k = 0; k < vec_sz; ++k) {
                         res_vec[k] = op(arg_vec[k]);
                     }
-                    sg.store<vec_sz>(out_multi_ptr, res_vec);
+                    sub_group_store<vec_sz>(sg, res_vec, out_multi_ptr);
                 }
             }
             else {
@@ -226,15 +223,14 @@ struct UnaryContigFunctor
             }
         }
         else {
-            size_t sgSize = ndit.get_sub_group().get_local_range()[0];
-            size_t base = ndit.get_global_linear_id();
-            const size_t elems_per_sg = sgSize * elems_per_wi;
+            const std::uint16_t sgSize =
+                ndit.get_sub_group().get_local_range()[0];
+            const size_t gid = ndit.get_global_linear_id();
+            const std::uint16_t elems_per_sg = sgSize * elems_per_wi;
 
-            base = (base / sgSize) * elems_per_sg + (base % sgSize);
-            for (size_t offset = base;
-                 offset < std::min(nelems_, base + elems_per_sg);
-                 offset += sgSize)
-            {
+            const size_t start = (gid / sgSize) * (elems_per_sg - sgSize) + gid;
+            const size_t end = std::min(nelems_, start + elems_per_sg);
+            for (size_t offset = start; offset < end; offset += sgSize) {
                 out[offset] = op(in[offset]);
             }
         }
@@ -288,25 +284,26 @@ template <typename argTy,
           class UnaryOutputType,
           template <typename A,
                     typename R,
-                    unsigned int vs,
-                    unsigned int nv,
+                    std::uint8_t vs,
+                    std::uint8_t nv,
                     bool enable>
           class ContigFunctorT,
-          template <typename A, typename R, unsigned int vs, unsigned int nv>
+          template <typename A, typename R, std::uint8_t vs, std::uint8_t nv>
           class kernel_name,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u>
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u>
 sycl::event unary_contig_impl(sycl::queue &exec_q,
                               size_t nelems,
                               const char *arg_p,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
-    const size_t n_work_items_needed = nelems / (n_vecs * vec_sz);
+    constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz;
+    const size_t n_work_items_needed = nelems / elems_per_wi;
     const size_t lws = select_lws(exec_q.get_device(), n_work_items_needed);
 
     const size_t n_groups =
-        ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
+        ((nelems + lws * elems_per_wi - 1) / (lws * elems_per_wi));
     const auto gws_range = sycl::range<1>(n_groups * lws);
     const auto lws_range = sycl::range<1>(lws);
 
@@ -388,8 +385,8 @@ template <typename argT1,
           typename argT2,
           typename resT,
           typename BinaryOperatorT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 struct BinaryContigFunctor
 {
@@ -410,7 +407,7 @@ struct BinaryContigFunctor
 
     void operator()(sycl::nd_item<1> ndit) const
     {
-        constexpr std::uint32_t elems_per_wi = n_vecs * vec_sz;
+        constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz;
         BinaryOperatorT op{};
         /* Each work-item processes vec_sz elements, contiguous in memory */
         /* NOTE: work-group size must be divisible by sub-group size */
@@ -422,19 +419,16 @@ struct BinaryContigFunctor
             auto sg = ndit.get_sub_group();
             std::uint16_t sgSize = sg.get_max_local_range()[0];
 
-            size_t base = static_cast<size_t>(elems_per_wi) *
-                          (ndit.get_group(0) * ndit.get_local_range(0) +
-                           sg.get_group_id()[0] * sgSize);
+            const size_t base =
+                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
+                                sg.get_group_id()[0] * sgSize);
 
-            if (base + static_cast<size_t>(elems_per_wi * sgSize) < nelems_) {
-                sycl::vec<argT1, vec_sz> arg1_vec;
-                sycl::vec<argT2, vec_sz> arg2_vec;
+            if (base + elems_per_wi * sgSize < nelems_) {
                 sycl::vec<resT, vec_sz> res_vec;
 
 #pragma unroll
-                for (std::uint32_t it = 0; it < elems_per_wi; it += vec_sz) {
-                    size_t offset = base + static_cast<size_t>(it) *
-                                               static_cast<size_t>(sgSize);
+                for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
+                    size_t offset = base + it * sgSize;
                     auto in1_multi_ptr = sycl::address_space_cast<
                         sycl::access::address_space::global_space,
                         sycl::access::decorated::yes>(&in1[offset]);
@@ -445,10 +439,12 @@ struct BinaryContigFunctor
                         sycl::access::address_space::global_space,
                         sycl::access::decorated::yes>(&out[offset]);
 
-                    arg1_vec = sg.load<vec_sz>(in1_multi_ptr);
-                    arg2_vec = sg.load<vec_sz>(in2_multi_ptr);
+                    const sycl::vec<argT1, vec_sz> arg1_vec =
+                        sub_group_load<vec_sz>(sg, in1_multi_ptr);
+                    const sycl::vec<argT2, vec_sz> arg2_vec =
+                        sub_group_load<vec_sz>(sg, in2_multi_ptr);
                     res_vec = op(arg1_vec, arg2_vec);
-                    sg.store<vec_sz>(out_multi_ptr, res_vec);
+                    sub_group_store<vec_sz>(sg, res_vec, out_multi_ptr);
                 }
             }
             else {
@@ -462,21 +458,16 @@ struct BinaryContigFunctor
                            BinaryOperatorT::supports_sg_loadstore::value)
         {
             auto sg = ndit.get_sub_group();
-            std::uint8_t sgSize = sg.get_max_local_range()[0];
+            const std::uint16_t sgSize = sg.get_max_local_range()[0];
 
-            size_t base = static_cast<size_t>(elems_per_wi) *
-                          (ndit.get_group(0) * ndit.get_local_range(0) +
-                           sg.get_group_id()[0] * sgSize);
-
-            if (base + static_cast<size_t>(elems_per_wi * sgSize) < nelems_) {
-                sycl::vec<argT1, vec_sz> arg1_vec;
-                sycl::vec<argT2, vec_sz> arg2_vec;
-                sycl::vec<resT, vec_sz> res_vec;
+            const size_t base =
+                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
+                                sg.get_group_id()[0] * sgSize);
 
+            if (base + elems_per_wi * sgSize < nelems_) {
 #pragma unroll
-                for (std::uint32_t it = 0; it < elems_per_wi; it += vec_sz) {
-                    size_t offset = base + static_cast<size_t>(it) *
-                                               static_cast<size_t>(sgSize);
+                for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
+                    const size_t offset = base + it * sgSize;
                     auto in1_multi_ptr = sycl::address_space_cast<
                         sycl::access::address_space::global_space,
                         sycl::access::decorated::yes>(&in1[offset]);
@@ -487,14 +478,18 @@ struct BinaryContigFunctor
                         sycl::access::address_space::global_space,
                         sycl::access::decorated::yes>(&out[offset]);
 
-                    arg1_vec = sg.load<vec_sz>(in1_multi_ptr);
-                    arg2_vec = sg.load<vec_sz>(in2_multi_ptr);
+                    const sycl::vec<argT1, vec_sz> arg1_vec =
+                        sub_group_load<vec_sz>(sg, in1_multi_ptr);
+                    const sycl::vec<argT2, vec_sz> arg2_vec =
+                        sub_group_load<vec_sz>(sg, in2_multi_ptr);
+
+                    sycl::vec<resT, vec_sz> res_vec;
 #pragma unroll
                     for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) {
                         res_vec[vec_id] =
                             op(arg1_vec[vec_id], arg2_vec[vec_id]);
                     }
-                    sg.store<vec_sz>(out_multi_ptr, res_vec);
+                    sub_group_store<vec_sz>(sg, res_vec, out_multi_ptr);
                 }
             }
             else {
@@ -509,11 +504,9 @@ struct BinaryContigFunctor
             const size_t gid = ndit.get_global_linear_id();
             const size_t elems_per_sg = sgSize * elems_per_wi;
 
-            const size_t base = (gid / sgSize) * elems_per_sg + (gid % sgSize);
-            for (size_t offset = base;
-                 offset < std::min(nelems_, base + elems_per_sg);
-                 offset += sgSize)
-            {
+            const size_t start = (gid / sgSize) * (elems_per_sg - sgSize) + gid;
+            const size_t end = std::min(nelems_, start + elems_per_sg);
+            for (size_t offset = start; offset < end; offset += sgSize) {
                 out[offset] = op(in1[offset], in2[offset]);
             }
         }
@@ -607,12 +600,12 @@ struct BinaryContigMatrixContigRowBroadcastingFunctor
                 sycl::access::address_space::global_space,
                 sycl::access::decorated::yes>(&res[base]);
 
-            const argT1 mat_el = sg.load(in1_multi_ptr);
-            const argT2 vec_el = sg.load(in2_multi_ptr);
+            const argT1 mat_el = sub_group_load(sg, in1_multi_ptr);
+            const argT2 vec_el = sub_group_load(sg, in2_multi_ptr);
 
             resT res_el = op(mat_el, vec_el);
 
-            sg.store(out_multi_ptr, res_el);
+            sub_group_store(sg, res_el, out_multi_ptr);
         }
         else {
             const size_t lane_id = sg.get_local_id()[0];
@@ -672,12 +665,12 @@ struct BinaryContigRowContigMatrixBroadcastingFunctor
                 sycl::access::address_space::global_space,
                 sycl::access::decorated::yes>(&res[base]);
 
-            const argT2 mat_el = sg.load(in2_multi_ptr);
-            const argT1 vec_el = sg.load(in1_multi_ptr);
+            const argT2 mat_el = sub_group_load(sg, in2_multi_ptr);
+            const argT1 vec_el = sub_group_load(sg, in1_multi_ptr);
 
             resT res_el = op(vec_el, mat_el);
 
-            sg.store(out_multi_ptr, res_el);
+            sub_group_store(sg, res_el, out_multi_ptr);
         }
         else {
             const size_t lane_id = sg.get_local_id()[0];
@@ -767,18 +760,18 @@ template <typename argTy1,
           template <typename T1,
                     typename T2,
                     typename T3,
-                    unsigned int vs,
-                    unsigned int nv,
+                    std::uint8_t vs,
+                    std::uint8_t nv,
                     bool enable_sg_loadstore>
           class BinaryContigFunctorT,
           template <typename T1,
                     typename T2,
                     typename T3,
-                    unsigned int vs,
-                    unsigned int nv>
+                    std::uint8_t vs,
+                    std::uint8_t nv>
           class kernel_name,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u>
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u>
 sycl::event binary_contig_impl(sycl::queue &exec_q,
                                size_t nelems,
                                const char *arg1_p,
@@ -928,10 +921,10 @@ sycl::event binary_contig_matrix_contig_row_broadcast_impl(
 
     // sub-group spans work-items [I, I + sgSize)
     // base = ndit.get_global_linear_id() - sg.get_local_id()[0]
-    // Generically, sg.load( &mat[base]) may load arrays from
+    // Generically, sub_group_load( &mat[base]) may load arrays from
     // different rows of mat. The start corresponds to row (base / n0)
-    // We read sg.load(&padded_vec[(base / n0)]). The vector is padded to
-    // ensure that reads are accessible
+    // We read sub_group_load(&padded_vec[(base / n0)]).
+    // The vector is padded to ensure that reads are accessible
 
     const size_t lws = 128;
 
@@ -1009,10 +1002,10 @@ sycl::event binary_contig_row_contig_matrix_broadcast_impl(
 
     // sub-group spans work-items [I, I + sgSize)
     // base = ndit.get_global_linear_id() - sg.get_local_id()[0]
-    // Generically, sg.load( &mat[base]) may load arrays from
+    // Generically, sub_group_load( &mat[base]) may load arrays from
     // different rows of mat. The start corresponds to row (base / n0)
-    // We read sg.load(&padded_vec[(base / n0)]). The vector is padded to
-    // ensure that reads are accessible
+    // We read sub_group_load(&padded_vec[(base / n0)]). The vector is
+    // padded to ensure that reads are accessible
 
     const size_t lws = 128;
 
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
index 5fc0775c8d..e3bf906484 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
@@ -33,6 +33,7 @@
 #include "kernels/dpctl_tensor_types.hpp"
 #include "utils/offset_utils.hpp"
 #include "utils/sycl_alloc_utils.hpp"
+#include "utils/sycl_utils.hpp"
 
 namespace dpctl
 {
@@ -48,11 +49,14 @@ using dpctl::tensor::kernels::alignment_utils::
 using dpctl::tensor::kernels::alignment_utils::is_aligned;
 using dpctl::tensor::kernels::alignment_utils::required_alignment;
 
+using dpctl::tensor::sycl_utils::sub_group_load;
+using dpctl::tensor::sycl_utils::sub_group_store;
+
 template <typename argT,
           typename resT,
           typename BinaryInplaceOperatorT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 struct BinaryInplaceContigFunctor
 {
@@ -72,7 +76,7 @@ struct BinaryInplaceContigFunctor
     void operator()(sycl::nd_item<1> ndit) const
     {
         BinaryInplaceOperatorT op{};
-        constexpr std::uint32_t elems_per_wi = vec_sz * n_vecs;
+        constexpr std::uint8_t elems_per_wi = vec_sz * n_vecs;
         /* Each work-item processes vec_sz elements, contiguous in memory */
         /* NB: Workgroup size must be divisible by sub-group size */
 
@@ -82,31 +86,31 @@ struct BinaryInplaceContigFunctor
                       (vec_sz > 1))
         {
             auto sg = ndit.get_sub_group();
-            std::uint8_t sgSize = sg.get_max_local_range()[0];
-
-            size_t base = static_cast<size_t>(elems_per_wi) *
-                          (ndit.get_group(0) * ndit.get_local_range(0) +
-                           sg.get_group_id()[0] * sgSize);
+            std::uint16_t sgSize = sg.get_max_local_range()[0];
 
-            if (base + static_cast<size_t>(elems_per_wi * sgSize) < nelems_) {
+            size_t base =
+                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
+                                sg.get_group_id()[0] * sgSize);
 
-                sycl::vec<argT, vec_sz> arg_vec;
-                sycl::vec<resT, vec_sz> res_vec;
+            if (base + elems_per_wi * sgSize < nelems_) {
 
 #pragma unroll
-                for (std::uint32_t it = 0; it < elems_per_wi; it += vec_sz) {
+                for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
+                    const size_t offset = base + it * sgSize;
                     auto rhs_multi_ptr = sycl::address_space_cast<
                         sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(&rhs[base + it * sgSize]);
+                        sycl::access::decorated::yes>(&rhs[offset]);
                     auto lhs_multi_ptr = sycl::address_space_cast<
                         sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(&lhs[base + it * sgSize]);
+                        sycl::access::decorated::yes>(&lhs[offset]);
 
-                    arg_vec = sg.load<vec_sz>(rhs_multi_ptr);
-                    res_vec = sg.load<vec_sz>(lhs_multi_ptr);
+                    const sycl::vec<argT, vec_sz> &arg_vec =
+                        sub_group_load<vec_sz>(sg, rhs_multi_ptr);
+                    sycl::vec<resT, vec_sz> res_vec =
+                        sub_group_load<vec_sz>(sg, lhs_multi_ptr);
                     op(res_vec, arg_vec);
 
-                    sg.store<vec_sz>(lhs_multi_ptr, res_vec);
+                    sub_group_store<vec_sz>(sg, res_vec, lhs_multi_ptr);
                 }
             }
             else {
@@ -120,32 +124,32 @@ struct BinaryInplaceContigFunctor
                            BinaryInplaceOperatorT::supports_sg_loadstore::value)
         {
             auto sg = ndit.get_sub_group();
-            std::uint32_t sgSize = sg.get_max_local_range()[0];
-
-            size_t base = static_cast<size_t>(elems_per_wi) *
-                          (ndit.get_group(0) * ndit.get_local_range(0) +
-                           sg.get_group_id()[0] * sgSize);
+            std::uint16_t sgSize = sg.get_max_local_range()[0];
 
-            if (base + static_cast<size_t>(elems_per_wi * sgSize) < nelems_) {
-                sycl::vec<argT, vec_sz> arg_vec;
-                sycl::vec<resT, vec_sz> res_vec;
+            size_t base =
+                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
+                                sg.get_group_id()[0] * sgSize);
 
+            if (base + elems_per_wi * sgSize < nelems_) {
 #pragma unroll
-                for (std::uint32_t it = 0; it < elems_per_wi; it += vec_sz) {
+                for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
+                    const size_t offset = base + it * sgSize;
                     auto rhs_multi_ptr = sycl::address_space_cast<
                         sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(&rhs[base + it * sgSize]);
+                        sycl::access::decorated::yes>(&rhs[offset]);
                     auto lhs_multi_ptr = sycl::address_space_cast<
                         sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(&lhs[base + it * sgSize]);
+                        sycl::access::decorated::yes>(&lhs[offset]);
 
-                    arg_vec = sg.load<vec_sz>(rhs_multi_ptr);
-                    res_vec = sg.load<vec_sz>(lhs_multi_ptr);
+                    const sycl::vec<argT, vec_sz> arg_vec =
+                        sub_group_load<vec_sz>(sg, rhs_multi_ptr);
+                    sycl::vec<resT, vec_sz> res_vec =
+                        sub_group_load<vec_sz>(sg, lhs_multi_ptr);
 #pragma unroll
-                    for (std::uint32_t vec_id = 0; vec_id < vec_sz; ++vec_id) {
+                    for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) {
                         op(res_vec[vec_id], arg_vec[vec_id]);
                     }
-                    sg.store<vec_sz>(lhs_multi_ptr, res_vec);
+                    sub_group_store<vec_sz>(sg, res_vec, lhs_multi_ptr);
                 }
             }
             else {
@@ -157,14 +161,12 @@ struct BinaryInplaceContigFunctor
         }
         else {
             const size_t sgSize = ndit.get_sub_group().get_local_range()[0];
-            size_t base = ndit.get_global_linear_id();
+            const size_t gid = ndit.get_global_linear_id();
             const size_t elems_per_sg = elems_per_wi * sgSize;
 
-            base = (base / sgSize) * elems_per_sg + (base % sgSize);
-            for (size_t offset = base;
-                 offset < std::min(nelems_, base + elems_per_sg);
-                 offset += sgSize)
-            {
+            const size_t start = (gid / sgSize) * (elems_per_sg - sgSize) + gid;
+            const size_t end = std::min(nelems_, start + elems_per_sg);
+            for (size_t offset = start; offset < end; offset += sgSize) {
                 op(lhs[offset], rhs[offset]);
             }
         }
@@ -229,7 +231,7 @@ struct BinaryInplaceRowMatrixBroadcastingFunctor
         static_assert(BinaryOperatorT::supports_sg_loadstore::value);
 
         auto sg = ndit.get_sub_group();
-        size_t gid = ndit.get_global_linear_id();
+        const size_t gid = ndit.get_global_linear_id();
 
         std::uint8_t sgSize = sg.get_max_local_range()[0];
         size_t base = gid - sg.get_local_id()[0];
@@ -243,17 +245,16 @@ struct BinaryInplaceRowMatrixBroadcastingFunctor
                 sycl::access::address_space::global_space,
                 sycl::access::decorated::yes>(&mat[base]);
 
-            const argT vec_el = sg.load(in_multi_ptr);
-            resT mat_el = sg.load(out_multi_ptr);
+            const argT vec_el = sub_group_load(sg, in_multi_ptr);
+            resT mat_el = sub_group_load(sg, out_multi_ptr);
 
             op(mat_el, vec_el);
 
-            sg.store(out_multi_ptr, mat_el);
+            sub_group_store(sg, mat_el, out_multi_ptr);
         }
         else {
-            for (size_t k = base + sg.get_local_id()[0]; k < n_elems;
-                 k += sgSize)
-            {
+            const size_t start = base + sg.get_local_id()[0];
+            for (size_t k = start; k < n_elems; k += sgSize) {
                 op(mat[k], padded_vec[k % n1]);
             }
         }
@@ -298,14 +299,14 @@ template <typename argTy,
           typename resTy,
           template <typename T1,
                     typename T2,
-                    unsigned int vs,
-                    unsigned int nv,
+                    std::uint8_t vs,
+                    std::uint8_t nv,
                     bool enable_sg_loadstore>
           class BinaryInplaceContigFunctorT,
-          template <typename T1, typename T2, unsigned int vs, unsigned int nv>
+          template <typename T1, typename T2, std::uint8_t vs, std::uint8_t nv>
           class kernel_name,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u>
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u>
 sycl::event
 binary_inplace_contig_impl(sycl::queue &exec_q,
                            size_t nelems,
@@ -434,10 +435,10 @@ sycl::event binary_inplace_row_matrix_broadcast_impl(
 
     // sub-group spans work-items [I, I + sgSize)
     // base = ndit.get_global_linear_id() - sg.get_local_id()[0]
-    // Generically, sg.load( &mat[base]) may load arrays from
+    // Generically, sub_group_load( &mat[base]) may load arrays from
     // different rows of mat. The start corresponds to row (base / n0)
-    // We read sg.load(&padded_vec[(base / n0)]). The vector is padded to
-    // ensure that reads are accessible
+    // We read sub_group_load(&padded_vec[(base / n0)]). The vector is
+    // padded to ensure that reads are accessible
 
     const size_t lws = 128;
 
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp
index b39a606108..486174435c 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp
@@ -84,8 +84,8 @@ template <typename argT, typename resT> struct ConjFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using ConjContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -140,7 +140,7 @@ template <typename argTy> struct ConjContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class conj_contig_kernel;
 
 template <typename argTy>
@@ -150,8 +150,8 @@ sycl::event conj_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = ConjContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vecs = ConjContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = ConjContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vecs = ConjContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, ConjOutputType, ConjContigFunctor, conj_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp
index db469a41ca..9ad6a6ad65 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp
@@ -84,8 +84,8 @@ template <typename argT1, typename argT2, typename resT> struct CopysignFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using CopysignContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
@@ -142,8 +142,8 @@ struct CopysignContigHyperparameterSet
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class copysign_contig_kernel;
 
 template <typename argTy1, typename argTy2>
@@ -157,9 +157,9 @@ sycl::event copysign_contig_impl(sycl::queue &exec_q,
                                  ssize_t res_offset,
                                  const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         CopysignContigHyperparameterSet<argTy1, argTy2>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         CopysignContigHyperparameterSet<argTy1, argTy2>::n_vecs;
 
     return elementwise_common::binary_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp
index b98f177777..52fbebe545 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp
@@ -165,8 +165,8 @@ template <typename argT, typename resT> struct CosFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using CosContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -213,7 +213,7 @@ template <typename argTy> struct CosContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class cos_contig_kernel;
 
 template <typename argTy>
@@ -223,8 +223,8 @@ sycl::event cos_contig_impl(sycl::queue &exec_q,
                             char *res_p,
                             const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = CosContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vecs = CosContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = CosContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vecs = CosContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, CosOutputType, CosContigFunctor, cos_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp
index 41bc33084c..b1752e5929 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp
@@ -155,8 +155,8 @@ template <typename argT, typename resT> struct CoshFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using CoshContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -202,7 +202,7 @@ template <typename argTy> struct CoshContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class cosh_contig_kernel;
 
 template <typename argTy>
@@ -212,8 +212,8 @@ sycl::event cosh_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = CoshContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vecs = CoshContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = CoshContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vecs = CoshContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, CoshOutputType, CoshContigFunctor, cosh_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp
index 044d6d00b3..6a455509c5 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp
@@ -121,8 +121,8 @@ template <typename argT1, typename argT2, typename resT> struct EqualFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using EqualContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
@@ -213,8 +213,8 @@ template <typename argTy1, typename argTy2> struct EqualContigHyperparameterSet
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class equal_contig_kernel;
 
 template <typename argTy1, typename argTy2>
@@ -228,9 +228,9 @@ sycl::event equal_contig_impl(sycl::queue &exec_q,
                               ssize_t res_offset,
                               const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         EqualContigHyperparameterSet<argTy1, argTy2>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         EqualContigHyperparameterSet<argTy1, argTy2>::n_vecs;
 
     return elementwise_common::binary_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp
index 6eb8d13cf5..21edeaeb31 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp
@@ -124,8 +124,8 @@ template <typename argT, typename resT> struct ExpFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using ExpContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -171,7 +171,7 @@ template <typename argTy> struct ExpContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class exp_contig_kernel;
 
 template <typename argTy>
@@ -181,8 +181,8 @@ sycl::event exp_contig_impl(sycl::queue &exec_q,
                             char *res_p,
                             const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = ExpContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vecs = ExpContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = ExpContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vecs = ExpContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, ExpOutputType, ExpContigFunctor, exp_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
index ed45ee45cd..df9a472329 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
@@ -126,8 +126,8 @@ template <typename argT, typename resT> struct Exp2Functor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using Exp2ContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -173,7 +173,7 @@ template <typename argTy> struct Exp2ContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class exp2_contig_kernel;
 
 template <typename argTy>
@@ -183,8 +183,8 @@ sycl::event exp2_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = Exp2ContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vecs = Exp2ContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = Exp2ContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vecs = Exp2ContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, Exp2OutputType, Exp2ContigFunctor, exp2_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp
index 87cbb70860..a8bebd7a15 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp
@@ -138,8 +138,8 @@ template <typename argT, typename resT> struct Expm1Functor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using Expm1ContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -186,7 +186,7 @@ template <typename argTy> struct Expm1ContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class expm1_contig_kernel;
 
 template <typename argTy>
@@ -196,8 +196,8 @@ sycl::event expm1_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = Expm1ContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vecs = Expm1ContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = Expm1ContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vecs = Expm1ContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, Expm1OutputType, Expm1ContigFunctor, expm1_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp
index 918f21133e..2381327766 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp
@@ -80,8 +80,8 @@ template <typename argT, typename resT> struct FloorFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using FloorContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -133,7 +133,7 @@ template <typename argTy> struct FloorContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class floor_contig_kernel;
 
 template <typename argTy>
@@ -143,8 +143,8 @@ sycl::event floor_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = FloorContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vecs = FloorContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = FloorContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vecs = FloorContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, FloorOutputType, FloorContigFunctor, floor_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
index 302631ff38..98bc9820ba 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
@@ -128,8 +128,8 @@ struct FloorDivideFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using FloorDivideContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -226,8 +226,8 @@ struct FloorDivideContigHyperparameterSet
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class floor_divide_contig_kernel;
 
 template <typename argTy1, typename argTy2>
@@ -242,9 +242,9 @@ floor_divide_contig_impl(sycl::queue &exec_q,
                          ssize_t res_offset,
                          const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         FloorDivideContigHyperparameterSet<argTy1, argTy2>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         FloorDivideContigHyperparameterSet<argTy1, argTy2>::n_vecs;
 
     return elementwise_common::binary_contig_impl<
@@ -395,8 +395,8 @@ template <typename argT, typename resT> struct FloorDivideInplaceFunctor
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using FloorDivideInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
@@ -417,8 +417,8 @@ using FloorDivideInplaceStridedFunctor =
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class floor_divide_inplace_contig_kernel;
 
 /* @brief Types supported by in-place floor division */
@@ -468,9 +468,9 @@ floor_divide_inplace_contig_impl(sycl::queue &exec_q,
                                  ssize_t res_offset,
                                  const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         FloorDivideContigHyperparameterSet<resTy, argTy>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         FloorDivideContigHyperparameterSet<resTy, argTy>::n_vecs;
 
     return elementwise_common::binary_inplace_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp
index 0191016988..588ebc780d 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp
@@ -122,8 +122,8 @@ template <typename argT1, typename argT2, typename resT> struct GreaterFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using GreaterContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
@@ -215,8 +215,8 @@ struct GreaterContigHyperparameterSet
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class greater_contig_kernel;
 
 template <typename argTy1, typename argTy2>
@@ -230,9 +230,9 @@ sycl::event greater_contig_impl(sycl::queue &exec_q,
                                 ssize_t res_offset,
                                 const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         GreaterContigHyperparameterSet<argTy1, argTy2>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         GreaterContigHyperparameterSet<argTy1, argTy2>::n_vecs;
 
     return elementwise_common::binary_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp
index 1ef95f59fe..614fb202e1 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp
@@ -123,8 +123,8 @@ struct GreaterEqualFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using GreaterEqualContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -216,8 +216,8 @@ struct GreaterEqualContigHyperparameterSet
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class greater_equal_contig_kernel;
 
 template <typename argTy1, typename argTy2>
@@ -232,9 +232,9 @@ greater_equal_contig_impl(sycl::queue &exec_q,
                           ssize_t res_offset,
                           const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         GreaterEqualContigHyperparameterSet<argTy1, argTy2>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         GreaterEqualContigHyperparameterSet<argTy1, argTy2>::n_vecs;
 
     return elementwise_common::binary_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp
index 0bb2b07151..f65951f36b 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp
@@ -86,8 +86,8 @@ template <typename argT1, typename argT2, typename resT> struct HypotFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using HypotContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
@@ -143,8 +143,8 @@ template <typename argTy1, typename argTy2> struct HypotContigHyperparameterSet
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class hypot_contig_kernel;
 
 template <typename argTy1, typename argTy2>
@@ -158,9 +158,9 @@ sycl::event hypot_contig_impl(sycl::queue &exec_q,
                               ssize_t res_offset,
                               const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         HypotContigHyperparameterSet<argTy1, argTy2>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         HypotContigHyperparameterSet<argTy1, argTy2>::n_vecs;
 
     return elementwise_common::binary_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp
index 4e86278f27..1d33f83d27 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp
@@ -80,8 +80,8 @@ template <typename argT, typename resT> struct ImagFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using ImagContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -136,7 +136,7 @@ template <typename argTy> struct ImagContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class imag_contig_kernel;
 
 template <typename argTy>
@@ -146,8 +146,8 @@ sycl::event imag_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = ImagContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vecs = ImagContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = ImagContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vecs = ImagContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, ImagOutputType, ImagContigFunctor, imag_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp
index 2d444b3274..067e3e36ee 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp
@@ -99,8 +99,8 @@ template <typename argT, typename resT> struct IsFiniteFunctor
 
 template <typename argT,
           typename resT = bool,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using IsFiniteContigFunctor =
     elementwise_common::UnaryContigFunctor<argT,
@@ -138,7 +138,7 @@ template <typename argTy> struct IsFiniteContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class isfinite_contig_kernel;
 
 template <typename argTy>
@@ -148,9 +148,9 @@ sycl::event isfinite_contig_impl(sycl::queue &exec_q,
                                  char *res_p,
                                  const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         IsFiniteContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         IsFiniteContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp
index 8db812bbed..70069bdaa2 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp
@@ -99,8 +99,8 @@ template <typename argT, typename resT> struct IsInfFunctor
 
 template <typename argT,
           typename resT = bool,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using IsInfContigFunctor =
     elementwise_common::UnaryContigFunctor<argT,
@@ -138,7 +138,7 @@ template <typename argTy> struct IsInfContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class isinf_contig_kernel;
 
 template <typename argTy>
@@ -148,8 +148,8 @@ sycl::event isinf_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = IsInfContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vecs = IsInfContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = IsInfContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vecs = IsInfContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, IsInfOutputType, IsInfContigFunctor, isinf_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp
index 512e1422a3..0d8a15d0b8 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp
@@ -97,8 +97,8 @@ template <typename argT, typename resT> struct IsNanFunctor
 
 template <typename argT,
           typename resT = bool,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using IsNanContigFunctor =
     elementwise_common::UnaryContigFunctor<argT,
@@ -136,7 +136,7 @@ template <typename argTy> struct IsNanContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class isnan_contig_kernel;
 
 template <typename argTy>
@@ -146,8 +146,8 @@ sycl::event isnan_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = IsNanContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vecs = IsNanContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = IsNanContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vecs = IsNanContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, IsNanOutputType, IsNanContigFunctor, isnan_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp
index e69d4ec257..43f11725b7 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp
@@ -120,8 +120,8 @@ template <typename argT1, typename argT2, typename resT> struct LessFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using LessContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
@@ -212,8 +212,8 @@ template <typename argTy1, typename argTy2> struct LessContigHyperparameterSet
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class less_contig_kernel;
 
 template <typename argTy1, typename argTy2>
@@ -227,9 +227,9 @@ sycl::event less_contig_impl(sycl::queue &exec_q,
                              ssize_t res_offset,
                              const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         LessContigHyperparameterSet<argTy1, argTy2>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         LessContigHyperparameterSet<argTy1, argTy2>::n_vecs;
 
     return elementwise_common::binary_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp
index 08624e3a8e..81cc375c16 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp
@@ -121,8 +121,8 @@ template <typename argT1, typename argT2, typename resT> struct LessEqualFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using LessEqualContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -214,8 +214,8 @@ struct LessEqualContigHyperparameterSet
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class less_equal_contig_kernel;
 
 template <typename argTy1, typename argTy2>
@@ -229,9 +229,9 @@ sycl::event less_equal_contig_impl(sycl::queue &exec_q,
                                    ssize_t res_offset,
                                    const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         LessEqualContigHyperparameterSet<argTy1, argTy2>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         LessEqualContigHyperparameterSet<argTy1, argTy2>::n_vecs;
 
     return elementwise_common::binary_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp
index 7020603250..13eb64afca 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp
@@ -80,8 +80,8 @@ template <typename argT, typename resT> struct LogFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using LogContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -128,7 +128,7 @@ template <typename argTy> struct LogContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class log_contig_kernel;
 
 template <typename argTy>
@@ -138,8 +138,8 @@ sycl::event log_contig_impl(sycl::queue &exec_q,
                             char *res_p,
                             const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = LogContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vecs = LogContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = LogContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vecs = LogContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, LogOutputType, LogContigFunctor, log_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp
index c7a4ac50bb..ea486239e7 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp
@@ -99,8 +99,8 @@ template <typename argT, typename resT> struct Log10Functor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using Log10ContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -147,7 +147,7 @@ template <typename argTy> struct Log10ContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class log10_contig_kernel;
 
 template <typename argTy>
@@ -157,8 +157,8 @@ sycl::event log10_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = Log10ContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vecs = Log10ContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = Log10ContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vecs = Log10ContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, Log10OutputType, Log10ContigFunctor, log10_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp
index aa72ed5262..3df38d05f0 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp
@@ -104,8 +104,8 @@ template <typename argT, typename resT> struct Log1pFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using Log1pContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -152,7 +152,7 @@ template <typename argTy> struct Log1pContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class log1p_contig_kernel;
 
 template <typename argTy>
@@ -162,8 +162,8 @@ sycl::event log1p_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = Log1pContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vecs = Log1pContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = Log1pContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vecs = Log1pContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, Log1pOutputType, Log1pContigFunctor, log1p_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp
index 18e5e42954..2da4c55de0 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp
@@ -100,8 +100,8 @@ template <typename argT, typename resT> struct Log2Functor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using Log2ContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -148,7 +148,7 @@ template <typename argTy> struct Log2ContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class log2_contig_kernel;
 
 template <typename argTy>
@@ -158,8 +158,8 @@ sycl::event log2_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = Log2ContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vecs = Log2ContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = Log2ContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vecs = Log2ContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, Log2OutputType, Log2ContigFunctor, log2_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
index e191d2dd72..6d2375c20d 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
@@ -101,8 +101,8 @@ template <typename argT1, typename argT2, typename resT> struct LogAddExpFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using LogAddExpContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -159,8 +159,8 @@ struct LogAddExpContigHyperparameterSet
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class logaddexp_contig_kernel;
 
 template <typename argTy1, typename argTy2>
@@ -174,9 +174,9 @@ sycl::event logaddexp_contig_impl(sycl::queue &exec_q,
                                   ssize_t res_offset,
                                   const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         LogAddExpContigHyperparameterSet<argTy1, argTy2>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         LogAddExpContigHyperparameterSet<argTy1, argTy2>::n_vecs;
 
     return elementwise_common::binary_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp
index ef01dc8a53..768ace7754 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp
@@ -95,8 +95,8 @@ struct LogicalAndFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using LogicalAndContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -184,8 +184,8 @@ struct LogicalAndContigHyperparameterSet
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class logical_and_contig_kernel;
 
 template <typename argTy1, typename argTy2>
@@ -200,9 +200,9 @@ logical_and_contig_impl(sycl::queue &exec_q,
                         ssize_t res_offset,
                         const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         LogicalAndContigHyperparameterSet<argTy1, argTy2>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         LogicalAndContigHyperparameterSet<argTy1, argTy2>::n_vecs;
 
     return elementwise_common::binary_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp
index 22e7aaa58a..53c5404caa 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp
@@ -69,8 +69,8 @@ template <typename argT, typename resT> struct LogicalNotFunctor
 
 template <typename argT,
           typename resT = bool,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using LogicalNotContigFunctor =
     elementwise_common::UnaryContigFunctor<argT,
@@ -111,7 +111,7 @@ template <typename argTy> struct LogicalNotContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class logical_not_contig_kernel;
 
 template <typename argTy>
@@ -122,9 +122,9 @@ logical_not_contig_impl(sycl::queue &exec_q,
                         char *res_p,
                         const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         LogicalNotContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         LogicalNotContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp
index 56baba9367..93c5f3b9a6 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp
@@ -94,8 +94,8 @@ template <typename argT1, typename argT2, typename resT> struct LogicalOrFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using LogicalOrContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -183,8 +183,8 @@ struct LogicalOrContigHyperparameterSet
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class logical_or_contig_kernel;
 
 template <typename argTy1, typename argTy2>
@@ -198,9 +198,9 @@ sycl::event logical_or_contig_impl(sycl::queue &exec_q,
                                    ssize_t res_offset,
                                    const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         LogicalOrContigHyperparameterSet<argTy1, argTy2>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         LogicalOrContigHyperparameterSet<argTy1, argTy2>::n_vecs;
 
     return elementwise_common::binary_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp
index 2beb96777a..9ff54b6f16 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp
@@ -96,8 +96,8 @@ struct LogicalXorFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using LogicalXorContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -185,8 +185,8 @@ struct LogicalXorContigHyperparameterSet
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class logical_xor_contig_kernel;
 
 template <typename argTy1, typename argTy2>
@@ -201,9 +201,9 @@ logical_xor_contig_impl(sycl::queue &exec_q,
                         ssize_t res_offset,
                         const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         LogicalXorContigHyperparameterSet<argTy1, argTy2>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         LogicalXorContigHyperparameterSet<argTy1, argTy2>::n_vecs;
 
     return elementwise_common::binary_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
index 374a2bbceb..ed44b8ade7 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
@@ -108,8 +108,8 @@ template <typename argT1, typename argT2, typename resT> struct MaximumFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using MaximumContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
@@ -217,8 +217,8 @@ struct MaximumContigHyperparameterSet
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class maximum_contig_kernel;
 
 template <typename argTy1, typename argTy2>
@@ -232,9 +232,9 @@ sycl::event maximum_contig_impl(sycl::queue &exec_q,
                                 ssize_t res_offset,
                                 const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         MaximumContigHyperparameterSet<argTy1, argTy2>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         MaximumContigHyperparameterSet<argTy1, argTy2>::n_vecs;
 
     return elementwise_common::binary_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
index 274f1b8f89..551daf0498 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
@@ -108,8 +108,8 @@ template <typename argT1, typename argT2, typename resT> struct MinimumFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using MinimumContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
@@ -217,8 +217,8 @@ struct MinimumContigHyperparameterSet
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class minimum_contig_kernel;
 
 template <typename argTy1, typename argTy2>
@@ -232,9 +232,9 @@ sycl::event minimum_contig_impl(sycl::queue &exec_q,
                                 ssize_t res_offset,
                                 const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         MinimumContigHyperparameterSet<argTy1, argTy2>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         MinimumContigHyperparameterSet<argTy1, argTy2>::n_vecs;
 
     return elementwise_common::binary_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
index a684ba27fa..37b3803c27 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
@@ -100,8 +100,8 @@ template <typename argT1, typename argT2, typename resT> struct MultiplyFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using MultiplyContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
@@ -209,8 +209,8 @@ struct MultiplyContigHyperparameterSet
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class multiply_contig_kernel;
 
 template <typename argTy1, typename argTy2>
@@ -224,9 +224,9 @@ sycl::event multiply_contig_impl(sycl::queue &exec_q,
                                  ssize_t res_offset,
                                  const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         MultiplyContigHyperparameterSet<argTy1, argTy2>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         MultiplyContigHyperparameterSet<argTy1, argTy2>::n_vecs;
 
     return elementwise_common::binary_contig_impl<
@@ -430,8 +430,8 @@ template <typename argT, typename resT> struct MultiplyInplaceFunctor
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using MultiplyInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
@@ -452,8 +452,8 @@ using MultiplyInplaceStridedFunctor =
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class multiply_inplace_contig_kernel;
 
 /* @brief Types supported by in-place multiplication */
@@ -510,9 +510,9 @@ multiply_inplace_contig_impl(sycl::queue &exec_q,
                              ssize_t res_offset,
                              const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         MultiplyContigHyperparameterSet<resTy, argTy>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         MultiplyContigHyperparameterSet<resTy, argTy>::n_vecs;
 
     return elementwise_common::binary_inplace_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
index 54ce47a54e..a036158ccd 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
@@ -67,8 +67,8 @@ template <typename argT, typename resT> struct NegativeFunctor
 
 template <typename argT,
           typename resT = argT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using NegativeContigFunctor =
     elementwise_common::UnaryContigFunctor<argT,
@@ -118,7 +118,7 @@ template <typename argTy> struct NegativeContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class negative_contig_kernel;
 
 template <typename argTy>
@@ -128,9 +128,9 @@ sycl::event negative_contig_impl(sycl::queue &exec_q,
                                  char *res_p,
                                  const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         NegativeContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         NegativeContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp
index f91df44868..b58b1b98ef 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp
@@ -84,8 +84,8 @@ template <typename argT1, typename argT2, typename resT> struct NextafterFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using NextafterContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -142,8 +142,8 @@ struct NextafterContigHyperparameterSet
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class nextafter_contig_kernel;
 
 template <typename argTy1, typename argTy2>
@@ -157,9 +157,9 @@ sycl::event nextafter_contig_impl(sycl::queue &exec_q,
                                   ssize_t res_offset,
                                   const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         NextafterContigHyperparameterSet<argTy1, argTy2>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         NextafterContigHyperparameterSet<argTy1, argTy2>::n_vecs;
 
     return elementwise_common::binary_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
index f5c6fb1fe5..be1231648c 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
@@ -105,8 +105,8 @@ template <typename argT1, typename argT2, typename resT> struct NotEqualFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using NotEqualContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
@@ -198,8 +198,8 @@ struct NotEqualContigHyperparameterSet
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class not_equal_contig_kernel;
 
 template <typename argTy1, typename argTy2>
@@ -213,9 +213,9 @@ sycl::event not_equal_contig_impl(sycl::queue &exec_q,
                                   ssize_t res_offset,
                                   const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         NotEqualContigHyperparameterSet<argTy1, argTy2>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         NotEqualContigHyperparameterSet<argTy1, argTy2>::n_vecs;
 
     return elementwise_common::binary_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
index 3976792af8..3ccca611d8 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
@@ -82,8 +82,8 @@ template <typename argT, typename resT> struct PositiveFunctor
 
 template <typename argT,
           typename resT = argT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using PositiveContigFunctor =
     elementwise_common::UnaryContigFunctor<argT,
@@ -133,7 +133,7 @@ template <typename argTy> struct PositiveContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class positive_contig_kernel;
 
 template <typename argTy>
@@ -143,9 +143,9 @@ sycl::event positive_contig_impl(sycl::queue &exec_q,
                                  char *res_p,
                                  const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         PositiveContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         PositiveContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
index 4f01215bd3..353e516d28 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
@@ -153,8 +153,8 @@ template <typename argT1, typename argT2, typename resT> struct PowFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using PowContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
@@ -260,8 +260,8 @@ template <typename argTy1, typename argTy2> struct PowContigHyperparameterSet
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class pow_contig_kernel;
 
 template <typename argTy1, typename argTy2>
@@ -275,9 +275,9 @@ sycl::event pow_contig_impl(sycl::queue &exec_q,
                             ssize_t res_offset,
                             const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         PowContigHyperparameterSet<argTy1, argTy2>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         PowContigHyperparameterSet<argTy1, argTy2>::n_vecs;
 
     return elementwise_common::binary_contig_impl<
@@ -443,8 +443,8 @@ template <typename argT, typename resT> struct PowInplaceFunctor
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using PowInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor<
     argT,
@@ -464,8 +464,8 @@ using PowInplaceStridedFunctor =
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class pow_inplace_contig_kernel;
 
 /* @brief Types supported by in-place pow */
@@ -521,9 +521,9 @@ pow_inplace_contig_impl(sycl::queue &exec_q,
                         ssize_t res_offset,
                         const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         PowContigHyperparameterSet<resTy, argTy>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         PowContigHyperparameterSet<resTy, argTy>::n_vecs;
 
     return elementwise_common::binary_inplace_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp
index 8fc5226c55..1297dab283 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp
@@ -93,8 +93,8 @@ template <typename argT, typename resT> struct ProjFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using ProjContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -137,7 +137,7 @@ template <typename argTy> struct ProjContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class proj_contig_kernel;
 
 template <typename argTy>
@@ -147,8 +147,8 @@ sycl::event proj_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = ProjContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vecs = ProjContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = ProjContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vecs = ProjContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, ProjOutputType, ProjContigFunctor, proj_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp
index 9af7fec9e4..270b613346 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp
@@ -80,8 +80,8 @@ template <typename argT, typename resT> struct RealFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using RealContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -136,7 +136,7 @@ template <typename argTy> struct RealContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class real_contig_kernel;
 
 template <typename argTy>
@@ -146,8 +146,8 @@ sycl::event real_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = RealContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vecs = RealContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = RealContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vecs = RealContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, RealOutputType, RealContigFunctor, real_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp
index 7e314cb71d..90909ea772 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp
@@ -83,8 +83,8 @@ template <typename argT, typename resT> struct ReciprocalFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using ReciprocalContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -133,7 +133,7 @@ template <typename argTy> struct ReciprocalContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class reciprocal_contig_kernel;
 
 template <typename argTy>
@@ -143,9 +143,9 @@ sycl::event reciprocal_contig_impl(sycl::queue &exec_q,
                                    char *res_p,
                                    const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         ReciprocalContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         ReciprocalContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
index 68e5c6134a..57467d56b3 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
@@ -146,8 +146,8 @@ template <typename argT1, typename argT2, typename resT> struct RemainderFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using RemainderContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -244,8 +244,8 @@ struct RemainderContigHyperparameterSet
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class remainder_contig_kernel;
 
 template <typename argTy1, typename argTy2>
@@ -259,9 +259,9 @@ sycl::event remainder_contig_impl(sycl::queue &exec_q,
                                   ssize_t res_offset,
                                   const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         RemainderContigHyperparameterSet<argTy1, argTy2>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         RemainderContigHyperparameterSet<argTy1, argTy2>::n_vecs;
 
     return elementwise_common::binary_contig_impl<
@@ -421,8 +421,8 @@ template <typename argT, typename resT> struct RemainderInplaceFunctor
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using RemainderInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
@@ -443,8 +443,8 @@ using RemainderInplaceStridedFunctor =
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class remainder_inplace_contig_kernel;
 
 /* @brief Types supported by in-place remainder */
@@ -492,9 +492,9 @@ remainder_inplace_contig_impl(sycl::queue &exec_q,
                               ssize_t res_offset,
                               const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         RemainderContigHyperparameterSet<resTy, argTy>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         RemainderContigHyperparameterSet<resTy, argTy>::n_vecs;
 
     return elementwise_common::binary_inplace_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp
index a0358dbbba..60ea58f7c3 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp
@@ -89,8 +89,8 @@ template <typename argT, typename resT> struct RoundFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using RoundContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -144,7 +144,7 @@ template <typename argTy> struct RoundContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class round_contig_kernel;
 
 template <typename argTy>
@@ -154,8 +154,8 @@ sycl::event round_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = RoundContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vecs = RoundContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = RoundContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vecs = RoundContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, RoundOutputType, RoundContigFunctor, round_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
index ed8e6f16f7..f92dac50b1 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
@@ -70,8 +70,8 @@ template <typename argT, typename resT> struct RsqrtFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using RsqrtContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -115,7 +115,7 @@ template <typename argTy> struct RsqrtContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class rsqrt_contig_kernel;
 
 template <typename argTy>
@@ -125,8 +125,8 @@ sycl::event rsqrt_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = RsqrtContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vecs = RsqrtContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = RsqrtContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vecs = RsqrtContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, RsqrtOutputType, RsqrtContigFunctor, rsqrt_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp
index c222b063a1..ffb4183474 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp
@@ -105,8 +105,8 @@ template <typename argT, typename resT> struct SignFunctor
 
 template <typename argT,
           typename resT = argT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using SignContigFunctor =
     elementwise_common::UnaryContigFunctor<argT,
@@ -156,7 +156,7 @@ template <typename argTy> struct SignContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class sign_contig_kernel;
 
 template <typename argTy>
@@ -166,8 +166,8 @@ sycl::event sign_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = SignContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vecs = SignContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = SignContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vecs = SignContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, SignOutputType, SignContigFunctor, sign_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp
index 69e5cff29e..7ba04fcd17 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp
@@ -77,8 +77,8 @@ template <typename argT, typename resT> struct SignbitFunctor
 
 template <typename argT,
           typename resT = bool,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using SignbitContigFunctor =
     elementwise_common::UnaryContigFunctor<argT,
@@ -122,7 +122,7 @@ template <typename argTy> struct SignbitContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class signbit_contig_kernel;
 
 template <typename argTy>
@@ -132,9 +132,9 @@ sycl::event signbit_contig_impl(sycl::queue &exec_q,
                                 char *res_p,
                                 const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         SignbitContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         SignbitContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
index f3fe1105b6..596c1de9e4 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
@@ -188,8 +188,8 @@ template <typename argT, typename resT> struct SinFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using SinContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -235,7 +235,7 @@ template <typename argTy> struct SinContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class sin_contig_kernel;
 
 template <typename argTy>
@@ -245,8 +245,8 @@ sycl::event sin_contig_impl(sycl::queue &exec_q,
                             char *res_p,
                             const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = SinContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vecs = SinContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = SinContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vecs = SinContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, SinOutputType, SinContigFunctor, sin_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
index a7df2789de..6d418872b8 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
@@ -157,8 +157,8 @@ template <typename argT, typename resT> struct SinhFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using SinhContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -204,7 +204,7 @@ template <typename argTy> struct SinhContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class sinh_contig_kernel;
 
 template <typename argTy>
@@ -214,8 +214,8 @@ sycl::event sinh_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = SinhContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vecs = SinhContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = SinhContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vecs = SinhContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, SinhOutputType, SinhContigFunctor, sinh_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
index 1bc5b25662..6dcb2ca742 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
@@ -82,8 +82,8 @@ template <typename argT, typename resT> struct SqrtFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using SqrtContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -130,7 +130,7 @@ template <typename argTy> struct SqrtContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class sqrt_contig_kernel;
 
 template <typename argTy>
@@ -140,8 +140,8 @@ sycl::event sqrt_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = SqrtContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vecs = SqrtContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = SqrtContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vecs = SqrtContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, SqrtOutputType, SqrtContigFunctor, sqrt_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp
index 9135515091..dbf665b79c 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp
@@ -99,8 +99,8 @@ template <typename argT, typename resT> struct SquareFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using SquareContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -155,7 +155,7 @@ template <typename argTy> struct SquareContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class square_contig_kernel;
 
 template <typename argTy>
@@ -165,9 +165,9 @@ sycl::event square_contig_impl(sycl::queue &exec_q,
                                char *res_p,
                                const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         SquareContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         SquareContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
index 6bfa1ffab0..47ca000c3f 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
@@ -87,8 +87,8 @@ template <typename argT1, typename argT2, typename resT> struct SubtractFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using SubtractContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
@@ -195,8 +195,8 @@ struct SubtractContigHyperparameterSet
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class subtract_contig_kernel;
 
 template <typename argTy1, typename argTy2>
@@ -210,9 +210,9 @@ sycl::event subtract_contig_impl(sycl::queue &exec_q,
                                  ssize_t res_offset,
                                  const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         SubtractContigHyperparameterSet<argTy1, argTy2>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         SubtractContigHyperparameterSet<argTy1, argTy2>::n_vecs;
 
     return elementwise_common::binary_contig_impl<
@@ -429,8 +429,8 @@ template <typename argT, typename resT> struct SubtractInplaceFunctor
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using SubtractInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
@@ -451,8 +451,8 @@ using SubtractInplaceStridedFunctor =
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class subtract_inplace_contig_kernel;
 
 /* @brief Types supported by in-place subtraction */
@@ -508,9 +508,9 @@ subtract_inplace_contig_impl(sycl::queue &exec_q,
                              ssize_t res_offset,
                              const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         SubtractContigHyperparameterSet<resTy, argTy>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         SubtractContigHyperparameterSet<resTy, argTy>::n_vecs;
 
     return elementwise_common::binary_inplace_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp
index 17d423466e..a7da718a4b 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp
@@ -132,8 +132,8 @@ template <typename argT, typename resT> struct TanFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using TanContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -179,7 +179,7 @@ template <typename argTy> struct TanContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class tan_contig_kernel;
 
 template <typename argTy>
@@ -189,8 +189,8 @@ sycl::event tan_contig_impl(sycl::queue &exec_q,
                             char *res_p,
                             const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = TanContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vecs = TanContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = TanContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vecs = TanContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, TanOutputType, TanContigFunctor, tan_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp
index a64313c0ca..626420d48b 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp
@@ -126,8 +126,8 @@ template <typename argT, typename resT> struct TanhFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using TanhContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -173,7 +173,7 @@ template <typename argTy> struct TanhContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class tanh_contig_kernel;
 
 template <typename argTy>
@@ -183,8 +183,8 @@ sycl::event tanh_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = TanhContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vecs = TanhContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = TanhContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vecs = TanhContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, TanhOutputType, TanhContigFunctor, tanh_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
index 0f429f662b..27de2069ff 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
@@ -114,8 +114,8 @@ struct TrueDivideFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using TrueDivideContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -202,8 +202,8 @@ struct TrueDivideContigHyperparameterSet
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class true_divide_contig_kernel;
 
 template <typename argTy1, typename argTy2>
@@ -218,9 +218,9 @@ true_divide_contig_impl(sycl::queue &exec_q,
                         ssize_t res_offset,
                         const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         TrueDivideContigHyperparameterSet<argTy1, argTy2>::vec_sz;
-    constexpr unsigned int n_vecs =
+    constexpr std::uint8_t n_vecs =
         TrueDivideContigHyperparameterSet<argTy1, argTy2>::n_vecs;
 
     return elementwise_common::binary_contig_impl<
@@ -501,8 +501,8 @@ struct TrueDivideInplaceTypeMapFactory
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using TrueDivideInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
@@ -523,8 +523,8 @@ using TrueDivideInplaceStridedFunctor =
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz,
-          unsigned int n_vecs>
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
 class true_divide_inplace_contig_kernel;
 
 template <typename argTy, typename resTy>
@@ -537,9 +537,9 @@ true_divide_inplace_contig_impl(sycl::queue &exec_q,
                                 ssize_t res_offset,
                                 const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz =
+    constexpr std::uint8_t vec_sz =
         TrueDivideContigHyperparameterSet<resTy, argTy>::vec_sz;
-    constexpr unsigned int n_vecs =
-        TrueDivideContigHyperparameterSet<resTy, argTy>::vec_sz;
+    constexpr std::uint8_t n_vecs =
+        TrueDivideContigHyperparameterSet<resTy, argTy>::n_vecs;
 
     return elementwise_common::binary_inplace_contig_impl<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp
index b014959f9f..cf9d6fa14f 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp
@@ -77,8 +77,8 @@ template <typename argT, typename resT> struct TruncFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4u,
-          unsigned int n_vecs = 2u,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 using TruncContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
@@ -130,7 +130,7 @@ template <typename argTy> struct TruncContigHyperparameterSet
 
 } // end of anonymous namespace
 
-template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class trunc_contig_kernel;
 
 template <typename argTy>
@@ -140,8 +140,8 @@ sycl::event trunc_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
-    constexpr unsigned int vec_sz = TruncContigHyperparameterSet<argTy>::vec_sz;
-    constexpr unsigned int n_vecs = TruncContigHyperparameterSet<argTy>::n_vecs;
+    constexpr std::uint8_t vec_sz = TruncContigHyperparameterSet<argTy>::vec_sz;
+    constexpr std::uint8_t n_vecs = TruncContigHyperparameterSet<argTy>::n_vecs;
 
     return elementwise_common::unary_contig_impl<
         argTy, TruncOutputType, TruncContigFunctor, trunc_contig_kernel, vec_sz,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp
index b827db28b9..0be2c68c3b 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp
@@ -41,30 +41,30 @@ template <typename Ty1,
           typename ArgTy1,
           typename Ty2,
           typename ArgTy2,
-          unsigned int vec_sz_v,
-          unsigned int n_vecs_v>
+          std::uint8_t vec_sz_v,
+          std::uint8_t n_vecs_v>
 struct BinaryContigHyperparameterSetEntry
     : std::conjunction<std::is_same<Ty1, ArgTy1>, std::is_same<Ty2, ArgTy2>>
 {
-    static constexpr unsigned int vec_sz = vec_sz_v;
-    static constexpr unsigned int n_vecs = n_vecs_v;
+    static constexpr std::uint8_t vec_sz = vec_sz_v;
+    static constexpr std::uint8_t n_vecs = n_vecs_v;
 };
 
 template <typename Ty,
           typename ArgTy,
-          unsigned int vec_sz_v,
-          unsigned int n_vecs_v>
+          std::uint8_t vec_sz_v,
+          std::uint8_t n_vecs_v>
 struct UnaryContigHyperparameterSetEntry : std::is_same<Ty, ArgTy>
 {
-    static constexpr unsigned int vec_sz = vec_sz_v;
-    static constexpr unsigned int n_vecs = n_vecs_v;
+    static constexpr std::uint8_t vec_sz = vec_sz_v;
+    static constexpr std::uint8_t n_vecs = n_vecs_v;
 };
 
-template <unsigned int vec_sz_v, unsigned int n_vecs_v>
+template <std::uint8_t vec_sz_v, std::uint8_t n_vecs_v>
 struct ContigHyperparameterSetDefault : std::true_type
 {
-    static constexpr unsigned int vec_sz = vec_sz_v;
-    static constexpr unsigned int n_vecs = n_vecs_v;
+    static constexpr std::uint8_t vec_sz = vec_sz_v;
+    static constexpr std::uint8_t n_vecs = n_vecs_v;
 };
 
 } // end of namespace vec_size_utils
diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
index 3ad465db6a..cfeac18375 100644
--- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
@@ -421,6 +421,72 @@ struct Identity<Op, T, std::enable_if_t<UseBuiltInIdentity<Op, T>::value>>
     static constexpr T value = sycl::known_identity<Op, T>::value;
 };
 
+// Sub-group load/store
+
+#if defined(SYCL_EXT_ONEAPI_GROUP_LOAD_STORE)
+namespace ls_ns = sycl::ext::oneapi::experimental;
+#endif
+
+template <std::uint8_t vec_sz,
+          sycl::access::address_space Space,
+          sycl::access::decorated DecorateAddress,
+          typename ElementType>
+auto sub_group_load(const sycl::sub_group &sg,
+                    sycl::multi_ptr<ElementType, Space, DecorateAddress> m_ptr)
+{
+#if defined(SYCL_EXT_ONEAPI_GROUP_LOAD_STORE)
+    sycl::vec<ElementType, vec_sz> x;
+    ls_ns::group_load(sg, m_ptr, x);
+    return x;
+#else
+    return sg.load<vec_sz>(m_ptr);
+#endif
+}
+
+template <sycl::access::address_space Space,
+          sycl::access::decorated DecorateAddress,
+          typename ElementType>
+auto sub_group_load(const sycl::sub_group &sg,
+                    sycl::multi_ptr<ElementType, Space, DecorateAddress> m_ptr)
+{
+#if defined(SYCL_EXT_ONEAPI_GROUP_LOAD_STORE)
+    ElementType x;
+    ls_ns::group_load(sg, m_ptr, x);
+    return x;
+#else
+    return sg.load(m_ptr);
+#endif
+}
+
+template <std::uint8_t vec_sz,
+          sycl::access::address_space Space,
+          sycl::access::decorated DecorateAddress,
+          typename ElementType>
+void sub_group_store(const sycl::sub_group &sg,
+                     const sycl::vec<ElementType, vec_sz> &val,
+                     sycl::multi_ptr<ElementType, Space, DecorateAddress> m_ptr)
+{
+#if defined(SYCL_EXT_ONEAPI_GROUP_LOAD_STORE)
+    ls_ns::group_store(sg, val, m_ptr);
+#else
+    sg.store<vec_sz>(m_ptr, val);
+#endif
+}
+
+template <sycl::access::address_space Space,
+          sycl::access::decorated DecorateAddress,
+          typename ElementType>
+void sub_group_store(const sycl::sub_group &sg,
+                     const ElementType &val,
+                     sycl::multi_ptr<ElementType, Space, DecorateAddress> m_ptr)
+{
+#if defined(SYCL_EXT_ONEAPI_GROUP_LOAD_STORE)
+    ls_ns::group_store(sg, val, m_ptr);
+#else
+    sg.store(m_ptr, val);
+#endif
+}
+
 } // namespace sycl_utils
 } // namespace tensor
 } // namespace dpctl

From 7d22eacaeb42371ea44a639569e1f53c00812471 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Mon, 18 Nov 2024 12:11:21 -0600
Subject: [PATCH 09/13] Replace sg.load/sg.store with
 sub_group_load/sub_group_store utilities

---
 .../tensor/libtensor/include/kernels/clip.hpp | 69 +++++++++---------
 .../libtensor/include/kernels/where.hpp       | 72 ++++++++++---------
 2 files changed, 75 insertions(+), 66 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/clip.hpp b/dpctl/tensor/libtensor/include/kernels/clip.hpp
index 7b422c1281..66bedfd1cd 100644
--- a/dpctl/tensor/libtensor/include/kernels/clip.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/clip.hpp
@@ -33,6 +33,7 @@
 #include "kernels/alignment.hpp"
 #include "utils/math_utils.hpp"
 #include "utils/offset_utils.hpp"
+#include "utils/sycl_utils.hpp"
 #include "utils/type_utils.hpp"
 
 namespace dpctl
@@ -51,6 +52,9 @@ using dpctl::tensor::kernels::alignment_utils::
 using dpctl::tensor::kernels::alignment_utils::is_aligned;
 using dpctl::tensor::kernels::alignment_utils::required_alignment;
 
+using dpctl::tensor::sycl_utils::sub_group_load;
+using dpctl::tensor::sycl_utils::sub_group_store;
+
 template <typename T> T clip(const T &x, const T &min, const T &max)
 {
     using dpctl::tensor::type_utils::is_complex;
@@ -75,8 +79,8 @@ template <typename T> T clip(const T &x, const T &min, const T &max)
 }
 
 template <typename T,
-          int vec_sz = 4,
-          int n_vecs = 2,
+          std::uint8_t vec_sz = 4,
+          std::uint8_t n_vecs = 2,
           bool enable_sg_loadstore = true>
 class ClipContigFunctor
 {
@@ -100,37 +104,36 @@ class ClipContigFunctor
 
     void operator()(sycl::nd_item<1> ndit) const
     {
+        constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz;
+
         using dpctl::tensor::type_utils::is_complex;
         if constexpr (is_complex<T>::value || !enable_sg_loadstore) {
-            std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0];
-            size_t base = ndit.get_global_linear_id();
-
-            base = (base / sgSize) * sgSize * n_vecs * vec_sz + (base % sgSize);
-            for (size_t offset = base;
-                 offset < std::min(nelems, base + sgSize * (n_vecs * vec_sz));
-                 offset += sgSize)
-            {
+            const std::uint16_t sgSize =
+                ndit.get_sub_group().get_local_range()[0];
+            const size_t gid = ndit.get_global_linear_id();
+            const std::uint16_t nelems_per_sg = sgSize * nelems_per_wi;
+
+            const size_t start =
+                (gid / sgSize) * (nelems_per_sg - sgSize) + gid;
+            const size_t end = std::min(nelems, start + nelems_per_sg);
+
+            for (size_t offset = start; offset < end; offset += sgSize) {
                 dst_p[offset] = clip(x_p[offset], min_p[offset], max_p[offset]);
             }
         }
         else {
             auto sg = ndit.get_sub_group();
-            std::uint8_t sgSize = sg.get_local_range()[0];
-            std::uint8_t max_sgSize = sg.get_max_local_range()[0];
-            size_t base = n_vecs * vec_sz *
-                          (ndit.get_group(0) * ndit.get_local_range(0) +
-                           sg.get_group_id()[0] * max_sgSize);
-
-            if (base + n_vecs * vec_sz * sgSize < nelems &&
-                sgSize == max_sgSize)
-            {
-                sycl::vec<T, vec_sz> x_vec;
-                sycl::vec<T, vec_sz> min_vec;
-                sycl::vec<T, vec_sz> max_vec;
+            const std::uint16_t sgSize = sg.get_max_local_range()[0];
+
+            const size_t base =
+                nelems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
+                                 sg.get_group_id()[0] * sgSize);
+
+            if (base + nelems_per_wi * sgSize < nelems) {
                 sycl::vec<T, vec_sz> dst_vec;
 #pragma unroll
                 for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
-                    auto idx = base + it * sgSize;
+                    const size_t idx = base + it * sgSize;
                     auto x_multi_ptr = sycl::address_space_cast<
                         sycl::access::address_space::global_space,
                         sycl::access::decorated::yes>(&x_p[idx]);
@@ -144,21 +147,23 @@ class ClipContigFunctor
                         sycl::access::address_space::global_space,
                         sycl::access::decorated::yes>(&dst_p[idx]);
 
-                    x_vec = sg.load<vec_sz>(x_multi_ptr);
-                    min_vec = sg.load<vec_sz>(min_multi_ptr);
-                    max_vec = sg.load<vec_sz>(max_multi_ptr);
+                    const sycl::vec<T, vec_sz> x_vec =
+                        sub_group_load<vec_sz>(sg, x_multi_ptr);
+                    const sycl::vec<T, vec_sz> min_vec =
+                        sub_group_load<vec_sz>(sg, min_multi_ptr);
+                    const sycl::vec<T, vec_sz> max_vec =
+                        sub_group_load<vec_sz>(sg, max_multi_ptr);
 #pragma unroll
                     for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) {
                         dst_vec[vec_id] = clip(x_vec[vec_id], min_vec[vec_id],
                                                max_vec[vec_id]);
                     }
-                    sg.store<vec_sz>(dst_multi_ptr, dst_vec);
+                    sub_group_store<vec_sz>(sg, dst_vec, dst_multi_ptr);
                 }
             }
             else {
-                for (size_t k = base + sg.get_local_id()[0]; k < nelems;
-                     k += sgSize)
-                {
+                const size_t lane_id = sg.get_local_id()[0];
+                for (size_t k = base + lane_id; k < nelems; k += sgSize) {
                     dst_p[k] = clip(x_p[k], min_p[k], max_p[k]);
                 }
             }
@@ -195,8 +200,8 @@ sycl::event clip_contig_impl(sycl::queue &q,
         cgh.depends_on(depends);
 
         size_t lws = 64;
-        constexpr unsigned int vec_sz = 4;
-        constexpr unsigned int n_vecs = 2;
+        constexpr std::uint8_t vec_sz = 4;
+        constexpr std::uint8_t n_vecs = 2;
         const size_t n_groups =
             ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
         const auto gws_range = sycl::range<1>(n_groups * lws);
diff --git a/dpctl/tensor/libtensor/include/kernels/where.hpp b/dpctl/tensor/libtensor/include/kernels/where.hpp
index b356c256c3..dbf3fdfedf 100644
--- a/dpctl/tensor/libtensor/include/kernels/where.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/where.hpp
@@ -32,6 +32,7 @@
 #include "dpctl_tensor_types.hpp"
 #include "kernels/alignment.hpp"
 #include "utils/offset_utils.hpp"
+#include "utils/sycl_utils.hpp"
 #include "utils/type_utils.hpp"
 
 namespace dpctl
@@ -50,15 +51,18 @@ using dpctl::tensor::kernels::alignment_utils::
 using dpctl::tensor::kernels::alignment_utils::is_aligned;
 using dpctl::tensor::kernels::alignment_utils::required_alignment;
 
+using dpctl::tensor::sycl_utils::sub_group_load;
+using dpctl::tensor::sycl_utils::sub_group_store;
+
 template <typename T, typename condT, typename IndexerT>
 class where_strided_kernel;
-template <typename T, typename condT, int vec_sz, int n_vecs>
+template <typename T, typename condT, std::uint8_t vec_sz, std::uint8_t n_vecs>
 class where_contig_kernel;
 
 template <typename T,
           typename condT,
-          int vec_sz = 4,
-          int n_vecs = 2,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 class WhereContigFunctor
 {
@@ -82,42 +86,40 @@ class WhereContigFunctor
 
     void operator()(sycl::nd_item<1> ndit) const
     {
+        constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz;
+
         using dpctl::tensor::type_utils::is_complex;
         if constexpr (!enable_sg_loadstore || is_complex<condT>::value ||
                       is_complex<T>::value)
         {
-            std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0];
-            size_t base = ndit.get_global_linear_id();
-
-            base = (base / sgSize) * sgSize * n_vecs * vec_sz + (base % sgSize);
-            for (size_t offset = base;
-                 offset < std::min(nelems, base + sgSize * (n_vecs * vec_sz));
-                 offset += sgSize)
-            {
+            const std::uint16_t sgSize =
+                ndit.get_sub_group().get_local_range()[0];
+            const size_t gid = ndit.get_global_linear_id();
+
+            const std::uint16_t nelems_per_sg = sgSize * nelems_per_wi;
+            const size_t start =
+                (gid / sgSize) * (nelems_per_sg - sgSize) + gid;
+            const size_t end = std::min(nelems, start + nelems_per_sg);
+            for (size_t offset = start; offset < end; offset += sgSize) {
                 using dpctl::tensor::type_utils::convert_impl;
-                bool check = convert_impl<bool, condT>(cond_p[offset]);
+                const bool check = convert_impl<bool, condT>(cond_p[offset]);
                 dst_p[offset] = check ? x1_p[offset] : x2_p[offset];
             }
         }
         else {
             auto sg = ndit.get_sub_group();
-            std::uint8_t sgSize = sg.get_local_range()[0];
-            std::uint8_t max_sgSize = sg.get_max_local_range()[0];
-            size_t base = n_vecs * vec_sz *
-                          (ndit.get_group(0) * ndit.get_local_range(0) +
-                           sg.get_group_id()[0] * max_sgSize);
-
-            if (base + n_vecs * vec_sz * sgSize < nelems &&
-                sgSize == max_sgSize)
-            {
+            const std::uint16_t sgSize = sg.get_max_local_range()[0];
+
+            const size_t base =
+                nelems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
+                                 sg.get_group_id()[0] * sgSize);
+
+            if (base + nelems_per_wi * sgSize < nelems) {
                 sycl::vec<T, vec_sz> dst_vec;
-                sycl::vec<T, vec_sz> x1_vec;
-                sycl::vec<T, vec_sz> x2_vec;
-                sycl::vec<condT, vec_sz> cond_vec;
 
 #pragma unroll
                 for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
-                    auto idx = base + it * sgSize;
+                    const size_t idx = base + it * sgSize;
                     auto x1_multi_ptr = sycl::address_space_cast<
                         sycl::access::address_space::global_space,
                         sycl::access::decorated::yes>(&x1_p[idx]);
@@ -131,20 +133,22 @@ class WhereContigFunctor
                         sycl::access::address_space::global_space,
                         sycl::access::decorated::yes>(&dst_p[idx]);
 
-                    x1_vec = sg.load<vec_sz>(x1_multi_ptr);
-                    x2_vec = sg.load<vec_sz>(x2_multi_ptr);
-                    cond_vec = sg.load<vec_sz>(cond_multi_ptr);
+                    const sycl::vec<T, vec_sz> x1_vec =
+                        sub_group_load<vec_sz>(sg, x1_multi_ptr);
+                    const sycl::vec<T, vec_sz> x2_vec =
+                        sub_group_load<vec_sz>(sg, x2_multi_ptr);
+                    const sycl::vec<condT, vec_sz> cond_vec =
+                        sub_group_load<vec_sz>(sg, cond_multi_ptr);
 #pragma unroll
                     for (std::uint8_t k = 0; k < vec_sz; ++k) {
                         dst_vec[k] = cond_vec[k] ? x1_vec[k] : x2_vec[k];
                     }
-                    sg.store<vec_sz>(dst_multi_ptr, dst_vec);
+                    sub_group_store<vec_sz>(sg, dst_vec, dst_multi_ptr);
                 }
             }
             else {
-                for (size_t k = base + sg.get_local_id()[0]; k < nelems;
-                     k += sgSize)
-                {
+                const size_t lane_id = sg.get_local_id()[0];
+                for (size_t k = base + lane_id; k < nelems; k += sgSize) {
                     dst_p[k] = cond_p[k] ? x1_p[k] : x2_p[k];
                 }
             }
@@ -179,8 +183,8 @@ sycl::event where_contig_impl(sycl::queue &q,
         cgh.depends_on(depends);
 
         size_t lws = 64;
-        constexpr unsigned int vec_sz = 4;
-        constexpr unsigned int n_vecs = 2;
+        constexpr std::uint8_t vec_sz = 4u;
+        constexpr std::uint8_t n_vecs = 2u;
         const size_t n_groups =
             ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
         const auto gws_range = sycl::range<1>(n_groups * lws);

From 8809be24ef04ab902f1f47bc8269ba4f3d594023 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Mon, 18 Nov 2024 13:23:52 -0600
Subject: [PATCH 10/13] Replaced missed instance of sg.store with
 sub_group_store

---
 .../tensor/libtensor/include/kernels/copy_as_contiguous.hpp  | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp b/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp
index 7b95016db3..9a138c8f13 100644
--- a/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp
@@ -31,6 +31,7 @@
 #include "dpctl_tensor_types.hpp"
 #include "kernels/alignment.hpp"
 #include "utils/offset_utils.hpp"
+#include "utils/sycl_utils.hpp"
 #include "utils/type_utils.hpp"
 
 namespace dpctl
@@ -42,6 +43,8 @@ namespace kernels
 namespace copy_as_contig
 {
 
+using dpctl::tensor::sycl_utils::sub_group_store;
+
 template <typename T,
           typename IndexerT,
           std::uint8_t vec_sz = 4u,
@@ -113,7 +116,7 @@ class CopyAsCContigFunctor
                         const ssize_t src_offset = src_indexer(elem_id);
                         dst_vec[k] = src_p[src_offset];
                     }
-                    sg.store<vec_sz>(dst_multi_ptr, dst_vec);
+                    sub_group_store<vec_sz>(sg, dst_vec, dst_multi_ptr);
                 }
             }
             else {

From 57065ff9f857246e43f3280efd88018ed68364a4 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Mon, 18 Nov 2024 15:01:37 -0600
Subject: [PATCH 11/13] Define USE_GROUP_LOAD_STORE variable

Predicate use of experimental extension on this variable being set.

Since use of this experimental extension, as implemented by oneAPI
DPC++ 2025.0.0, causes test failures in `dpctl`, the use of this
extension is turned off for DPC++ 2025.0.0
---
 .../libtensor/include/utils/sycl_utils.hpp    | 57 ++++++++++++++-----
 1 file changed, 42 insertions(+), 15 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
index cfeac18375..6cd3c3429e 100644
--- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
@@ -423,7 +423,20 @@ struct Identity<Op, T, std::enable_if_t<UseBuiltInIdentity<Op, T>::value>>
 
 // Sub-group load/store
 
-#if defined(SYCL_EXT_ONEAPI_GROUP_LOAD_STORE)
+#ifndef USE_GROUP_LOAD_STORE
+#if defined(SYCL_EXT_ONEAPI_GROUP_LOAD_STORE) &&                               \
+    SYCL_EXT_ONEAPI_GROUP_LOAD_STORE
+#define USE_GROUP_LOAD_STORE 1
+#else
+#if defined(__INTEL_LLVM_COMPILER) && (__INTEL_LLVM_COMIPLER > 20250000u)
+#define USE_GROUP_LOAD_STORE 1
+#else
+#define USE_GROUP_LOAD_STORE 0
+#endif
+#endif
+#endif
+
+#if (USE_GROUP_LOAD_STORE)
 namespace ls_ns = sycl::ext::oneapi::experimental;
 #endif
 
@@ -434,8 +447,9 @@ template <std::uint8_t vec_sz,
 auto sub_group_load(const sycl::sub_group &sg,
                     sycl::multi_ptr<ElementType, Space, DecorateAddress> m_ptr)
 {
-#if defined(SYCL_EXT_ONEAPI_GROUP_LOAD_STORE)
-    sycl::vec<ElementType, vec_sz> x;
+#if (USE_GROUP_LOAD_STORE)
+    using ValueT = typename std::remove_cv_t<ElementType>;
+    sycl::vec<ValueT, vec_sz> x{};
     ls_ns::group_load(sg, m_ptr, x);
     return x;
 #else
@@ -449,8 +463,9 @@ template <sycl::access::address_space Space,
 auto sub_group_load(const sycl::sub_group &sg,
                     sycl::multi_ptr<ElementType, Space, DecorateAddress> m_ptr)
 {
-#if defined(SYCL_EXT_ONEAPI_GROUP_LOAD_STORE)
-    ElementType x;
+#if (USE_GROUP_LOAD_STORE)
+    using ValueT = typename std::remove_cv_t<ElementType>;
+    ValueT x{};
     ls_ns::group_load(sg, m_ptr, x);
     return x;
 #else
@@ -461,29 +476,41 @@ auto sub_group_load(const sycl::sub_group &sg,
 template <std::uint8_t vec_sz,
           sycl::access::address_space Space,
           sycl::access::decorated DecorateAddress,
+          typename VecT,
           typename ElementType>
-void sub_group_store(const sycl::sub_group &sg,
-                     const sycl::vec<ElementType, vec_sz> &val,
-                     sycl::multi_ptr<ElementType, Space, DecorateAddress> m_ptr)
-{
-#if defined(SYCL_EXT_ONEAPI_GROUP_LOAD_STORE)
+std::enable_if_t<
+    std::is_same_v<std::remove_cv_t<ElementType>, std::remove_cv_t<VecT>>,
+    void>
+sub_group_store(const sycl::sub_group &sg,
+                const sycl::vec<VecT, vec_sz> &val,
+                sycl::multi_ptr<ElementType, Space, DecorateAddress> m_ptr)
+{
+#if (USE_GROUP_LOAD_STORE)
     ls_ns::group_store(sg, val, m_ptr);
+    return;
 #else
     sg.store<vec_sz>(m_ptr, val);
+    return;
 #endif
 }
 
 template <sycl::access::address_space Space,
           sycl::access::decorated DecorateAddress,
+          typename VecT,
           typename ElementType>
-void sub_group_store(const sycl::sub_group &sg,
-                     const ElementType &val,
-                     sycl::multi_ptr<ElementType, Space, DecorateAddress> m_ptr)
-{
-#if defined(SYCL_EXT_ONEAPI_GROUP_LOAD_STORE)
+std::enable_if_t<
+    std::is_same_v<std::remove_cv_t<ElementType>, std::remove_cv_t<VecT>>,
+    void>
+sub_group_store(const sycl::sub_group &sg,
+                const VecT &val,
+                sycl::multi_ptr<ElementType, Space, DecorateAddress> m_ptr)
+{
+#if (USE_GROUP_LOAD_STORE)
     ls_ns::group_store(sg, val, m_ptr);
+    return;
 #else
     sg.store(m_ptr, val);
+    return;
 #endif
 }
 

From 2531261b1001bf718d7eca322b0604a7080ca3ca Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Tue, 19 Nov 2024 12:32:33 -0800
Subject: [PATCH 12/13] Bump minimal version of __INTEL_LLVM_COMPILER needed to
 enable group_load/group_store

---
 .../include/kernels/copy_as_contiguous.hpp          | 13 +++++++------
 dpctl/tensor/libtensor/include/utils/sycl_utils.hpp | 11 ++++++-----
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp b/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp
index 9a138c8f13..1a44946cc4 100644
--- a/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp
@@ -77,12 +77,12 @@ class CopyAsCContigFunctor
         using dpctl::tensor::type_utils::is_complex;
         if constexpr (!enable_sg_loadstore || is_complex<T>::value) {
             const std::uint16_t sgSize =
-                ndit.get_sub_group().get_local_range()[0];
+                ndit.get_sub_group().get_max_local_range()[0];
             const std::size_t gid = ndit.get_global_linear_id();
 
             // start = (gid / sgSize) * sgSize * elems_per_wi + (gid % sgSize)
             // gid % sgSize == gid - (gid / sgSize) * sgSize
-            const std::size_t elems_per_sg = sgSize * elems_per_wi;
+            const std::uint16_t elems_per_sg = sgSize * elems_per_wi;
             const std::size_t start =
                 (gid / sgSize) * (elems_per_sg - sgSize) + gid;
             const std::size_t end = std::min(nelems, start + elems_per_sg);
@@ -98,20 +98,21 @@ class CopyAsCContigFunctor
             const size_t base =
                 elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
                                 sg.get_group_id()[0] * sgSize);
+            const std::uint16_t elems_per_sg = elems_per_wi * sgSize;
 
-            if (base + elems_per_wi * sgSize < nelems) {
-                sycl::vec<T, vec_sz> dst_vec;
-
+            if (base + elems_per_sg < nelems) {
 #pragma unroll
                 for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
+                    // it == vec_id * vec_sz, for  0 <= vec_id < n_vecs
                     const size_t block_start_id = base + it * sgSize;
                     auto dst_multi_ptr = sycl::address_space_cast<
                         sycl::access::address_space::global_space,
                         sycl::access::decorated::yes>(&dst_p[block_start_id]);
 
                     const size_t elem_id0 = block_start_id + sg.get_local_id();
+                    sycl::vec<T, vec_sz> dst_vec;
 #pragma unroll
-                    for (std::uint8_t k = 0; k < vec_sz; k++) {
+                    for (std::uint8_t k = 0; k < vec_sz; ++k) {
                         const size_t elem_id = elem_id0 + k * sgSize;
                         const ssize_t src_offset = src_indexer(elem_id);
                         dst_vec[k] = src_p[src_offset];
diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
index 6cd3c3429e..de19334dde 100644
--- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
@@ -428,7 +428,7 @@ struct Identity<Op, T, std::enable_if_t<UseBuiltInIdentity<Op, T>::value>>
     SYCL_EXT_ONEAPI_GROUP_LOAD_STORE
 #define USE_GROUP_LOAD_STORE 1
 #else
-#if defined(__INTEL_LLVM_COMPILER) && (__INTEL_LLVM_COMIPLER > 20250000u)
+#if defined(__INTEL_LLVM_COMPILER) && (__INTEL_LLVM_COMPILER >= 20250100u)
 #define USE_GROUP_LOAD_STORE 1
 #else
 #define USE_GROUP_LOAD_STORE 0
@@ -450,7 +450,7 @@ auto sub_group_load(const sycl::sub_group &sg,
 #if (USE_GROUP_LOAD_STORE)
     using ValueT = typename std::remove_cv_t<ElementType>;
     sycl::vec<ValueT, vec_sz> x{};
-    ls_ns::group_load(sg, m_ptr, x);
+    ls_ns::group_load(sg, m_ptr, x, ls_ns::data_placement_blocked);
     return x;
 #else
     return sg.load<vec_sz>(m_ptr);
@@ -466,7 +466,7 @@ auto sub_group_load(const sycl::sub_group &sg,
 #if (USE_GROUP_LOAD_STORE)
     using ValueT = typename std::remove_cv_t<ElementType>;
     ValueT x{};
-    ls_ns::group_load(sg, m_ptr, x);
+    ls_ns::group_load(sg, m_ptr, x, ls_ns::data_placement_blocked);
     return x;
 #else
     return sg.load(m_ptr);
@@ -486,7 +486,8 @@ sub_group_store(const sycl::sub_group &sg,
                 sycl::multi_ptr<ElementType, Space, DecorateAddress> m_ptr)
 {
 #if (USE_GROUP_LOAD_STORE)
-    ls_ns::group_store(sg, val, m_ptr);
+    static_assert(std::is_same_v<VecT, ElementType>);
+    ls_ns::group_store(sg, val, m_ptr, ls_ns::data_placement_blocked);
     return;
 #else
     sg.store<vec_sz>(m_ptr, val);
@@ -506,7 +507,7 @@ sub_group_store(const sycl::sub_group &sg,
                 sycl::multi_ptr<ElementType, Space, DecorateAddress> m_ptr)
 {
 #if (USE_GROUP_LOAD_STORE)
-    ls_ns::group_store(sg, val, m_ptr);
+    ls_ns::group_store(sg, val, m_ptr, ls_ns::data_placement_blocked);
     return;
 #else
     sg.store(m_ptr, val);

From 88c3e1a5455190f6a685b00f7de6a3023be9c806 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Thu, 21 Nov 2024 16:05:30 -0600
Subject: [PATCH 13/13] Do not use group_load/group_store with 2025.1.0 just
 yet

---
 dpctl/tensor/libtensor/include/utils/sycl_utils.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
index de19334dde..19be8645c9 100644
--- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
@@ -428,7 +428,7 @@ struct Identity<Op, T, std::enable_if_t<UseBuiltInIdentity<Op, T>::value>>
     SYCL_EXT_ONEAPI_GROUP_LOAD_STORE
 #define USE_GROUP_LOAD_STORE 1
 #else
-#if defined(__INTEL_LLVM_COMPILER) && (__INTEL_LLVM_COMPILER >= 20250100u)
+#if defined(__INTEL_LLVM_COMPILER) && (__INTEL_LLVM_COMPILER > 20250100u)
 #define USE_GROUP_LOAD_STORE 1
 #else
 #define USE_GROUP_LOAD_STORE 0