From 20a1966a08fd4d129ac3307a734d14de5e519ac1 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Mon, 21 Oct 2024 05:26:18 -0500
Subject: [PATCH 1/9] Do not use sg.get_local_range

Use sg.get_max_local_range instead. The `sg.get_local_range` must perform
lots of checks to determine if this is the last trailing sub-group in the
work-group and its actual size may be smaller. We set the local work-group
size to be 128, which is a multiple of any sub-group size, and hence
get_local_range() always equals to get_max_local_raneg().
---
 .../kernels/elementwise_functions/common.hpp  | 58 +++++++------------
 .../elementwise_functions/common_inplace.hpp  | 18 +++---
 2 files changed, 29 insertions(+), 47 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
index ee955dcde5..526bbb7bde 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
@@ -72,7 +72,7 @@ struct UnaryContigFunctor
     {
         UnaryOperatorT op{};
         /* Each work-item processes vec_sz elements, contiguous in memory */
-        /* NOTE: vec_sz must divide sg.max_local_range()[0] */
+        /* NOTE: work-group size must be divisible by sub-group size */
 
         if constexpr (enable_sg_loadstore && UnaryOperatorT::is_constant::value)
         {
@@ -80,14 +80,11 @@ struct UnaryContigFunctor
             constexpr resT const_val = UnaryOperatorT::constant_value;
 
             auto sg = ndit.get_sub_group();
-            std::uint8_t sgSize = sg.get_local_range()[0];
-            std::uint8_t max_sgSize = sg.get_max_local_range()[0];
+            std::uint8_t sgSize = sg.get_max_local_range()[0];
             size_t base = n_vecs * vec_sz *
                           (ndit.get_group(0) * ndit.get_local_range(0) +
                            sg.get_group_id()[0] * sgSize);
-            if (base + n_vecs * vec_sz * sgSize < nelems_ &&
-                max_sgSize == sgSize)
-            {
+            if (base + n_vecs * vec_sz * sgSize < nelems_) {
                 sycl::vec<resT, vec_sz> res_vec(const_val);
 #pragma unroll
                 for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
@@ -113,14 +110,11 @@ struct UnaryContigFunctor
                            UnaryOperatorT::supports_vec::value)
         {
             auto sg = ndit.get_sub_group();
-            std::uint16_t sgSize = sg.get_local_range()[0];
-            std::uint16_t max_sgSize = sg.get_max_local_range()[0];
+            std::uint16_t sgSize = sg.get_max_local_range()[0];
             size_t base = n_vecs * vec_sz *
                           (ndit.get_group(0) * ndit.get_local_range(0) +
-                           sg.get_group_id()[0] * max_sgSize);
-            if (base + n_vecs * vec_sz * sgSize < nelems_ &&
-                sgSize == max_sgSize)
-            {
+                           sg.get_group_id()[0] * sgSize);
+            if (base + n_vecs * vec_sz * sgSize < nelems_) {
                 sycl::vec<argT, vec_sz> x;
 
 #pragma unroll
@@ -155,15 +149,12 @@ struct UnaryContigFunctor
             // default: use scalar-value function
 
             auto sg = ndit.get_sub_group();
-            std::uint8_t sgSize = sg.get_local_range()[0];
-            std::uint8_t maxsgSize = sg.get_max_local_range()[0];
+            std::uint8_t sgSize = sg.get_max_local_range()[0];
             size_t base = n_vecs * vec_sz *
                           (ndit.get_group(0) * ndit.get_local_range(0) +
-                           sg.get_group_id()[0] * maxsgSize);
+                           sg.get_group_id()[0] * sgSize);
 
-            if ((base + n_vecs * vec_sz * sgSize < nelems_) &&
-                (maxsgSize == sgSize))
-            {
+            if (base + n_vecs * vec_sz * sgSize < nelems_) {
                 sycl::vec<argT, vec_sz> arg_vec;
 
 #pragma unroll
@@ -199,15 +190,12 @@ struct UnaryContigFunctor
             // default: use scalar-value function
 
             auto sg = ndit.get_sub_group();
-            std::uint8_t sgSize = sg.get_local_range()[0];
-            std::uint8_t maxsgSize = sg.get_max_local_range()[0];
+            std::uint8_t sgSize = sg.get_max_local_range()[0];
             size_t base = n_vecs * vec_sz *
                           (ndit.get_group(0) * ndit.get_local_range(0) +
-                           sg.get_group_id()[0] * maxsgSize);
+                           sg.get_group_id()[0] * sgSize);
 
-            if ((base + n_vecs * vec_sz * sgSize < nelems_) &&
-                (maxsgSize == sgSize))
-            {
+            if (base + n_vecs * vec_sz * sgSize < nelems_) {
                 sycl::vec<argT, vec_sz> arg_vec;
                 sycl::vec<resT, vec_sz> res_vec;
 
@@ -406,22 +394,20 @@ struct BinaryContigFunctor
     {
         BinaryOperatorT op{};
         /* Each work-item processes vec_sz elements, contiguous in memory */
+        /* NOTE: work-group size must be divisible by sub-group size */
 
         if constexpr (enable_sg_loadstore &&
                       BinaryOperatorT::supports_sg_loadstore::value &&
                       BinaryOperatorT::supports_vec::value)
         {
             auto sg = ndit.get_sub_group();
-            std::uint8_t sgSize = sg.get_local_range()[0];
-            std::uint8_t maxsgSize = sg.get_max_local_range()[0];
+            std::uint8_t sgSize = sg.get_max_local_range()[0];
 
             size_t base = n_vecs * vec_sz *
                           (ndit.get_group(0) * ndit.get_local_range(0) +
                            sg.get_group_id()[0] * sgSize);
 
-            if ((base + n_vecs * vec_sz * sgSize < nelems_) &&
-                (sgSize == maxsgSize))
-            {
+            if (base + n_vecs * vec_sz * sgSize < nelems_) {
                 sycl::vec<argT1, vec_sz> arg1_vec;
                 sycl::vec<argT2, vec_sz> arg2_vec;
                 sycl::vec<resT, vec_sz> res_vec;
@@ -458,16 +444,13 @@ struct BinaryContigFunctor
                            BinaryOperatorT::supports_sg_loadstore::value)
         {
             auto sg = ndit.get_sub_group();
-            std::uint8_t sgSize = sg.get_local_range()[0];
-            std::uint8_t maxsgSize = sg.get_max_local_range()[0];
+            std::uint8_t sgSize = sg.get_max_local_range()[0];
 
             size_t base = n_vecs * vec_sz *
                           (ndit.get_group(0) * ndit.get_local_range(0) +
                            sg.get_group_id()[0] * sgSize);
 
-            if ((base + n_vecs * vec_sz * sgSize < nelems_) &&
-                (sgSize == maxsgSize))
-            {
+            if (base + n_vecs * vec_sz * sgSize < nelems_) {
                 sycl::vec<argT1, vec_sz> arg1_vec;
                 sycl::vec<argT2, vec_sz> arg2_vec;
                 sycl::vec<resT, vec_sz> res_vec;
@@ -582,13 +565,15 @@ struct BinaryContigMatrixContigRowBroadcastingFunctor
 
     void operator()(sycl::nd_item<1> ndit) const
     {
+        /* NOTE: work-group size must be divisible by sub-group size */
+
         BinaryOperatorT op{};
         static_assert(BinaryOperatorT::supports_sg_loadstore::value);
 
         auto sg = ndit.get_sub_group();
         size_t gid = ndit.get_global_linear_id();
 
-        std::uint8_t sgSize = sg.get_local_range()[0];
+        std::uint8_t sgSize = sg.get_max_local_range()[0];
         size_t base = gid - sg.get_local_id()[0];
 
         if (base + sgSize < n_elems) {
@@ -647,13 +632,14 @@ struct BinaryContigRowContigMatrixBroadcastingFunctor
 
     void operator()(sycl::nd_item<1> ndit) const
     {
+        /* NOTE: work-group size must be divisible by sub-group size */
         BinaryOperatorT op{};
         static_assert(BinaryOperatorT::supports_sg_loadstore::value);
 
         auto sg = ndit.get_sub_group();
         size_t gid = ndit.get_global_linear_id();
 
-        std::uint8_t sgSize = sg.get_local_range()[0];
+        std::uint8_t sgSize = sg.get_max_local_range()[0];
         size_t base = gid - sg.get_local_id()[0];
 
         if (base + sgSize < n_elems) {
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
index cbb079e3c5..8ece9ff910 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
@@ -73,22 +73,20 @@ struct BinaryInplaceContigFunctor
     {
         BinaryInplaceOperatorT op{};
         /* Each work-item processes vec_sz elements, contiguous in memory */
+        /* NB: Workgroup size must be divisible by sub-group size */
 
         if constexpr (enable_sg_loadstore &&
                       BinaryInplaceOperatorT::supports_sg_loadstore::value &&
                       BinaryInplaceOperatorT::supports_vec::value)
         {
             auto sg = ndit.get_sub_group();
-            std::uint8_t sgSize = sg.get_local_range()[0];
-            std::uint8_t maxsgSize = sg.get_max_local_range()[0];
+            std::uint8_t sgSize = sg.get_max_local_range()[0];
 
             size_t base = n_vecs * vec_sz *
                           (ndit.get_group(0) * ndit.get_local_range(0) +
                            sg.get_group_id()[0] * sgSize);
 
-            if ((base + n_vecs * vec_sz * sgSize < nelems_) &&
-                (sgSize == maxsgSize))
-            {
+            if (base + n_vecs * vec_sz * sgSize < nelems_) {
 
                 sycl::vec<argT, vec_sz> arg_vec;
                 sycl::vec<resT, vec_sz> res_vec;
@@ -121,16 +119,13 @@ struct BinaryInplaceContigFunctor
                            BinaryInplaceOperatorT::supports_sg_loadstore::value)
         {
             auto sg = ndit.get_sub_group();
-            std::uint8_t sgSize = sg.get_local_range()[0];
-            std::uint8_t maxsgSize = sg.get_max_local_range()[0];
+            std::uint8_t sgSize = sg.get_max_local_range()[0];
 
             size_t base = n_vecs * vec_sz *
                           (ndit.get_group(0) * ndit.get_local_range(0) +
                            sg.get_group_id()[0] * sgSize);
 
-            if ((base + n_vecs * vec_sz * sgSize < nelems_) &&
-                (sgSize == maxsgSize))
-            {
+            if (base + n_vecs * vec_sz * sgSize < nelems_) {
                 sycl::vec<argT, vec_sz> arg_vec;
                 sycl::vec<resT, vec_sz> res_vec;
 
@@ -228,13 +223,14 @@ struct BinaryInplaceRowMatrixBroadcastingFunctor
 
     void operator()(sycl::nd_item<1> ndit) const
     {
+        /* Workgroup size is expected to be a multiple of sub-group size */
         BinaryOperatorT op{};
         static_assert(BinaryOperatorT::supports_sg_loadstore::value);
 
         auto sg = ndit.get_sub_group();
         size_t gid = ndit.get_global_linear_id();
 
-        std::uint8_t sgSize = sg.get_local_range()[0];
+        std::uint8_t sgSize = sg.get_max_local_range()[0];
         size_t base = gid - sg.get_local_id()[0];
 
         if (base + sgSize < n_elems) {

From 9240d5be22b10e915dd2e921090b41b748b86423 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Mon, 21 Oct 2024 05:29:18 -0500
Subject: [PATCH 2/9] Add UL suffix literal integral value

---
 dpctl/tensor/libtensor/include/kernels/alignment.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/alignment.hpp b/dpctl/tensor/libtensor/include/kernels/alignment.hpp
index ff4541af4d..9ec14dd027 100644
--- a/dpctl/tensor/libtensor/include/kernels/alignment.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/alignment.hpp
@@ -30,7 +30,7 @@ namespace kernels
 namespace alignment_utils
 {
 
-static constexpr size_t required_alignment = 64;
+static constexpr size_t required_alignment = 64UL;
 
 template <std::uintptr_t alignment, typename Ptr> bool is_aligned(Ptr p)
 {

From f623209115a4e201ffbd05c797db512643e61f6c Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Thu, 24 Oct 2024 09:38:10 -0500
Subject: [PATCH 3/9] Clean-ups in binary/unary contig call operator

---
 .../kernels/elementwise_functions/common.hpp  | 100 +++++++++---------
 1 file changed, 50 insertions(+), 50 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
index 526bbb7bde..4c27fefa0f 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
@@ -70,6 +70,7 @@ struct UnaryContigFunctor
 
     void operator()(sycl::nd_item<1> ndit) const
     {
+        constexpr std::uint32_t elems_per_wi = n_vecs * vec_sz;
         UnaryOperatorT op{};
         /* Each work-item processes vec_sz elements, contiguous in memory */
         /* NOTE: work-group size must be divisible by sub-group size */
@@ -80,14 +81,15 @@ struct UnaryContigFunctor
             constexpr resT const_val = UnaryOperatorT::constant_value;
 
             auto sg = ndit.get_sub_group();
-            std::uint8_t sgSize = sg.get_max_local_range()[0];
-            size_t base = n_vecs * vec_sz *
+            std::uint32_t sgSize = sg.get_max_local_range()[0];
+
+            size_t base = static_cast<size_t>(elems_per_wi) *
                           (ndit.get_group(0) * ndit.get_local_range(0) +
                            sg.get_group_id()[0] * sgSize);
-            if (base + n_vecs * vec_sz * sgSize < nelems_) {
+            if (base + static_cast<size_t>(elems_per_wi * sgSize) < nelems_) {
                 sycl::vec<resT, vec_sz> res_vec(const_val);
 #pragma unroll
-                for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
+                for (std::uint32_t it = 0; it < elems_per_wi; it += vec_sz) {
                     size_t offset = base + static_cast<size_t>(it) *
                                                static_cast<size_t>(sgSize);
                     auto out_multi_ptr = sycl::address_space_cast<
@@ -98,9 +100,8 @@ struct UnaryContigFunctor
                 }
             }
             else {
-                for (size_t k = base + sg.get_local_id()[0]; k < nelems_;
-                     k += sgSize)
-                {
+                const size_t lane_id = sg.get_local_id()[0];
+                for (size_t k = base + lane_id; k < nelems_; k += sgSize) {
                     out[k] = const_val;
                 }
             }
@@ -110,15 +111,16 @@ struct UnaryContigFunctor
                            UnaryOperatorT::supports_vec::value)
         {
             auto sg = ndit.get_sub_group();
-            std::uint16_t sgSize = sg.get_max_local_range()[0];
-            size_t base = n_vecs * vec_sz *
+            std::uint32_t sgSize = sg.get_max_local_range()[0];
+
+            size_t base = static_cast<size_t>(elems_per_wi) *
                           (ndit.get_group(0) * ndit.get_local_range(0) +
                            sg.get_group_id()[0] * sgSize);
-            if (base + n_vecs * vec_sz * sgSize < nelems_) {
+            if (base + static_cast<size_t>(elems_per_wi * sgSize) < nelems_) {
                 sycl::vec<argT, vec_sz> x;
 
 #pragma unroll
-                for (std::uint16_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
+                for (std::uint32_t it = 0; it < elems_per_wi; it += vec_sz) {
                     size_t offset = base + static_cast<size_t>(it) *
                                                static_cast<size_t>(sgSize);
                     auto in_multi_ptr = sycl::address_space_cast<
@@ -134,9 +136,8 @@ struct UnaryContigFunctor
                 }
             }
             else {
-                for (size_t k = base + sg.get_local_id()[0]; k < nelems_;
-                     k += sgSize)
-                {
+                const size_t lane_id = sg.get_local_id()[0];
+                for (size_t k = base + lane_id; k < nelems_; k += sgSize) {
                     // scalar call
                     out[k] = op(in[k]);
                 }
@@ -149,16 +150,16 @@ struct UnaryContigFunctor
             // default: use scalar-value function
 
             auto sg = ndit.get_sub_group();
-            std::uint8_t sgSize = sg.get_max_local_range()[0];
-            size_t base = n_vecs * vec_sz *
+            std::uint32_t sgSize = sg.get_max_local_range()[0];
+            size_t base = static_cast<size_t>(elems_per_wi) *
                           (ndit.get_group(0) * ndit.get_local_range(0) +
                            sg.get_group_id()[0] * sgSize);
 
-            if (base + n_vecs * vec_sz * sgSize < nelems_) {
+            if (base + static_cast<size_t>(elems_per_wi * sgSize) < nelems_) {
                 sycl::vec<argT, vec_sz> arg_vec;
 
 #pragma unroll
-                for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
+                for (std::uint32_t it = 0; it < elems_per_wi; it += vec_sz) {
                     size_t offset = base + static_cast<size_t>(it) *
                                                static_cast<size_t>(sgSize);
                     auto in_multi_ptr = sycl::address_space_cast<
@@ -170,16 +171,15 @@ struct UnaryContigFunctor
 
                     arg_vec = sg.load<vec_sz>(in_multi_ptr);
 #pragma unroll
-                    for (std::uint8_t k = 0; k < vec_sz; ++k) {
+                    for (std::uint32_t k = 0; k < vec_sz; ++k) {
                         arg_vec[k] = op(arg_vec[k]);
                     }
                     sg.store<vec_sz>(out_multi_ptr, arg_vec);
                 }
             }
             else {
-                for (size_t k = base + sg.get_local_id()[0]; k < nelems_;
-                     k += sgSize)
-                {
+                const size_t lane_id = sg.get_local_id()[0];
+                for (size_t k = base + lane_id; k < nelems_; k += sgSize) {
                     out[k] = op(in[k]);
                 }
             }
@@ -190,17 +190,17 @@ struct UnaryContigFunctor
             // default: use scalar-value function
 
             auto sg = ndit.get_sub_group();
-            std::uint8_t sgSize = sg.get_max_local_range()[0];
-            size_t base = n_vecs * vec_sz *
+            std::uint32_t sgSize = sg.get_max_local_range()[0];
+            size_t base = static_cast<size_t>(elems_per_wi) *
                           (ndit.get_group(0) * ndit.get_local_range(0) +
                            sg.get_group_id()[0] * sgSize);
 
-            if (base + n_vecs * vec_sz * sgSize < nelems_) {
+            if (base + static_cast<size_t>(elems_per_wi * sgSize) < nelems_) {
                 sycl::vec<argT, vec_sz> arg_vec;
                 sycl::vec<resT, vec_sz> res_vec;
 
 #pragma unroll
-                for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
+                for (std::uint32_t it = 0; it < elems_per_wi; it += vec_sz) {
                     size_t offset = base + static_cast<size_t>(it) *
                                                static_cast<size_t>(sgSize);
                     auto in_multi_ptr = sycl::address_space_cast<
@@ -212,27 +212,27 @@ struct UnaryContigFunctor
 
                     arg_vec = sg.load<vec_sz>(in_multi_ptr);
 #pragma unroll
-                    for (std::uint8_t k = 0; k < vec_sz; ++k) {
+                    for (std::uint32_t k = 0; k < vec_sz; ++k) {
                         res_vec[k] = op(arg_vec[k]);
                     }
                     sg.store<vec_sz>(out_multi_ptr, res_vec);
                 }
             }
             else {
-                for (size_t k = base + sg.get_local_id()[0]; k < nelems_;
-                     k += sgSize)
-                {
+                const size_t lane_id = sg.get_local_id()[0];
+                for (size_t k = base + lane_id; k < nelems_; k += sgSize) {
                     out[k] = op(in[k]);
                 }
             }
         }
         else {
-            std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0];
+            size_t sgSize = ndit.get_sub_group().get_local_range()[0];
             size_t base = ndit.get_global_linear_id();
+            const size_t elems_per_sg = sgSize * elems_per_wi;
 
-            base = (base / sgSize) * sgSize * n_vecs * vec_sz + (base % sgSize);
+            base = (base / sgSize) * elems_per_sg + (base % sgSize);
             for (size_t offset = base;
-                 offset < std::min(nelems_, base + sgSize * (n_vecs * vec_sz));
+                 offset < std::min(nelems_, base + elems_per_sg);
                  offset += sgSize)
             {
                 out[offset] = op(in[offset]);
@@ -392,6 +392,7 @@ struct BinaryContigFunctor
 
     void operator()(sycl::nd_item<1> ndit) const
     {
+        constexpr std::uint32_t elems_per_wi = n_vecs * vec_sz;
         BinaryOperatorT op{};
         /* Each work-item processes vec_sz elements, contiguous in memory */
         /* NOTE: work-group size must be divisible by sub-group size */
@@ -401,19 +402,19 @@ struct BinaryContigFunctor
                       BinaryOperatorT::supports_vec::value)
         {
             auto sg = ndit.get_sub_group();
-            std::uint8_t sgSize = sg.get_max_local_range()[0];
+            std::uint16_t sgSize = sg.get_max_local_range()[0];
 
-            size_t base = n_vecs * vec_sz *
+            size_t base = static_cast<size_t>(elems_per_wi) *
                           (ndit.get_group(0) * ndit.get_local_range(0) +
                            sg.get_group_id()[0] * sgSize);
 
-            if (base + n_vecs * vec_sz * sgSize < nelems_) {
+            if (base + static_cast<size_t>(elems_per_wi * sgSize) < nelems_) {
                 sycl::vec<argT1, vec_sz> arg1_vec;
                 sycl::vec<argT2, vec_sz> arg2_vec;
                 sycl::vec<resT, vec_sz> res_vec;
 
 #pragma unroll
-                for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
+                for (std::uint32_t it = 0; it < elems_per_wi; it += vec_sz) {
                     size_t offset = base + static_cast<size_t>(it) *
                                                static_cast<size_t>(sgSize);
                     auto in1_multi_ptr = sycl::address_space_cast<
@@ -433,9 +434,8 @@ struct BinaryContigFunctor
                 }
             }
             else {
-                for (size_t k = base + sg.get_local_id()[0]; k < nelems_;
-                     k += sgSize)
-                {
+                const std::size_t lane_id = sg.get_local_id()[0];
+                for (size_t k = base + lane_id; k < nelems_; k += sgSize) {
                     out[k] = op(in1[k], in2[k]);
                 }
             }
@@ -446,17 +446,17 @@ struct BinaryContigFunctor
             auto sg = ndit.get_sub_group();
             std::uint8_t sgSize = sg.get_max_local_range()[0];
 
-            size_t base = n_vecs * vec_sz *
+            size_t base = static_cast<size_t>(elems_per_wi) *
                           (ndit.get_group(0) * ndit.get_local_range(0) +
                            sg.get_group_id()[0] * sgSize);
 
-            if (base + n_vecs * vec_sz * sgSize < nelems_) {
+            if (base + static_cast<size_t>(elems_per_wi * sgSize) < nelems_) {
                 sycl::vec<argT1, vec_sz> arg1_vec;
                 sycl::vec<argT2, vec_sz> arg2_vec;
                 sycl::vec<resT, vec_sz> res_vec;
 
 #pragma unroll
-                for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
+                for (std::uint32_t it = 0; it < elems_per_wi; it += vec_sz) {
                     size_t offset = base + static_cast<size_t>(it) *
                                                static_cast<size_t>(sgSize);
                     auto in1_multi_ptr = sycl::address_space_cast<
@@ -480,20 +480,20 @@ struct BinaryContigFunctor
                 }
             }
             else {
-                for (size_t k = base + sg.get_local_id()[0]; k < nelems_;
-                     k += sgSize)
-                {
+                const std::size_t lane_id = sg.get_local_id()[0];
+                for (size_t k = base + lane_id; k < nelems_; k += sgSize) {
                     out[k] = op(in1[k], in2[k]);
                 }
             }
         }
         else {
-            std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0];
-            size_t base = ndit.get_global_linear_id();
+            const size_t sgSize = ndit.get_sub_group().get_local_range()[0];
+            const size_t gid = ndit.get_global_linear_id();
+            const size_t elems_per_sg = sgSize * elems_per_wi;
 
-            base = (base / sgSize) * sgSize * n_vecs * vec_sz + (base % sgSize);
+            const size_t base = (gid / sgSize) * elems_per_sg + (gid % sgSize);
             for (size_t offset = base;
-                 offset < std::min(nelems_, base + sgSize * (n_vecs * vec_sz));
+                 offset < std::min(nelems_, base + elems_per_sg);
                  offset += sgSize)
             {
                 out[offset] = op(in1[offset], in2[offset]);

From 123590255e987d29cbaf439da2e2841e5281fd6c Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Thu, 24 Oct 2024 15:44:34 -0500
Subject: [PATCH 4/9] Change vec_sz and n_vecs settings for contiguous kernels

For short data types, each work-item may need to load several
elements to ensure that it uses all the data from cache-line.
For example, with simd32, we load 4 8-bit types (2 cache lines),
2 16-bit types, 1 32-bit and wider types.

n_vec is set to 1, to avoid cache thrashing due to second iteration
of some work-items beginning to access memory at higher addresses
while some work-items continue working on the lower addresses causing
cache evictions.

The size of the work-groups was increated from 128 to 256, which is
chosen so that all 8 threads of single vector with simd32 are used.
---
 .../kernels/elementwise_functions/abs.hpp     |  9 ++-
 .../kernels/elementwise_functions/acos.hpp    |  9 ++-
 .../kernels/elementwise_functions/acosh.hpp   |  9 ++-
 .../kernels/elementwise_functions/add.hpp     | 12 ++--
 .../kernels/elementwise_functions/angle.hpp   |  9 ++-
 .../kernels/elementwise_functions/asin.hpp    |  9 ++-
 .../kernels/elementwise_functions/asinh.hpp   |  9 ++-
 .../kernels/elementwise_functions/atan.hpp    |  9 ++-
 .../kernels/elementwise_functions/atan2.hpp   |  8 ++-
 .../kernels/elementwise_functions/atanh.hpp   |  9 ++-
 .../elementwise_functions/bitwise_and.hpp     | 12 ++--
 .../elementwise_functions/bitwise_invert.hpp  |  7 ++-
 .../bitwise_left_shift.hpp                    | 12 ++--
 .../elementwise_functions/bitwise_or.hpp      | 12 ++--
 .../bitwise_right_shift.hpp                   | 12 ++--
 .../elementwise_functions/bitwise_xor.hpp     | 12 ++--
 .../kernels/elementwise_functions/cbrt.hpp    |  8 ++-
 .../kernels/elementwise_functions/ceil.hpp    |  7 ++-
 .../kernels/elementwise_functions/common.hpp  | 56 +++++++++---------
 .../elementwise_functions/common_inplace.hpp  | 50 ++++++++--------
 .../kernels/elementwise_functions/conj.hpp    |  9 ++-
 .../elementwise_functions/copysign.hpp        |  8 ++-
 .../kernels/elementwise_functions/cos.hpp     |  9 ++-
 .../kernels/elementwise_functions/cosh.hpp    |  9 ++-
 .../kernels/elementwise_functions/equal.hpp   |  8 ++-
 .../kernels/elementwise_functions/exp.hpp     |  9 ++-
 .../kernels/elementwise_functions/exp2.hpp    |  9 ++-
 .../kernels/elementwise_functions/expm1.hpp   |  9 ++-
 .../kernels/elementwise_functions/floor.hpp   |  9 ++-
 .../elementwise_functions/floor_divide.hpp    | 12 ++--
 .../kernels/elementwise_functions/greater.hpp |  8 ++-
 .../elementwise_functions/greater_equal.hpp   |  8 ++-
 .../kernels/elementwise_functions/hypot.hpp   |  8 ++-
 .../kernels/elementwise_functions/imag.hpp    |  9 ++-
 .../elementwise_functions/isfinite.hpp        |  7 ++-
 .../kernels/elementwise_functions/isinf.hpp   |  8 ++-
 .../kernels/elementwise_functions/isnan.hpp   |  8 ++-
 .../kernels/elementwise_functions/less.hpp    | 10 +++-
 .../elementwise_functions/less_equal.hpp      |  8 ++-
 .../kernels/elementwise_functions/log.hpp     |  9 ++-
 .../kernels/elementwise_functions/log10.hpp   |  9 ++-
 .../kernels/elementwise_functions/log1p.hpp   |  9 ++-
 .../kernels/elementwise_functions/log2.hpp    |  9 ++-
 .../elementwise_functions/logaddexp.hpp       |  9 ++-
 .../elementwise_functions/logical_and.hpp     | 10 +++-
 .../elementwise_functions/logical_not.hpp     |  9 ++-
 .../elementwise_functions/logical_or.hpp      | 10 +++-
 .../elementwise_functions/logical_xor.hpp     | 10 +++-
 .../kernels/elementwise_functions/maximum.hpp | 10 +++-
 .../kernels/elementwise_functions/minimum.hpp | 10 +++-
 .../elementwise_functions/multiply.hpp        | 14 +++--
 .../elementwise_functions/negative.hpp        |  9 ++-
 .../elementwise_functions/nextafter.hpp       |  8 ++-
 .../elementwise_functions/not_equal.hpp       | 10 +++-
 .../elementwise_functions/positive.hpp        |  9 ++-
 .../kernels/elementwise_functions/pow.hpp     | 14 +++--
 .../kernels/elementwise_functions/proj.hpp    |  9 ++-
 .../kernels/elementwise_functions/real.hpp    |  9 ++-
 .../elementwise_functions/reciprocal.hpp      |  9 ++-
 .../elementwise_functions/remainder.hpp       | 14 +++--
 .../kernels/elementwise_functions/round.hpp   |  9 ++-
 .../kernels/elementwise_functions/rsqrt.hpp   | 10 +++-
 .../kernels/elementwise_functions/sign.hpp    |  9 ++-
 .../kernels/elementwise_functions/signbit.hpp |  8 ++-
 .../kernels/elementwise_functions/sin.hpp     |  9 ++-
 .../kernels/elementwise_functions/sinh.hpp    |  9 ++-
 .../kernels/elementwise_functions/sqrt.hpp    |  9 ++-
 .../kernels/elementwise_functions/square.hpp  |  8 ++-
 .../elementwise_functions/subtract.hpp        | 14 +++--
 .../kernels/elementwise_functions/tan.hpp     |  9 ++-
 .../kernels/elementwise_functions/tanh.hpp    |  9 ++-
 .../elementwise_functions/true_divide.hpp     | 14 +++--
 .../kernels/elementwise_functions/trunc.hpp   |  9 ++-
 .../elementwise_functions/vec_size_util.hpp   | 58 +++++++++++++++++++
 74 files changed, 573 insertions(+), 265 deletions(-)
 create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
index 411040bada..220d31b687 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
@@ -32,9 +32,11 @@
 #include <type_traits>
 
 #include "cabs_impl.hpp"
-#include "kernels/elementwise_functions/common.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -50,6 +52,7 @@ namespace abs
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 
 template <typename argT, typename resT> struct AbsFunctor
@@ -89,8 +92,8 @@ template <typename argT, typename resT> struct AbsFunctor
 
 template <typename argT,
           typename resT = argT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using AbsContigFunctor =
     elementwise_common::UnaryContigFunctor<argT,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
index a90f4e699f..f594c38e75 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
@@ -29,10 +29,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -48,6 +50,7 @@ namespace acos
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 
 template <typename argT, typename resT> struct AcosFunctor
@@ -128,8 +131,8 @@ template <typename argT, typename resT> struct AcosFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argTy, resTy>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using AcosContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
index 8af3708427..60039d8b0d 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
@@ -29,10 +29,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -48,6 +50,7 @@ namespace acosh
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 
 template <typename argT, typename resT> struct AcoshFunctor
@@ -155,8 +158,8 @@ template <typename argT, typename resT> struct AcoshFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argTy, resTy>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using AcoshContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
index c06e98f3e5..4ead81e1f6 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
@@ -30,6 +30,8 @@
 #include <type_traits>
 
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -50,6 +52,8 @@ namespace add
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
+
 template <typename argT1, typename argT2, typename resT> struct AddFunctor
 {
 
@@ -110,8 +114,8 @@ template <typename argT1, typename argT2, typename resT> struct AddFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT1, argT2, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using AddContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
@@ -410,8 +414,8 @@ template <typename argT, typename resT> struct AddInplaceFunctor
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using AddInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor<
     argT,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp
index 034b71438f..4f36ef595f 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp
@@ -30,10 +30,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -49,6 +51,7 @@ namespace angle
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 
 template <typename argT, typename resT> struct AngleFunctor
@@ -74,8 +77,8 @@ template <typename argT, typename resT> struct AngleFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argTy, resTy>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using AngleContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
index 35c381aa84..909dd151fd 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
@@ -29,10 +29,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -48,6 +50,7 @@ namespace asin
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 
 template <typename argT, typename resT> struct AsinFunctor
@@ -148,8 +151,8 @@ template <typename argT, typename resT> struct AsinFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argTy, resTy>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using AsinContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
index 7373dc39d5..b7bc023db7 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
@@ -29,10 +29,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -48,6 +50,7 @@ namespace asinh
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 
 template <typename argT, typename resT> struct AsinhFunctor
@@ -131,8 +134,8 @@ template <typename argT, typename resT> struct AsinhFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argTy, resTy>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using AsinhContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp
index fbba3fc436..5746ef66ac 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp
@@ -30,10 +30,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -49,6 +51,7 @@ namespace atan
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 
 template <typename argT, typename resT> struct AtanFunctor
@@ -138,8 +141,8 @@ template <typename argT, typename resT> struct AtanFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argTy, resTy>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using AtanContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp
index 1a694527dd..88953e9836 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp
@@ -29,6 +29,8 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -48,6 +50,8 @@ namespace atan2
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
+
 template <typename argT1, typename argT2, typename resT> struct Atan2Functor
 {
 
@@ -68,8 +72,8 @@ template <typename argT1, typename argT2, typename resT> struct Atan2Functor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT1, argT2, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using Atan2ContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
index 340e72b11c..7cb5664c0b 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
@@ -30,10 +30,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -49,6 +51,7 @@ namespace atanh
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 
 template <typename argT, typename resT> struct AtanhFunctor
@@ -132,8 +135,8 @@ template <typename argT, typename resT> struct AtanhFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argTy, resTy>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using AtanhContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
index da32b17183..70c0475a40 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
@@ -28,6 +28,8 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -48,6 +50,8 @@ namespace bitwise_and
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
+
 template <typename argT1, typename argT2, typename resT>
 struct BitwiseAndFunctor
 {
@@ -91,8 +95,8 @@ struct BitwiseAndFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT1, argT2, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using BitwiseAndContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -290,8 +294,8 @@ template <typename argT, typename resT> struct BitwiseAndInplaceFunctor
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using BitwiseAndInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp
index d6c1bc72db..1d23e86ab7 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp
@@ -30,6 +30,8 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -49,6 +51,7 @@ namespace bitwise_invert
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::vec_cast;
 
 template <typename argT, typename resT> struct BitwiseInvertFunctor
@@ -80,8 +83,8 @@ template <typename argT, typename resT> struct BitwiseInvertFunctor
 
 template <typename argT,
           typename resT = argT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using BitwiseInvertContigFunctor =
     elementwise_common::UnaryContigFunctor<argT,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
index a987c8d604..f39c92af7e 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
@@ -29,6 +29,8 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -49,6 +51,8 @@ namespace bitwise_left_shift
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
+
 template <typename argT1, typename argT2, typename resT>
 struct BitwiseLeftShiftFunctor
 {
@@ -100,8 +104,8 @@ struct BitwiseLeftShiftFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT1, argT2, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using BitwiseLeftShiftContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -304,8 +308,8 @@ template <typename argT, typename resT> struct BitwiseLeftShiftInplaceFunctor
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using BitwiseLeftShiftInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp
index 71f3e809d9..959be48395 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp
@@ -28,6 +28,8 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -48,6 +50,8 @@ namespace bitwise_or
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
+
 template <typename argT1, typename argT2, typename resT> struct BitwiseOrFunctor
 {
     static_assert(std::is_same_v<resT, argT1>);
@@ -90,8 +94,8 @@ template <typename argT1, typename argT2, typename resT> struct BitwiseOrFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT1, argT2, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using BitwiseOrContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -286,8 +290,8 @@ template <typename argT, typename resT> struct BitwiseOrInplaceFunctor
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using BitwiseOrInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
index e4dfee2ed6..8ecc5a5564 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
@@ -29,6 +29,8 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -49,6 +51,8 @@ namespace bitwise_right_shift
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
+
 template <typename argT1, typename argT2, typename resT>
 struct BitwiseRightShiftFunctor
 {
@@ -101,8 +105,8 @@ struct BitwiseRightShiftFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT1, argT2, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using BitwiseRightShiftContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -308,8 +312,8 @@ template <typename argT, typename resT> struct BitwiseRightShiftInplaceFunctor
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using BitwiseRightShiftInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
index d035b31170..2356a9e470 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
@@ -28,6 +28,8 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -48,6 +50,8 @@ namespace bitwise_xor
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
+
 template <typename argT1, typename argT2, typename resT>
 struct BitwiseXorFunctor
 {
@@ -91,8 +95,8 @@ struct BitwiseXorFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT1, argT2, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using BitwiseXorContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -290,8 +294,8 @@ template <typename argT, typename resT> struct BitwiseXorInplaceFunctor
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using BitwiseXorInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
index 4f2634f17a..5892da7564 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
@@ -30,6 +30,8 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "kernels/elementwise_functions/common.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
@@ -48,6 +50,8 @@ namespace cbrt
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
+
 template <typename argT, typename resT> struct CbrtFunctor
 {
 
@@ -65,8 +69,8 @@ template <typename argT, typename resT> struct CbrtFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argTy, resTy>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using CbrtContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp
index 59bc630720..1d735695ba 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp
@@ -29,6 +29,8 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "kernels/elementwise_functions/common.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
@@ -47,6 +49,7 @@ namespace ceil
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 
 template <typename argT, typename resT> struct CeilFunctor
@@ -78,8 +81,8 @@ template <typename argT, typename resT> struct CeilFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argTy, resTy>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using CeilContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
index 4c27fefa0f..31cebcf219 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
@@ -52,8 +52,8 @@ using dpctl::tensor::kernels::alignment_utils::required_alignment;
 template <typename argT,
           typename resT,
           typename UnaryOperatorT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 1,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 struct UnaryContigFunctor
 {
@@ -108,7 +108,7 @@ struct UnaryContigFunctor
         }
         else if constexpr (enable_sg_loadstore &&
                            UnaryOperatorT::supports_sg_loadstore::value &&
-                           UnaryOperatorT::supports_vec::value)
+                           UnaryOperatorT::supports_vec::value && (vec_sz > 1))
         {
             auto sg = ndit.get_sub_group();
             std::uint32_t sgSize = sg.get_max_local_range()[0];
@@ -280,8 +280,8 @@ template <typename argTy,
           class ContigFunctorT,
           template <typename A, typename R, unsigned int vs, unsigned int nv>
           class kernel_name,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int vec_sz = 1,
+          unsigned int n_vecs = 1>
 sycl::event unary_contig_impl(sycl::queue &exec_q,
                               size_t nelems,
                               const char *arg_p,
@@ -291,7 +291,9 @@ sycl::event unary_contig_impl(sycl::queue &exec_q,
     sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
         cgh.depends_on(depends);
 
-        const size_t lws = 128;
+        // Choose work-group size to occupy all threads of since vector core
+        // busy (8 threads, simd32)
+        const size_t lws = 256;
         const size_t n_groups =
             ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
         const auto gws_range = sycl::range<1>(n_groups * lws);
@@ -370,8 +372,8 @@ template <typename argT1,
           typename argT2,
           typename resT,
           typename BinaryOperatorT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 1,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 struct BinaryContigFunctor
 {
@@ -399,7 +401,7 @@ struct BinaryContigFunctor
 
         if constexpr (enable_sg_loadstore &&
                       BinaryOperatorT::supports_sg_loadstore::value &&
-                      BinaryOperatorT::supports_vec::value)
+                      BinaryOperatorT::supports_vec::value && (vec_sz > 1))
         {
             auto sg = ndit.get_sub_group();
             std::uint16_t sgSize = sg.get_max_local_range()[0];
@@ -570,11 +572,11 @@ struct BinaryContigMatrixContigRowBroadcastingFunctor
         BinaryOperatorT op{};
         static_assert(BinaryOperatorT::supports_sg_loadstore::value);
 
-        auto sg = ndit.get_sub_group();
-        size_t gid = ndit.get_global_linear_id();
+        const auto &sg = ndit.get_sub_group();
+        const size_t gid = ndit.get_global_linear_id();
 
-        std::uint8_t sgSize = sg.get_max_local_range()[0];
-        size_t base = gid - sg.get_local_id()[0];
+        const size_t sgSize = sg.get_max_local_range()[0];
+        const size_t base = gid - sg.get_local_id()[0];
 
         if (base + sgSize < n_elems) {
             auto in1_multi_ptr = sycl::address_space_cast<
@@ -597,9 +599,8 @@ struct BinaryContigMatrixContigRowBroadcastingFunctor
             sg.store(out_multi_ptr, res_el);
         }
         else {
-            for (size_t k = base + sg.get_local_id()[0]; k < n_elems;
-                 k += sgSize)
-            {
+            const size_t lane_id = sg.get_local_id()[0];
+            for (size_t k = base + lane_id; k < n_elems; k += sgSize) {
                 res[k] = op(mat[k], padded_vec[k % n1]);
             }
         }
@@ -636,11 +637,11 @@ struct BinaryContigRowContigMatrixBroadcastingFunctor
         BinaryOperatorT op{};
         static_assert(BinaryOperatorT::supports_sg_loadstore::value);
 
-        auto sg = ndit.get_sub_group();
+        const auto &sg = ndit.get_sub_group();
         size_t gid = ndit.get_global_linear_id();
 
-        std::uint8_t sgSize = sg.get_max_local_range()[0];
-        size_t base = gid - sg.get_local_id()[0];
+        const size_t sgSize = sg.get_max_local_range()[0];
+        const size_t base = gid - sg.get_local_id()[0];
 
         if (base + sgSize < n_elems) {
             auto in1_multi_ptr = sycl::address_space_cast<
@@ -663,9 +664,8 @@ struct BinaryContigRowContigMatrixBroadcastingFunctor
             sg.store(out_multi_ptr, res_el);
         }
         else {
-            for (size_t k = base + sg.get_local_id()[0]; k < n_elems;
-                 k += sgSize)
-            {
+            const size_t lane_id = sg.get_local_id()[0];
+            for (size_t k = base + lane_id; k < n_elems; k += sgSize) {
                 res[k] = op(padded_vec[k % n1], mat[k]);
             }
         }
@@ -761,8 +761,8 @@ template <typename argTy1,
                     unsigned int vs,
                     unsigned int nv>
           class kernel_name,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int vec_sz = 1,
+          unsigned int n_vecs = 1>
 sycl::event binary_contig_impl(sycl::queue &exec_q,
                                size_t nelems,
                                const char *arg1_p,
@@ -776,7 +776,9 @@ sycl::event binary_contig_impl(sycl::queue &exec_q,
     sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
         cgh.depends_on(depends);
 
-        const size_t lws = 128;
+        // Choose work-group size to occupy all threads of since vector core
+        // busy (8 threads, simd32)
+        const size_t lws = 256;
         const size_t n_groups =
             ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
         const auto gws_range = sycl::range<1>(n_groups * lws);
@@ -916,7 +918,7 @@ sycl::event binary_contig_matrix_contig_row_broadcast_impl(
     // We read sg.load(&padded_vec[(base / n0)]). The vector is padded to
     // ensure that reads are accessible
 
-    const size_t lws = 128;
+    const size_t lws = 256;
 
     sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
         cgh.depends_on(make_padded_vec_ev);
@@ -997,7 +999,7 @@ sycl::event binary_contig_row_contig_matrix_broadcast_impl(
     // We read sg.load(&padded_vec[(base / n0)]). The vector is padded to
     // ensure that reads are accessible
 
-    const size_t lws = 128;
+    const size_t lws = 256;
 
     sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
         cgh.depends_on(make_padded_vec_ev);
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
index 8ece9ff910..f01468247e 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
@@ -51,8 +51,8 @@ using dpctl::tensor::kernels::alignment_utils::required_alignment;
 template <typename argT,
           typename resT,
           typename BinaryInplaceOperatorT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 1,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 struct BinaryInplaceContigFunctor
 {
@@ -72,27 +72,29 @@ struct BinaryInplaceContigFunctor
     void operator()(sycl::nd_item<1> ndit) const
     {
         BinaryInplaceOperatorT op{};
+        constexpr std::uint32_t elems_per_wi = vec_sz * n_vecs;
         /* Each work-item processes vec_sz elements, contiguous in memory */
         /* NB: Workgroup size must be divisible by sub-group size */
 
         if constexpr (enable_sg_loadstore &&
                       BinaryInplaceOperatorT::supports_sg_loadstore::value &&
-                      BinaryInplaceOperatorT::supports_vec::value)
+                      BinaryInplaceOperatorT::supports_vec::value &&
+                      (vec_sz > 1))
         {
             auto sg = ndit.get_sub_group();
             std::uint8_t sgSize = sg.get_max_local_range()[0];
 
-            size_t base = n_vecs * vec_sz *
+            size_t base = static_cast<size_t>(elems_per_wi) *
                           (ndit.get_group(0) * ndit.get_local_range(0) +
                            sg.get_group_id()[0] * sgSize);
 
-            if (base + n_vecs * vec_sz * sgSize < nelems_) {
+            if (base + static_cast<size_t>(elems_per_wi * sgSize) < nelems_) {
 
                 sycl::vec<argT, vec_sz> arg_vec;
                 sycl::vec<resT, vec_sz> res_vec;
 
 #pragma unroll
-                for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
+                for (std::uint32_t it = 0; it < elems_per_wi; it += vec_sz) {
                     auto rhs_multi_ptr = sycl::address_space_cast<
                         sycl::access::address_space::global_space,
                         sycl::access::decorated::yes>(&rhs[base + it * sgSize]);
@@ -108,9 +110,8 @@ struct BinaryInplaceContigFunctor
                 }
             }
             else {
-                for (size_t k = base + sg.get_local_id()[0]; k < nelems_;
-                     k += sgSize)
-                {
+                const size_t lane_id = sg.get_local_id()[0];
+                for (size_t k = base + lane_id; k < nelems_; k += sgSize) {
                     op(lhs[k], rhs[k]);
                 }
             }
@@ -119,18 +120,18 @@ struct BinaryInplaceContigFunctor
                            BinaryInplaceOperatorT::supports_sg_loadstore::value)
         {
             auto sg = ndit.get_sub_group();
-            std::uint8_t sgSize = sg.get_max_local_range()[0];
+            std::uint32_t sgSize = sg.get_max_local_range()[0];
 
-            size_t base = n_vecs * vec_sz *
+            size_t base = static_cast<size_t>(elems_per_wi) *
                           (ndit.get_group(0) * ndit.get_local_range(0) +
                            sg.get_group_id()[0] * sgSize);
 
-            if (base + n_vecs * vec_sz * sgSize < nelems_) {
+            if (base + static_cast<size_t>(elems_per_wi * sgSize) < nelems_) {
                 sycl::vec<argT, vec_sz> arg_vec;
                 sycl::vec<resT, vec_sz> res_vec;
 
 #pragma unroll
-                for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
+                for (std::uint32_t it = 0; it < elems_per_wi; it += vec_sz) {
                     auto rhs_multi_ptr = sycl::address_space_cast<
                         sycl::access::address_space::global_space,
                         sycl::access::decorated::yes>(&rhs[base + it * sgSize]);
@@ -141,27 +142,27 @@ struct BinaryInplaceContigFunctor
                     arg_vec = sg.load<vec_sz>(rhs_multi_ptr);
                     res_vec = sg.load<vec_sz>(lhs_multi_ptr);
 #pragma unroll
-                    for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) {
+                    for (std::uint32_t vec_id = 0; vec_id < vec_sz; ++vec_id) {
                         op(res_vec[vec_id], arg_vec[vec_id]);
                     }
                     sg.store<vec_sz>(lhs_multi_ptr, res_vec);
                 }
             }
             else {
-                for (size_t k = base + sg.get_local_id()[0]; k < nelems_;
-                     k += sgSize)
-                {
+                const size_t lane_id = sg.get_local_id()[0];
+                for (size_t k = base + lane_id; k < nelems_; k += sgSize) {
                     op(lhs[k], rhs[k]);
                 }
             }
         }
         else {
-            std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0];
+            const size_t sgSize = ndit.get_sub_group().get_local_range()[0];
             size_t base = ndit.get_global_linear_id();
+            const size_t elems_per_sg = elems_per_wi * sgSize;
 
-            base = (base / sgSize) * sgSize * n_vecs * vec_sz + (base % sgSize);
+            base = (base / sgSize) * elems_per_sg + (base % sgSize);
             for (size_t offset = base;
-                 offset < std::min(nelems_, base + sgSize * (n_vecs * vec_sz));
+                 offset < std::min(nelems_, base + elems_per_sg);
                  offset += sgSize)
             {
                 op(lhs[offset], rhs[offset]);
@@ -303,8 +304,8 @@ template <typename argTy,
           class BinaryInplaceContigFunctorT,
           template <typename T1, typename T2, unsigned int vs, unsigned int nv>
           class kernel_name,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2>
+          unsigned int vec_sz = 1,
+          unsigned int n_vecs = 1>
 sycl::event
 binary_inplace_contig_impl(sycl::queue &exec_q,
                            size_t nelems,
@@ -317,7 +318,8 @@ binary_inplace_contig_impl(sycl::queue &exec_q,
     sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
         cgh.depends_on(depends);
 
-        const size_t lws = 128;
+        // choose WG as n_threads_per_core * simd_width = 8 * 32
+        const size_t lws = 256;
         const size_t n_groups =
             ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
         const auto gws_range = sycl::range<1>(n_groups * lws);
@@ -438,7 +440,7 @@ sycl::event binary_inplace_row_matrix_broadcast_impl(
     // We read sg.load(&padded_vec[(base / n0)]). The vector is padded to
     // ensure that reads are accessible
 
-    const size_t lws = 128;
+    const size_t lws = 256;
 
     sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
         cgh.depends_on(make_padded_vec_ev);
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp
index 4953feedb2..56f856f0c9 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp
@@ -31,10 +31,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -50,6 +52,7 @@ namespace conj
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 
 template <typename argT, typename resT> struct ConjFunctor
@@ -82,8 +85,8 @@ template <typename argT, typename resT> struct ConjFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argTy, resTy>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using ConjContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp
index 92997b572b..464b907102 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp
@@ -29,6 +29,8 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -48,6 +50,8 @@ namespace copysign
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
+
 template <typename argT1, typename argT2, typename resT> struct CopysignFunctor
 {
 
@@ -82,8 +86,8 @@ template <typename argT1, typename argT2, typename resT> struct CopysignFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT1, argT2, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using CopysignContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp
index 8b6b0c5fbe..bc844c18a1 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp
@@ -29,10 +29,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -48,6 +50,7 @@ namespace cos
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 
 template <typename argT, typename resT> struct CosFunctor
@@ -163,8 +166,8 @@ template <typename argT, typename resT> struct CosFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argTy, resTy>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using CosContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp
index cff1038ed9..d2a7a3ba31 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp
@@ -29,10 +29,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -48,6 +50,7 @@ namespace cosh
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 
 template <typename argT, typename resT> struct CoshFunctor
@@ -153,8 +156,8 @@ template <typename argT, typename resT> struct CoshFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argTy, resTy>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using CoshContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp
index d368658afc..a78fd9190e 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp
@@ -30,6 +30,8 @@
 #include <type_traits>
 
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -49,6 +51,8 @@ namespace equal
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
+
 template <typename argT1, typename argT2, typename resT> struct EqualFunctor
 {
     static_assert(std::is_same_v<resT, bool>);
@@ -119,8 +123,8 @@ template <typename argT1, typename argT2, typename resT> struct EqualFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT1, argT2, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using EqualContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp
index 7e613c9731..af08591b10 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp
@@ -29,10 +29,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -48,6 +50,7 @@ namespace exp
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 
 template <typename argT, typename resT> struct ExpFunctor
@@ -122,8 +125,8 @@ template <typename argT, typename resT> struct ExpFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argTy, resTy>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using ExpContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
index b436bb3855..1fd87de509 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
@@ -30,10 +30,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -49,6 +51,7 @@ namespace exp2
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 
 template <typename argT, typename resT> struct Exp2Functor
@@ -124,8 +127,8 @@ template <typename argT, typename resT> struct Exp2Functor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argTy, resTy>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using Exp2ContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp
index 9a9d0a1562..a393bf95b4 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp
@@ -31,9 +31,11 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -49,6 +51,7 @@ namespace expm1
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 
 template <typename argT, typename resT> struct Expm1Functor
@@ -136,8 +139,8 @@ template <typename argT, typename resT> struct Expm1Functor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argTy, resTy>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using Expm1ContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp
index 530dd3d9aa..cd526d89a1 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp
@@ -29,9 +29,11 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -47,6 +49,7 @@ namespace floor
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 
 template <typename argT, typename resT> struct FloorFunctor
@@ -78,8 +81,8 @@ template <typename argT, typename resT> struct FloorFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argTy, resTy>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using FloorContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
index 72ee3a789a..4f164ea1e2 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
@@ -29,6 +29,8 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -49,6 +51,8 @@ namespace floor_divide
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
+
 template <typename argT1, typename argT2, typename resT>
 struct FloorDivideFunctor
 {
@@ -126,8 +130,8 @@ struct FloorDivideFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT1, argT2, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using FloorDivideContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -367,8 +371,8 @@ template <typename argT, typename resT> struct FloorDivideInplaceFunctor
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using FloorDivideInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp
index 05c2a36b0c..e1814110be 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp
@@ -30,6 +30,8 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "utils/math_utils.hpp"
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
@@ -50,6 +52,8 @@ namespace greater
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
+
 template <typename argT1, typename argT2, typename resT> struct GreaterFunctor
 {
     static_assert(std::is_same_v<resT, bool>);
@@ -120,8 +124,8 @@ template <typename argT1, typename argT2, typename resT> struct GreaterFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT1, argT2, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using GreaterContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp
index 43e4e98db1..2929b1672f 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp
@@ -30,6 +30,8 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "utils/math_utils.hpp"
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
@@ -50,6 +52,8 @@ namespace greater_equal
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
+
 template <typename argT1, typename argT2, typename resT>
 struct GreaterEqualFunctor
 {
@@ -121,8 +125,8 @@ struct GreaterEqualFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT1, argT2, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using GreaterEqualContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp
index c5b68644a9..09c2055697 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp
@@ -29,6 +29,8 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -48,6 +50,8 @@ namespace hypot
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
+
 template <typename argT1, typename argT2, typename resT> struct HypotFunctor
 {
 
@@ -84,8 +88,8 @@ template <typename argT1, typename argT2, typename resT> struct HypotFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT1, argT2, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using HypotContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp
index e918bc0ac7..123aba9a9b 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp
@@ -31,9 +31,11 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -49,6 +51,7 @@ namespace imag
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 
 template <typename argT, typename resT> struct ImagFunctor
@@ -78,8 +81,8 @@ template <typename argT, typename resT> struct ImagFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argTy, resTy>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using ImagContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp
index df979eec76..50782c86cb 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp
@@ -30,6 +30,8 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -45,6 +47,7 @@ namespace isfinite
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 using dpctl::tensor::type_utils::vec_cast;
 
@@ -97,8 +100,8 @@ template <typename argT, typename resT> struct IsFiniteFunctor
 
 template <typename argT,
           typename resT = bool,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using IsFiniteContigFunctor =
     elementwise_common::UnaryContigFunctor<argT,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp
index 24be019a44..fa3247fefd 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp
@@ -30,7 +30,10 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "kernels/dpctl_tensor_types.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -46,6 +49,7 @@ namespace isinf
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 using dpctl::tensor::type_utils::vec_cast;
 
@@ -96,8 +100,8 @@ template <typename argT, typename resT> struct IsInfFunctor
 
 template <typename argT,
           typename resT = bool,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using IsInfContigFunctor =
     elementwise_common::UnaryContigFunctor<argT,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp
index cc452a25b1..b9d6f1be96 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp
@@ -29,7 +29,10 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "kernels/dpctl_tensor_types.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -45,6 +48,7 @@ namespace isnan
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 using dpctl::tensor::type_utils::vec_cast;
 
@@ -94,8 +98,8 @@ template <typename argT, typename resT> struct IsNanFunctor
 
 template <typename argT,
           typename resT = bool,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using IsNanContigFunctor =
     elementwise_common::UnaryContigFunctor<argT,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp
index 0b26342563..cb70f52a19 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp
@@ -29,12 +29,14 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/dpctl_tensor_types.hpp"
+#include "vec_size_util.hpp"
+
 #include "utils/math_utils.hpp"
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
 
+#include "kernels/dpctl_tensor_types.hpp"
 #include "kernels/elementwise_functions/common.hpp"
 
 namespace dpctl
@@ -49,6 +51,8 @@ namespace less
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
+
 template <typename argT1, typename argT2, typename resT> struct LessFunctor
 {
     static_assert(std::is_same_v<resT, bool>);
@@ -118,8 +122,8 @@ template <typename argT1, typename argT2, typename resT> struct LessFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT1, argT2, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using LessContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp
index 01289ae98f..259b52c5ff 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp
@@ -30,6 +30,8 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "utils/math_utils.hpp"
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
@@ -49,6 +51,8 @@ namespace less_equal
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
+
 template <typename argT1, typename argT2, typename resT> struct LessEqualFunctor
 {
     static_assert(std::is_same_v<resT, bool>);
@@ -119,8 +123,8 @@ template <typename argT1, typename argT2, typename resT> struct LessEqualFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT1, argT2, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using LessEqualContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp
index a3e28ef5d7..bb32310cf6 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp
@@ -30,10 +30,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -49,6 +51,7 @@ namespace log
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 
 template <typename argT, typename resT> struct LogFunctor
@@ -78,8 +81,8 @@ template <typename argT, typename resT> struct LogFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argTy, resTy>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using LogContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp
index 793b910f69..f897308ec3 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp
@@ -31,10 +31,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -50,6 +52,7 @@ namespace log10
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 using dpctl::tensor::type_utils::vec_cast;
 
@@ -97,8 +100,8 @@ template <typename argT, typename resT> struct Log10Functor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argTy, resTy>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using Log10ContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp
index 19238e7e37..e18078de5c 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp
@@ -30,9 +30,11 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -48,6 +50,7 @@ namespace log1p
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 
 // TODO: evaluate precision against alternatives
@@ -102,8 +105,8 @@ template <typename argT, typename resT> struct Log1pFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argTy, resTy>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using Log1pContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp
index 69d0022c72..7f22115d7c 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp
@@ -31,10 +31,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -50,6 +52,7 @@ namespace log2
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 using dpctl::tensor::type_utils::vec_cast;
 
@@ -98,8 +101,8 @@ template <typename argT, typename resT> struct Log2Functor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argTy, resTy>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using Log2ContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
index b0be45ea54..e39e87244e 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
@@ -31,12 +31,14 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/dpctl_tensor_types.hpp"
+#include "vec_size_util.hpp"
+
 #include "utils/math_utils.hpp"
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
 
+#include "kernels/dpctl_tensor_types.hpp"
 #include "kernels/elementwise_functions/common.hpp"
 
 namespace dpctl
@@ -51,6 +53,7 @@ namespace logaddexp
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 using dpctl::tensor::type_utils::vec_cast;
 
@@ -99,8 +102,8 @@ template <typename argT1, typename argT2, typename resT> struct LogAddExpFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT1, argT2, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using LogAddExpContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp
index f15caa02e6..fde8cde51d 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp
@@ -30,11 +30,13 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/dpctl_tensor_types.hpp"
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
 
+#include "kernels/dpctl_tensor_types.hpp"
 #include "kernels/elementwise_functions/common.hpp"
 
 namespace dpctl
@@ -49,6 +51,8 @@ namespace logical_and
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
+
 template <typename argT1, typename argT2, typename resT>
 struct LogicalAndFunctor
 {
@@ -93,8 +97,8 @@ struct LogicalAndFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT1, argT2, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using LogicalAndContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp
index 7c83e07072..3268b0cc33 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp
@@ -30,7 +30,10 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "kernels/dpctl_tensor_types.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -47,6 +50,8 @@ namespace logical_not
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
+
 template <typename argT, typename resT> struct LogicalNotFunctor
 {
     static_assert(std::is_same_v<resT, bool>);
@@ -66,8 +71,8 @@ template <typename argT, typename resT> struct LogicalNotFunctor
 
 template <typename argT,
           typename resT = bool,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using LogicalNotContigFunctor =
     elementwise_common::UnaryContigFunctor<argT,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp
index 43e02f2102..1fa5056da2 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp
@@ -30,11 +30,13 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/dpctl_tensor_types.hpp"
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
 
+#include "kernels/dpctl_tensor_types.hpp"
 #include "kernels/elementwise_functions/common.hpp"
 
 namespace dpctl
@@ -49,6 +51,8 @@ namespace logical_or
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
+
 template <typename argT1, typename argT2, typename resT> struct LogicalOrFunctor
 {
     static_assert(std::is_same_v<resT, bool>);
@@ -92,8 +96,8 @@ template <typename argT1, typename argT2, typename resT> struct LogicalOrFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT1, argT2, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using LogicalOrContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp
index dc41760985..e4e56b40cd 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp
@@ -30,11 +30,13 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/dpctl_tensor_types.hpp"
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
 
+#include "kernels/dpctl_tensor_types.hpp"
 #include "kernels/elementwise_functions/common.hpp"
 
 namespace dpctl
@@ -49,6 +51,8 @@ namespace logical_xor
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
+
 template <typename argT1, typename argT2, typename resT>
 struct LogicalXorFunctor
 {
@@ -94,8 +98,8 @@ struct LogicalXorFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT1, argT2, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using LogicalXorContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
index e73704bad8..5ca76925bf 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
@@ -29,12 +29,14 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/dpctl_tensor_types.hpp"
+#include "vec_size_util.hpp"
+
 #include "utils/math_utils.hpp"
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
 
+#include "kernels/dpctl_tensor_types.hpp"
 #include "kernels/elementwise_functions/common.hpp"
 
 namespace dpctl
@@ -49,6 +51,8 @@ namespace maximum
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
+
 template <typename argT1, typename argT2, typename resT> struct MaximumFunctor
 {
 
@@ -96,8 +100,8 @@ template <typename argT1, typename argT2, typename resT> struct MaximumFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT1, argT2, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using MaximumContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
index 590c0b6486..2b47d09079 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
@@ -29,12 +29,14 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/dpctl_tensor_types.hpp"
+#include "vec_size_util.hpp"
+
 #include "utils/math_utils.hpp"
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
 
+#include "kernels/dpctl_tensor_types.hpp"
 #include "kernels/elementwise_functions/common.hpp"
 
 namespace dpctl
@@ -49,6 +51,8 @@ namespace minimum
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
+
 template <typename argT1, typename argT2, typename resT> struct MinimumFunctor
 {
 
@@ -96,8 +100,8 @@ template <typename argT1, typename argT2, typename resT> struct MinimumFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT1, argT2, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using MinimumContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
index 1af284f55b..f0af4021fd 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
@@ -30,12 +30,14 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/dpctl_tensor_types.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
 
+#include "kernels/dpctl_tensor_types.hpp"
 #include "kernels/elementwise_functions/common.hpp"
 #include "kernels/elementwise_functions/common_inplace.hpp"
 
@@ -51,6 +53,8 @@ namespace multiply
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
+
 template <typename argT1, typename argT2, typename resT> struct MultiplyFunctor
 {
 
@@ -98,8 +102,8 @@ template <typename argT1, typename argT2, typename resT> struct MultiplyFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT1, argT2, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using MultiplyContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
@@ -402,8 +406,8 @@ template <typename argT, typename resT> struct MultiplyInplaceFunctor
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using MultiplyInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
index 83f17dd47b..7e78ce73f3 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
@@ -30,9 +30,11 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -48,6 +50,7 @@ namespace negative
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 using dpctl::tensor::type_utils::vec_cast;
 
@@ -65,8 +68,8 @@ template <typename argT, typename resT> struct NegativeFunctor
 
 template <typename argT,
           typename resT = argT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using NegativeContigFunctor =
     elementwise_common::UnaryContigFunctor<argT,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp
index 5dc9ea40b3..6f4e7f0bbd 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp
@@ -29,6 +29,8 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -48,6 +50,8 @@ namespace nextafter
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
+
 template <typename argT1, typename argT2, typename resT> struct NextafterFunctor
 {
 
@@ -82,8 +86,8 @@ template <typename argT1, typename argT2, typename resT> struct NextafterFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT1, argT2, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using NextafterContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
index c1b920193b..591d9e11a3 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
@@ -29,11 +29,13 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/dpctl_tensor_types.hpp"
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
 
+#include "kernels/dpctl_tensor_types.hpp"
 #include "kernels/elementwise_functions/common.hpp"
 
 namespace dpctl
@@ -48,6 +50,8 @@ namespace not_equal
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
+
 template <typename argT1, typename argT2, typename resT> struct NotEqualFunctor
 {
     static_assert(std::is_same_v<resT, bool>);
@@ -103,8 +107,8 @@ template <typename argT1, typename argT2, typename resT> struct NotEqualFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT1, argT2, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using NotEqualContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
index ae2711ed0e..1244a206f0 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
@@ -30,9 +30,11 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -48,6 +50,7 @@ namespace positive
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 using dpctl::tensor::type_utils::vec_cast;
 
@@ -80,8 +83,8 @@ template <typename argT, typename resT> struct PositiveFunctor
 
 template <typename argT,
           typename resT = argT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using PositiveContigFunctor =
     elementwise_common::UnaryContigFunctor<argT,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
index bb462dceae..05e1e4e99a 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
@@ -30,12 +30,14 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/dpctl_tensor_types.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
 
+#include "kernels/dpctl_tensor_types.hpp"
 #include "kernels/elementwise_functions/common.hpp"
 #include "kernels/elementwise_functions/common_inplace.hpp"
 
@@ -51,6 +53,8 @@ namespace pow
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
+
 template <typename argT1, typename argT2, typename resT> struct PowFunctor
 {
 
@@ -151,8 +155,8 @@ template <typename argT1, typename argT2, typename resT> struct PowFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT1, argT2, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using PowContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
@@ -417,8 +421,8 @@ template <typename argT, typename resT> struct PowInplaceFunctor
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using PowInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor<
     argT,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp
index 2c3dce0c9c..a9812049fe 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp
@@ -32,9 +32,11 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -50,6 +52,7 @@ namespace proj
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 
 template <typename argT, typename resT> struct ProjFunctor
@@ -91,8 +94,8 @@ template <typename argT, typename resT> struct ProjFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argTy, resTy>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using ProjContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp
index c66e4003cb..c7fb919ae7 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp
@@ -31,9 +31,11 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -49,6 +51,7 @@ namespace real
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 
 template <typename argT, typename resT> struct RealFunctor
@@ -78,8 +81,8 @@ template <typename argT, typename resT> struct RealFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argTy, resTy>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using RealContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp
index 4d4b70fd4f..7310a15c09 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp
@@ -32,12 +32,14 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/dpctl_tensor_types.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
 
+#include "kernels/dpctl_tensor_types.hpp"
 #include "kernels/elementwise_functions/common.hpp"
 
 namespace dpctl
@@ -51,6 +53,7 @@ namespace reciprocal
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 
 template <typename argT, typename resT> struct ReciprocalFunctor
@@ -81,8 +84,8 @@ template <typename argT, typename resT> struct ReciprocalFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argTy, resTy>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using ReciprocalContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
index 7bb070cc00..777e53e43f 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
@@ -30,11 +30,13 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/dpctl_tensor_types.hpp"
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
 
+#include "kernels/dpctl_tensor_types.hpp"
 #include "kernels/elementwise_functions/common.hpp"
 #include "kernels/elementwise_functions/common_inplace.hpp"
 
@@ -50,6 +52,8 @@ namespace remainder
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
+
 template <typename argT1, typename argT2, typename resT> struct RemainderFunctor
 {
     static_assert(std::is_same_v<argT1, argT2>);
@@ -144,8 +148,8 @@ template <typename argT1, typename argT2, typename resT> struct RemainderFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT1, argT2, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using RemainderContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -393,8 +397,8 @@ template <typename argT, typename resT> struct RemainderInplaceFunctor
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using RemainderInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp
index 241f75c1bb..af30711142 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp
@@ -29,9 +29,11 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -47,6 +49,7 @@ namespace round
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 
 template <typename argT, typename resT> struct RoundFunctor
@@ -87,8 +90,8 @@ template <typename argT, typename resT> struct RoundFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argTy, resTy>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using RoundContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
index 61aafb13d9..5dac6bb30c 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
@@ -33,9 +33,11 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -51,6 +53,8 @@ namespace rsqrt
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
+
 template <typename argT, typename resT> struct RsqrtFunctor
 {
 
@@ -68,8 +72,8 @@ template <typename argT, typename resT> struct RsqrtFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argTy, resTy>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using RsqrtContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp
index 651f7d5d9a..4bfaf33fdd 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp
@@ -31,9 +31,11 @@
 #include <type_traits>
 
 #include "cabs_impl.hpp"
-#include "kernels/elementwise_functions/common.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -49,6 +51,7 @@ namespace sign
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 using dpctl::tensor::type_utils::vec_cast;
 
@@ -103,8 +106,8 @@ template <typename argT, typename resT> struct SignFunctor
 
 template <typename argT,
           typename resT = argT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using SignContigFunctor =
     elementwise_common::UnaryContigFunctor<argT,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp
index e8ac7709ad..ee42541631 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp
@@ -30,7 +30,10 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
+#include "vec_size_util.hpp"
+
 #include "kernels/dpctl_tensor_types.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -46,6 +49,7 @@ namespace signbit
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 using dpctl::tensor::type_utils::vec_cast;
 
@@ -74,8 +78,8 @@ template <typename argT, typename resT> struct SignbitFunctor
 
 template <typename argT,
           typename resT = bool,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using SignbitContigFunctor =
     elementwise_common::UnaryContigFunctor<argT,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
index 8bc12097a8..e0225f3542 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
@@ -29,10 +29,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -48,6 +50,7 @@ namespace sin
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 
 template <typename argT, typename resT> struct SinFunctor
@@ -186,8 +189,8 @@ template <typename argT, typename resT> struct SinFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argTy, resTy>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using SinContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
index e83626e56d..9912c54d51 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
@@ -29,10 +29,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -48,6 +50,7 @@ namespace sinh
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 
 template <typename argT, typename resT> struct SinhFunctor
@@ -155,8 +158,8 @@ template <typename argT, typename resT> struct SinhFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argTy, resTy>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using SinhContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
index 5adb41b20d..2d467d612f 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
@@ -32,10 +32,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -51,6 +53,7 @@ namespace sqrt
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 
 template <typename argT, typename resT> struct SqrtFunctor
@@ -80,8 +83,8 @@ template <typename argT, typename resT> struct SqrtFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argTy, resTy>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using SqrtContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp
index 4b096cc291..4d0db56313 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp
@@ -30,10 +30,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -97,8 +99,8 @@ template <typename argT, typename resT> struct SquareFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = 1,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using SquareContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
index 4ee3ae089b..e757e654ca 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
@@ -29,11 +29,13 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/dpctl_tensor_types.hpp"
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
 
+#include "kernels/dpctl_tensor_types.hpp"
 #include "kernels/elementwise_functions/common.hpp"
 #include "kernels/elementwise_functions/common_inplace.hpp"
 
@@ -49,6 +51,8 @@ namespace subtract
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
+
 template <typename argT1, typename argT2, typename resT> struct SubtractFunctor
 {
 
@@ -85,8 +89,8 @@ template <typename argT1, typename argT2, typename resT> struct SubtractFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT1, argT2, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using SubtractContigFunctor =
     elementwise_common::BinaryContigFunctor<argT1,
@@ -401,8 +405,8 @@ template <typename argT, typename resT> struct SubtractInplaceFunctor
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using SubtractInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp
index 4364d81fb7..6ef7a2449f 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp
@@ -30,10 +30,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -49,6 +51,7 @@ namespace tan
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 
 template <typename argT, typename resT> struct TanFunctor
@@ -130,8 +133,8 @@ template <typename argT, typename resT> struct TanFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argTy, resTy>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using TanContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp
index 0af4e4e628..6c040b47e8 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp
@@ -31,10 +31,12 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -50,6 +52,7 @@ namespace tanh
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 
 template <typename argT, typename resT> struct TanhFunctor
@@ -124,8 +127,8 @@ template <typename argT, typename resT> struct TanhFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argTy, resTy>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using TanhContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
index 53db1e163c..495e380b79 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
@@ -29,12 +29,14 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/dpctl_tensor_types.hpp"
 #include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
 
+#include "kernels/dpctl_tensor_types.hpp"
 #include "kernels/elementwise_functions/common.hpp"
 #include "kernels/elementwise_functions/common_inplace.hpp"
 
@@ -50,6 +52,8 @@ namespace true_divide
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
+
 template <typename argT1, typename argT2, typename resT>
 struct TrueDivideFunctor
 {
@@ -112,8 +116,8 @@ struct TrueDivideFunctor
 template <typename argT1,
           typename argT2,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT1, argT2, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using TrueDivideContigFunctor = elementwise_common::BinaryContigFunctor<
     argT1,
@@ -473,8 +477,8 @@ struct TrueDivideInplaceTypeMapFactory
 
 template <typename argT,
           typename resT,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argT, resT>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using TrueDivideInplaceContigFunctor =
     elementwise_common::BinaryInplaceContigFunctor<
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp
index 55c8493880..a0ca221837 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp
@@ -29,9 +29,11 @@
 #include <sycl/sycl.hpp>
 #include <type_traits>
 
-#include "kernels/elementwise_functions/common.hpp"
+#include "vec_size_util.hpp"
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
 #include "utils/offset_utils.hpp"
 #include "utils/type_dispatch_building.hpp"
 #include "utils/type_utils.hpp"
@@ -47,6 +49,7 @@ namespace trunc
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 
 template <typename argT, typename resT> struct TruncFunctor
@@ -75,8 +78,8 @@ template <typename argT, typename resT> struct TruncFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 4,
-          unsigned int n_vecs = 2,
+          unsigned int vec_sz = VecSize_v<argTy, resTy>,
+          unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using TruncContigFunctor =
     elementwise_common::UnaryContigFunctor<argTy,
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp
new file mode 100644
index 0000000000..1075239eb3
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp
@@ -0,0 +1,58 @@
+//=== tanh.hpp -   Unary function TANH                     ------
+//*-C++-*--/===//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2024 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines utilities for dispatching elementwise dedicated kernels
+//  for contiguous inputs.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cstdint>
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace kernels
+{
+namespace vec_size_utils
+{
+
+template <typename T, typename... Rest> struct VecSize
+{
+    static constexpr unsigned int value =
+        std::max<unsigned int>(VecSize<T>::value, VecSize<Rest...>::value);
+};
+
+template <typename T> struct VecSize<T>
+{
+    static constexpr unsigned int value =
+        1 + ((sizeof(std::uint32_t) - 1) / (sizeof(T)));
+};
+
+template <typename T, typename... Rest>
+static constexpr unsigned int VecSize_v = VecSize<T, Rest...>::value;
+
+} // end of namespace vec_size_utils
+} // end of namespace kernels
+} // end of namespace tensor
+} // end of namespace dpctl

From 4a224079e18089e555c7cb8af07001a8af61d6b3 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Fri, 25 Oct 2024 09:19:53 -0500
Subject: [PATCH 5/9] Set vec_sz and n_vecs in implementations of contig_impl
 for each support function

---
 .../kernels/elementwise_functions/abs.hpp      |  8 ++++++--
 .../kernels/elementwise_functions/acos.hpp     |  8 ++++++--
 .../kernels/elementwise_functions/acosh.hpp    |  8 ++++++--
 .../kernels/elementwise_functions/add.hpp      | 18 +++++++++++++-----
 .../kernels/elementwise_functions/angle.hpp    |  8 ++++++--
 .../kernels/elementwise_functions/asin.hpp     |  8 ++++++--
 .../kernels/elementwise_functions/asinh.hpp    |  8 ++++++--
 .../kernels/elementwise_functions/atan.hpp     |  8 ++++++--
 .../kernels/elementwise_functions/atan2.hpp    |  9 +++++++--
 .../kernels/elementwise_functions/atanh.hpp    |  8 ++++++--
 .../elementwise_functions/bitwise_and.hpp      | 16 ++++++++++++----
 .../elementwise_functions/bitwise_invert.hpp   | 12 ++++++++----
 .../bitwise_left_shift.hpp                     | 16 ++++++++++++----
 .../elementwise_functions/bitwise_or.hpp       | 16 ++++++++++++----
 .../bitwise_right_shift.hpp                    | 16 ++++++++++++----
 .../elementwise_functions/bitwise_xor.hpp      | 16 ++++++++++++----
 .../kernels/elementwise_functions/cbrt.hpp     |  8 ++++++--
 .../kernels/elementwise_functions/ceil.hpp     |  8 ++++++--
 .../kernels/elementwise_functions/conj.hpp     |  8 ++++++--
 .../kernels/elementwise_functions/copysign.hpp |  9 +++++++--
 .../kernels/elementwise_functions/cos.hpp      |  8 ++++++--
 .../kernels/elementwise_functions/cosh.hpp     |  8 ++++++--
 .../kernels/elementwise_functions/equal.hpp    |  9 +++++++--
 .../kernels/elementwise_functions/exp.hpp      |  8 ++++++--
 .../kernels/elementwise_functions/exp2.hpp     |  8 ++++++--
 .../kernels/elementwise_functions/expm1.hpp    |  8 ++++++--
 .../kernels/elementwise_functions/floor.hpp    |  8 ++++++--
 .../elementwise_functions/floor_divide.hpp     | 15 +++++++++++----
 .../kernels/elementwise_functions/greater.hpp  |  9 +++++++--
 .../elementwise_functions/greater_equal.hpp    | 10 +++++++---
 .../kernels/elementwise_functions/hypot.hpp    |  9 +++++++--
 .../kernels/elementwise_functions/imag.hpp     |  8 ++++++--
 .../kernels/elementwise_functions/isfinite.hpp | 12 ++++++++----
 .../kernels/elementwise_functions/isinf.hpp    |  8 ++++++--
 .../kernels/elementwise_functions/isnan.hpp    |  8 ++++++--
 .../kernels/elementwise_functions/less.hpp     | 10 +++++++---
 .../elementwise_functions/less_equal.hpp       |  9 +++++++--
 .../kernels/elementwise_functions/log.hpp      |  8 ++++++--
 .../kernels/elementwise_functions/log10.hpp    |  8 ++++++--
 .../kernels/elementwise_functions/log1p.hpp    |  8 ++++++--
 .../kernels/elementwise_functions/log2.hpp     |  8 ++++++--
 .../elementwise_functions/logaddexp.hpp        |  9 +++++++--
 .../elementwise_functions/logical_and.hpp      |  9 +++++++--
 .../elementwise_functions/logical_not.hpp      | 12 ++++++++----
 .../elementwise_functions/logical_or.hpp       |  9 +++++++--
 .../elementwise_functions/logical_xor.hpp      |  9 +++++++--
 .../kernels/elementwise_functions/maximum.hpp  |  9 +++++++--
 .../kernels/elementwise_functions/minimum.hpp  |  9 +++++++--
 .../kernels/elementwise_functions/multiply.hpp | 16 ++++++++++++----
 .../kernels/elementwise_functions/negative.hpp | 12 ++++++++----
 .../elementwise_functions/nextafter.hpp        |  9 +++++++--
 .../elementwise_functions/not_equal.hpp        |  9 +++++++--
 .../kernels/elementwise_functions/positive.hpp | 12 ++++++++----
 .../kernels/elementwise_functions/pow.hpp      | 18 +++++++++++++-----
 .../kernels/elementwise_functions/proj.hpp     |  8 ++++++--
 .../kernels/elementwise_functions/real.hpp     |  8 ++++++--
 .../elementwise_functions/reciprocal.hpp       | 12 ++++++++----
 .../elementwise_functions/remainder.hpp        | 16 ++++++++++++----
 .../kernels/elementwise_functions/round.hpp    |  8 ++++++--
 .../kernels/elementwise_functions/rsqrt.hpp    |  8 ++++++--
 .../kernels/elementwise_functions/sign.hpp     |  8 ++++++--
 .../kernels/elementwise_functions/signbit.hpp  |  8 ++++++--
 .../kernels/elementwise_functions/sin.hpp      |  8 ++++++--
 .../kernels/elementwise_functions/sinh.hpp     |  8 ++++++--
 .../kernels/elementwise_functions/sqrt.hpp     |  8 ++++++--
 .../kernels/elementwise_functions/square.hpp   | 11 ++++++++---
 .../kernels/elementwise_functions/subtract.hpp | 16 ++++++++++++----
 .../kernels/elementwise_functions/tan.hpp      |  8 ++++++--
 .../kernels/elementwise_functions/tanh.hpp     |  8 ++++++--
 .../elementwise_functions/true_divide.hpp      | 16 ++++++++++++----
 .../kernels/elementwise_functions/trunc.hpp    |  8 ++++++--
 71 files changed, 529 insertions(+), 183 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
index 220d31b687..b48f2b5704 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
@@ -135,9 +135,13 @@ sycl::event abs_contig_impl(sycl::queue &exec_q,
                             char *res_p,
                             const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename AbsOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vec = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, AbsOutputType, AbsContigFunctor, abs_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, AbsOutputType, AbsContigFunctor, abs_contig_kernel, vec_sz,
+        n_vec>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct AbsContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
index f594c38e75..208f37ca85 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
@@ -169,9 +169,13 @@ sycl::event acos_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename AcosOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vec = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, AcosOutputType, AcosContigFunctor, acos_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, AcosOutputType, AcosContigFunctor, acos_contig_kernel, vec_sz,
+        n_vec>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct AcosContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
index 60039d8b0d..d19d462816 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
@@ -196,9 +196,13 @@ sycl::event acosh_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename AcoshOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vec = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, AcoshOutputType, AcoshContigFunctor, acosh_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, AcoshOutputType, AcoshContigFunctor, acosh_contig_kernel, vec_sz,
+        n_vec>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct AcoshContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
index 4ead81e1f6..695b8985ee 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
@@ -218,10 +218,14 @@ sycl::event add_contig_impl(sycl::queue &exec_q,
                             ssize_t res_offset,
                             const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename AddOutputType<argTy1, argTy2>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy1, argTy2, resTy>;
+    constexpr unsigned int n_vecs = 1;
+
     return elementwise_common::binary_contig_impl<
-        argTy1, argTy2, AddOutputType, AddContigFunctor, add_contig_kernel>(
-        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
-        res_offset, depends);
+        argTy1, argTy2, AddOutputType, AddContigFunctor, add_contig_kernel,
+        vec_sz, n_vecs>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
+                        arg2_offset, res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct AddContigFactory
@@ -493,9 +497,13 @@ add_inplace_contig_impl(sycl::queue &exec_q,
                         ssize_t res_offset,
                         const std::vector<sycl::event> &depends = {})
 {
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_inplace_contig_impl<
-        argTy, resTy, AddInplaceContigFunctor, add_inplace_contig_kernel>(
-        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
+        argTy, resTy, AddInplaceContigFunctor, add_inplace_contig_kernel,
+        vec_sz, n_vecs>(exec_q, nelems, arg_p, arg_offset, res_p, res_offset,
+                        depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct AddInplaceContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp
index 4f36ef595f..3812159beb 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp
@@ -112,9 +112,13 @@ sycl::event angle_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename AngleOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vec = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, AngleOutputType, AngleContigFunctor, angle_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, AngleOutputType, AngleContigFunctor, angle_contig_kernel, vec_sz,
+        n_vec>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct AngleContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
index 909dd151fd..40ad6f980d 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
@@ -189,9 +189,13 @@ sycl::event asin_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename AsinOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vec = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, AsinOutputType, AsinContigFunctor, asin_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, AsinOutputType, AsinContigFunctor, asin_contig_kernel, vec_sz,
+        n_vec>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct AsinContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
index b7bc023db7..9eaee84b1c 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
@@ -172,9 +172,13 @@ sycl::event asinh_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename AsinhOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vec = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, AsinhOutputType, AsinhContigFunctor, asinh_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, AsinhOutputType, AsinhContigFunctor, asinh_contig_kernel, vec_sz,
+        n_vec>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct AsinhContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp
index 5746ef66ac..ccff50844f 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp
@@ -179,9 +179,13 @@ sycl::event atan_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename AtanOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vec = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, AtanOutputType, AtanContigFunctor, atan_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, AtanOutputType, AtanContigFunctor, atan_contig_kernel, vec_sz,
+        n_vec>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct AtanContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp
index 88953e9836..ec03db1092 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp
@@ -125,10 +125,15 @@ sycl::event atan2_contig_impl(sycl::queue &exec_q,
                               ssize_t res_offset,
                               const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename Atan2OutputType<argTy1, argTy2>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy1, argTy2, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, Atan2OutputType, Atan2ContigFunctor,
-        atan2_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                             arg2_offset, res_p, res_offset, depends);
+        atan2_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p,
+                                             arg1_offset, arg2_p, arg2_offset,
+                                             res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct Atan2ContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
index 7cb5664c0b..d91a75b25d 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
@@ -173,9 +173,13 @@ sycl::event atanh_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename AtanhOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vec = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, AtanhOutputType, AtanhContigFunctor, atanh_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, AtanhOutputType, AtanhContigFunctor, atanh_contig_kernel, vec_sz,
+        n_vec>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct AtanhContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
index 70c0475a40..b16e54c062 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
@@ -183,10 +183,15 @@ bitwise_and_contig_impl(sycl::queue &exec_q,
                         ssize_t res_offset,
                         const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename BitwiseAndOutputType<argTy1, argTy2>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy1, argTy2, resTy>;
+    constexpr unsigned int n_vec = 1u;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, BitwiseAndOutputType, BitwiseAndContigFunctor,
-        bitwise_and_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                                   arg2_offset, res_p, res_offset, depends);
+        bitwise_and_contig_kernel, vec_sz, n_vec>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct BitwiseAndContigFactory
@@ -365,10 +370,13 @@ bitwise_and_inplace_contig_impl(sycl::queue &exec_q,
                                 ssize_t res_offset,
                                 const std::vector<sycl::event> &depends = {})
 {
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_inplace_contig_impl<
         argTy, resTy, BitwiseAndInplaceContigFunctor,
-        bitwise_and_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset,
-                                           res_p, res_offset, depends);
+        bitwise_and_inplace_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2>
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp
index 1d23e86ab7..a795cce741 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp
@@ -129,10 +129,14 @@ bitwise_invert_contig_impl(sycl::queue &exec_q,
                            char *res_p,
                            const std::vector<sycl::event> &depends = {})
 {
-    return elementwise_common::unary_contig_impl<argTy, BitwiseInvertOutputType,
-                                                 BitwiseInvertContigFunctor,
-                                                 bitwise_invert_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+    using resTy = typename BitwiseInvertOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vec = 1u;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, BitwiseInvertOutputType, BitwiseInvertContigFunctor,
+        bitwise_invert_contig_kernel, vec_sz, n_vec>(exec_q, nelems, arg_p,
+                                                     res_p, depends);
 }
 
 template <typename fnT, typename T> struct BitwiseInvertContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
index f39c92af7e..1717892515 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
@@ -192,11 +192,16 @@ bitwise_left_shift_contig_impl(sycl::queue &exec_q,
                                ssize_t res_offset,
                                const std::vector<sycl::event> &depends = {})
 {
+    using resTy =
+        typename BitwiseLeftShiftOutputType<argTy1, argTy2>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy1, argTy2, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, BitwiseLeftShiftOutputType,
-        BitwiseLeftShiftContigFunctor, bitwise_left_shift_contig_kernel>(
-        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
-        res_offset, depends);
+        BitwiseLeftShiftContigFunctor, bitwise_left_shift_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+                res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2>
@@ -379,9 +384,12 @@ sycl::event bitwise_left_shift_inplace_contig_impl(
     ssize_t res_offset,
     const std::vector<sycl::event> &depends = {})
 {
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_inplace_contig_impl<
         argTy, resTy, BitwiseLeftShiftInplaceContigFunctor,
-        bitwise_left_shift_inplace_contig_kernel>(
+        bitwise_left_shift_inplace_contig_kernel, vec_sz, n_vecs>(
         exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
 }
 
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp
index 959be48395..664ce9ff50 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp
@@ -181,10 +181,15 @@ sycl::event bitwise_or_contig_impl(sycl::queue &exec_q,
                                    ssize_t res_offset,
                                    const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename BitwiseOrOutputType<argTy1, argTy2>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy1, argTy2, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, BitwiseOrOutputType, BitwiseOrContigFunctor,
-        bitwise_or_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                                  arg2_offset, res_p, res_offset, depends);
+        bitwise_or_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct BitwiseOrContigFactory
@@ -359,10 +364,13 @@ bitwise_or_inplace_contig_impl(sycl::queue &exec_q,
                                ssize_t res_offset,
                                const std::vector<sycl::event> &depends = {})
 {
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_inplace_contig_impl<
         argTy, resTy, BitwiseOrInplaceContigFunctor,
-        bitwise_or_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset,
-                                          res_p, res_offset, depends);
+        bitwise_or_inplace_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2>
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
index 8ecc5a5564..7adc2a9eb6 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
@@ -194,11 +194,16 @@ bitwise_right_shift_contig_impl(sycl::queue &exec_q,
                                 ssize_t res_offset,
                                 const std::vector<sycl::event> &depends = {})
 {
+    using resTy =
+        typename BitwiseRightShiftOutputType<argTy1, argTy2>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy1, argTy2, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, BitwiseRightShiftOutputType,
-        BitwiseRightShiftContigFunctor, bitwise_right_shift_contig_kernel>(
-        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
-        res_offset, depends);
+        BitwiseRightShiftContigFunctor, bitwise_right_shift_contig_kernel,
+        vec_sz, n_vecs>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
+                        arg2_offset, res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2>
@@ -383,9 +388,12 @@ sycl::event bitwise_right_shift_inplace_contig_impl(
     ssize_t res_offset,
     const std::vector<sycl::event> &depends = {})
 {
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_inplace_contig_impl<
         argTy, resTy, BitwiseRightShiftInplaceContigFunctor,
-        bitwise_right_shift_inplace_contig_kernel>(
+        bitwise_right_shift_inplace_contig_kernel, vec_sz, n_vecs>(
         exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
 }
 
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
index 2356a9e470..6c77298052 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
@@ -183,10 +183,15 @@ bitwise_xor_contig_impl(sycl::queue &exec_q,
                         ssize_t res_offset,
                         const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename BitwiseXorOutputType<argTy1, argTy2>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy1, argTy2, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, BitwiseXorOutputType, BitwiseXorContigFunctor,
-        bitwise_xor_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                                   arg2_offset, res_p, res_offset, depends);
+        bitwise_xor_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct BitwiseXorContigFactory
@@ -365,10 +370,13 @@ bitwise_xor_inplace_contig_impl(sycl::queue &exec_q,
                                 ssize_t res_offset,
                                 const std::vector<sycl::event> &depends = {})
 {
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_inplace_contig_impl<
         argTy, resTy, BitwiseXorInplaceContigFunctor,
-        bitwise_xor_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset,
-                                           res_p, res_offset, depends);
+        bitwise_xor_inplace_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2>
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
index 5892da7564..4a4aa1b275 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
@@ -105,9 +105,13 @@ sycl::event cbrt_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename CbrtOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, CbrtOutputType, CbrtContigFunctor, cbrt_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, CbrtOutputType, CbrtContigFunctor, cbrt_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct CbrtContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp
index 1d735695ba..aa41574fbb 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp
@@ -125,9 +125,13 @@ sycl::event ceil_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename CeilOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, CeilOutputType, CeilContigFunctor, ceil_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, CeilOutputType, CeilContigFunctor, ceil_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct CeilContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp
index 56f856f0c9..38e66b3f77 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp
@@ -132,9 +132,13 @@ sycl::event conj_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename ConjOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, ConjOutputType, ConjContigFunctor, conj_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, ConjOutputType, ConjContigFunctor, conj_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct ConjContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp
index 464b907102..600c010ab7 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp
@@ -139,10 +139,15 @@ sycl::event copysign_contig_impl(sycl::queue &exec_q,
                                  ssize_t res_offset,
                                  const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename CopysignOutputType<argTy1, argTy2>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy1, argTy2, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, CopysignOutputType, CopysignContigFunctor,
-        copysign_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                                arg2_offset, res_p, res_offset, depends);
+        copysign_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct CopysignContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp
index bc844c18a1..979ac94d8b 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp
@@ -205,9 +205,13 @@ sycl::event cos_contig_impl(sycl::queue &exec_q,
                             char *res_p,
                             const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename CosOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, CosOutputType, CosContigFunctor, cos_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, CosOutputType, CosContigFunctor, cos_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct CosContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp
index d2a7a3ba31..6c424a0a32 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp
@@ -194,9 +194,13 @@ sycl::event cosh_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename CoshOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, CoshOutputType, CoshContigFunctor, cosh_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, CoshOutputType, CoshContigFunctor, cosh_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct CoshContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp
index a78fd9190e..6e2ed021c8 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp
@@ -211,10 +211,15 @@ sycl::event equal_contig_impl(sycl::queue &exec_q,
                               ssize_t res_offset,
                               const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename EqualOutputType<argTy1, argTy2>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy1, argTy2, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, EqualOutputType, EqualContigFunctor,
-        equal_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                             arg2_offset, res_p, res_offset, depends);
+        equal_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p,
+                                             arg1_offset, arg2_p, arg2_offset,
+                                             res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct EqualContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp
index af08591b10..b671c360da 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp
@@ -163,9 +163,13 @@ sycl::event exp_contig_impl(sycl::queue &exec_q,
                             char *res_p,
                             const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename ExpOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, ExpOutputType, ExpContigFunctor, exp_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, ExpOutputType, ExpContigFunctor, exp_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct ExpContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
index 1fd87de509..a94fe67e5a 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
@@ -165,9 +165,13 @@ sycl::event exp2_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename Exp2OutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, Exp2OutputType, Exp2ContigFunctor, exp2_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, Exp2OutputType, Exp2ContigFunctor, exp2_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct Exp2ContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp
index a393bf95b4..3e9ed4dd12 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp
@@ -178,9 +178,13 @@ sycl::event expm1_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename Expm1OutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, Expm1OutputType, Expm1ContigFunctor, expm1_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, Expm1OutputType, Expm1ContigFunctor, expm1_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct Expm1ContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp
index cd526d89a1..41479463f8 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp
@@ -125,9 +125,13 @@ sycl::event floor_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename FloorOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, FloorOutputType, FloorContigFunctor, floor_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, FloorOutputType, FloorContigFunctor, floor_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct FloorContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
index 4f164ea1e2..705db584a5 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
@@ -224,10 +224,15 @@ floor_divide_contig_impl(sycl::queue &exec_q,
                          ssize_t res_offset,
                          const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename FloorDivideOutputType<argTy1, argTy2>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy1, argTy2, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, FloorDivideOutputType, FloorDivideContigFunctor,
-        floor_divide_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                                    arg2_offset, res_p, res_offset, depends);
+        floor_divide_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2>
@@ -444,10 +449,12 @@ floor_divide_inplace_contig_impl(sycl::queue &exec_q,
                                  ssize_t res_offset,
                                  const std::vector<sycl::event> &depends = {})
 {
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1;
     return elementwise_common::binary_inplace_contig_impl<
         argTy, resTy, FloorDivideInplaceContigFunctor,
-        floor_divide_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset,
-                                            res_p, res_offset, depends);
+        floor_divide_inplace_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2>
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp
index e1814110be..90c6b23c84 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp
@@ -212,10 +212,15 @@ sycl::event greater_contig_impl(sycl::queue &exec_q,
                                 ssize_t res_offset,
                                 const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename GreaterOutputType<argTy1, argTy2>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy1, argTy2, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, GreaterOutputType, GreaterContigFunctor,
-        greater_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                               arg2_offset, res_p, res_offset, depends);
+        greater_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p,
+                                               arg1_offset, arg2_p, arg2_offset,
+                                               res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct GreaterContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp
index 2929b1672f..e5a49b40dc 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp
@@ -214,11 +214,15 @@ greater_equal_contig_impl(sycl::queue &exec_q,
                           ssize_t res_offset,
                           const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename GreaterEqualOutputType<argTy1, argTy2>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy1, argTy2, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, GreaterEqualOutputType, GreaterEqualContigFunctor,
-        greater_equal_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset,
-                                     arg2_p, arg2_offset, res_p, res_offset,
-                                     depends);
+        greater_equal_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2>
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp
index 09c2055697..af72b92c37 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp
@@ -141,10 +141,15 @@ sycl::event hypot_contig_impl(sycl::queue &exec_q,
                               ssize_t res_offset,
                               const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename HypotOutputType<argTy1, argTy2>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy1, argTy2, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, HypotOutputType, HypotContigFunctor,
-        hypot_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                             arg2_offset, res_p, res_offset, depends);
+        hypot_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p,
+                                             arg1_offset, arg2_p, arg2_offset,
+                                             res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct HypotContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp
index 123aba9a9b..64f0b7ac9c 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp
@@ -128,9 +128,13 @@ sycl::event imag_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename ImagOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, ImagOutputType, ImagContigFunctor, imag_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, ImagOutputType, ImagContigFunctor, imag_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct ImagContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp
index 50782c86cb..762ceb980e 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp
@@ -130,10 +130,14 @@ sycl::event isfinite_contig_impl(sycl::queue &exec_q,
                                  char *res_p,
                                  const std::vector<sycl::event> &depends = {})
 {
-    return elementwise_common::unary_contig_impl<argTy, IsFiniteOutputType,
-                                                 IsFiniteContigFunctor,
-                                                 isfinite_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+    using resTy = bool;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, IsFiniteOutputType, IsFiniteContigFunctor,
+        isfinite_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p,
+                                                depends);
 }
 
 template <typename fnT, typename T> struct IsFiniteContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp
index fa3247fefd..d7d00a4bf0 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp
@@ -130,9 +130,13 @@ sycl::event isinf_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
+    using resTy = bool;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, IsInfOutputType, IsInfContigFunctor, isinf_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, IsInfOutputType, IsInfContigFunctor, isinf_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct IsInfContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp
index b9d6f1be96..d3b65e3bad 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp
@@ -128,9 +128,13 @@ sycl::event isnan_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
+    using resTy = bool;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, IsNanOutputType, IsNanContigFunctor, isnan_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, IsNanOutputType, IsNanContigFunctor, isnan_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct IsNanContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp
index cb70f52a19..4eead62f92 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp
@@ -210,10 +210,14 @@ sycl::event less_contig_impl(sycl::queue &exec_q,
                              ssize_t res_offset,
                              const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename LessOutputType<argTy1, argTy2>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy1, argTy2, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_contig_impl<
-        argTy1, argTy2, LessOutputType, LessContigFunctor, less_contig_kernel>(
-        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
-        res_offset, depends);
+        argTy1, argTy2, LessOutputType, LessContigFunctor, less_contig_kernel,
+        vec_sz, n_vecs>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
+                        arg2_offset, res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct LessContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp
index 259b52c5ff..1e8486b74c 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp
@@ -211,10 +211,15 @@ sycl::event less_equal_contig_impl(sycl::queue &exec_q,
                                    ssize_t res_offset,
                                    const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename LessEqualOutputType<argTy1, argTy2>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy1, argTy2, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, LessEqualOutputType, LessEqualContigFunctor,
-        less_equal_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                                  arg2_offset, res_p, res_offset, depends);
+        less_equal_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct LessEqualContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp
index bb32310cf6..bbd6a80c23 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp
@@ -120,9 +120,13 @@ sycl::event log_contig_impl(sycl::queue &exec_q,
                             char *res_p,
                             const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename LogOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, LogOutputType, LogContigFunctor, log_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, LogOutputType, LogContigFunctor, log_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct LogContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp
index f897308ec3..8a1aae5eb0 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp
@@ -139,9 +139,13 @@ sycl::event log10_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename Log10OutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, Log10OutputType, Log10ContigFunctor, log10_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, Log10OutputType, Log10ContigFunctor, log10_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct Log10ContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp
index e18078de5c..5cf5f3d12b 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp
@@ -144,9 +144,13 @@ sycl::event log1p_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename Log1pOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, Log1pOutputType, Log1pContigFunctor, log1p_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, Log1pOutputType, Log1pContigFunctor, log1p_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct Log1pContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp
index 7f22115d7c..4d04f43862 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp
@@ -140,9 +140,13 @@ sycl::event log2_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename Log2OutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, Log2OutputType, Log2ContigFunctor, log2_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, Log2OutputType, Log2ContigFunctor, log2_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct Log2ContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
index e39e87244e..593790569a 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
@@ -155,10 +155,15 @@ sycl::event logaddexp_contig_impl(sycl::queue &exec_q,
                                   ssize_t res_offset,
                                   const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename LogAddExpOutputType<argTy1, argTy2>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy1, argTy2, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, LogAddExpOutputType, LogAddExpContigFunctor,
-        logaddexp_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                                 arg2_offset, res_p, res_offset, depends);
+        logaddexp_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct LogAddExpContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp
index fde8cde51d..a3e577df5b 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp
@@ -182,10 +182,15 @@ logical_and_contig_impl(sycl::queue &exec_q,
                         ssize_t res_offset,
                         const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename LogicalAndOutputType<argTy1, argTy2>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy1, argTy2, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, LogicalAndOutputType, LogicalAndContigFunctor,
-        logical_and_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                                   arg2_offset, res_p, res_offset, depends);
+        logical_and_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct LogicalAndContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp
index 3268b0cc33..afa7748968 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp
@@ -105,10 +105,14 @@ logical_not_contig_impl(sycl::queue &exec_q,
                         char *res_p,
                         const std::vector<sycl::event> &depends = {})
 {
-    return elementwise_common::unary_contig_impl<argTy, LogicalNotOutputType,
-                                                 LogicalNotContigFunctor,
-                                                 logical_not_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+    using resTy = typename LogicalNotOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, LogicalNotOutputType, LogicalNotContigFunctor,
+        logical_not_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p,
+                                                   depends);
 }
 
 template <typename fnT, typename T> struct LogicalNotContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp
index 1fa5056da2..c873e332ea 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp
@@ -180,10 +180,15 @@ sycl::event logical_or_contig_impl(sycl::queue &exec_q,
                                    ssize_t res_offset,
                                    const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename LogicalOrOutputType<argTy1, argTy2>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy1, argTy2, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, LogicalOrOutputType, LogicalOrContigFunctor,
-        logical_or_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                                  arg2_offset, res_p, res_offset, depends);
+        logical_or_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct LogicalOrContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp
index e4e56b40cd..900c432d33 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp
@@ -183,10 +183,15 @@ logical_xor_contig_impl(sycl::queue &exec_q,
                         ssize_t res_offset,
                         const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename LogicalXorOutputType<argTy1, argTy2>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy1, argTy2, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, LogicalXorOutputType, LogicalXorContigFunctor,
-        logical_xor_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                                   arg2_offset, res_p, res_offset, depends);
+        logical_xor_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct LogicalXorContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
index 5ca76925bf..d377e62d93 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
@@ -204,10 +204,15 @@ sycl::event maximum_contig_impl(sycl::queue &exec_q,
                                 ssize_t res_offset,
                                 const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename MaximumOutputType<argTy1, argTy2>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy1, argTy2, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, MaximumOutputType, MaximumContigFunctor,
-        maximum_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                               arg2_offset, res_p, res_offset, depends);
+        maximum_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p,
+                                               arg1_offset, arg2_p, arg2_offset,
+                                               res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct MaximumContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
index 2b47d09079..b389bb81e4 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
@@ -204,10 +204,15 @@ sycl::event minimum_contig_impl(sycl::queue &exec_q,
                                 ssize_t res_offset,
                                 const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename MinimumOutputType<argTy1, argTy2>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy1, argTy2, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, MinimumOutputType, MinimumContigFunctor,
-        minimum_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                               arg2_offset, res_p, res_offset, depends);
+        minimum_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p,
+                                               arg1_offset, arg2_p, arg2_offset,
+                                               res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct MinimumContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
index f0af4021fd..8a3c5ca0e3 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
@@ -206,10 +206,15 @@ sycl::event multiply_contig_impl(sycl::queue &exec_q,
                                  ssize_t res_offset,
                                  const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename MultiplyOutputType<argTy1, argTy2>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy1, argTy2, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, MultiplyOutputType, MultiplyContigFunctor,
-        multiply_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                                arg2_offset, res_p, res_offset, depends);
+        multiply_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct MultiplyContigFactory
@@ -486,10 +491,13 @@ multiply_inplace_contig_impl(sycl::queue &exec_q,
                              ssize_t res_offset,
                              const std::vector<sycl::event> &depends = {})
 {
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_inplace_contig_impl<
         argTy, resTy, MultiplyInplaceContigFunctor,
-        multiply_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset,
-                                        res_p, res_offset, depends);
+        multiply_inplace_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2>
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
index 7e78ce73f3..8e868957a1 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
@@ -110,10 +110,14 @@ sycl::event negative_contig_impl(sycl::queue &exec_q,
                                  char *res_p,
                                  const std::vector<sycl::event> &depends = {})
 {
-    return elementwise_common::unary_contig_impl<argTy, NegativeOutputType,
-                                                 NegativeContigFunctor,
-                                                 negative_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+    using resTy = typename NegativeOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, NegativeOutputType, NegativeContigFunctor,
+        negative_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p,
+                                                depends);
 }
 
 template <typename fnT, typename T> struct NegativeContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp
index 6f4e7f0bbd..5c78ad0199 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp
@@ -139,10 +139,15 @@ sycl::event nextafter_contig_impl(sycl::queue &exec_q,
                                   ssize_t res_offset,
                                   const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename NextafterOutputType<argTy1, argTy2>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy1, argTy2, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, NextafterOutputType, NextafterContigFunctor,
-        nextafter_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                                 arg2_offset, res_p, res_offset, depends);
+        nextafter_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct NextafterContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
index 591d9e11a3..7975074a69 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
@@ -195,10 +195,15 @@ sycl::event not_equal_contig_impl(sycl::queue &exec_q,
                                   ssize_t res_offset,
                                   const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename NotEqualOutputType<argTy1, argTy2>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy1, argTy2, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, NotEqualOutputType, NotEqualContigFunctor,
-        not_equal_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                                 arg2_offset, res_p, res_offset, depends);
+        not_equal_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct NotEqualContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
index 1244a206f0..c904afead9 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
@@ -125,10 +125,14 @@ sycl::event positive_contig_impl(sycl::queue &exec_q,
                                  char *res_p,
                                  const std::vector<sycl::event> &depends = {})
 {
-    return elementwise_common::unary_contig_impl<argTy, PositiveOutputType,
-                                                 PositiveContigFunctor,
-                                                 positive_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+    using resTy = typename PositiveOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, PositiveOutputType, PositiveContigFunctor,
+        positive_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p,
+                                                depends);
 }
 
 template <typename fnT, typename T> struct PositiveContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
index 05e1e4e99a..998c24f62b 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
@@ -258,10 +258,14 @@ sycl::event pow_contig_impl(sycl::queue &exec_q,
                             ssize_t res_offset,
                             const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename PowOutputType<argTy1, argTy2>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy1, argTy2, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_contig_impl<
-        argTy1, argTy2, PowOutputType, PowContigFunctor, pow_contig_kernel>(
-        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
-        res_offset, depends);
+        argTy1, argTy2, PowOutputType, PowContigFunctor, pow_contig_kernel,
+        vec_sz, n_vecs>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
+                        arg2_offset, res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct PowContigFactory
@@ -499,9 +503,13 @@ pow_inplace_contig_impl(sycl::queue &exec_q,
                         ssize_t res_offset,
                         const std::vector<sycl::event> &depends = {})
 {
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_inplace_contig_impl<
-        argTy, resTy, PowInplaceContigFunctor, pow_inplace_contig_kernel>(
-        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
+        argTy, resTy, PowInplaceContigFunctor, pow_inplace_contig_kernel,
+        vec_sz, n_vecs>(exec_q, nelems, arg_p, arg_offset, res_p, res_offset,
+                        depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct PowInplaceContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp
index a9812049fe..7662de1388 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp
@@ -129,9 +129,13 @@ sycl::event proj_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename ProjOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, ProjOutputType, ProjContigFunctor, proj_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, ProjOutputType, ProjContigFunctor, proj_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct ProjContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp
index c7fb919ae7..ee1e16f269 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp
@@ -128,9 +128,13 @@ sycl::event real_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename RealOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, RealOutputType, RealContigFunctor, real_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, RealOutputType, RealContigFunctor, real_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct RealContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp
index 7310a15c09..1ef2af687e 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp
@@ -125,10 +125,14 @@ sycl::event reciprocal_contig_impl(sycl::queue &exec_q,
                                    char *res_p,
                                    const std::vector<sycl::event> &depends = {})
 {
-    return elementwise_common::unary_contig_impl<argTy, ReciprocalOutputType,
-                                                 ReciprocalContigFunctor,
-                                                 reciprocal_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+    using resTy = typename ReciprocalOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
+    return elementwise_common::unary_contig_impl<
+        argTy, ReciprocalOutputType, ReciprocalContigFunctor,
+        reciprocal_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p,
+                                                  depends);
 }
 
 template <typename fnT, typename T> struct ReciprocalContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
index 777e53e43f..f881109751 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
@@ -241,10 +241,15 @@ sycl::event remainder_contig_impl(sycl::queue &exec_q,
                                   ssize_t res_offset,
                                   const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename RemainderOutputType<argTy1, argTy2>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy1, argTy2, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, RemainderOutputType, RemainderContigFunctor,
-        remainder_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                                 arg2_offset, res_p, res_offset, depends);
+        remainder_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct RemainderContigFactory
@@ -468,10 +473,13 @@ remainder_inplace_contig_impl(sycl::queue &exec_q,
                               ssize_t res_offset,
                               const std::vector<sycl::event> &depends = {})
 {
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_inplace_contig_impl<
         argTy, resTy, RemainderInplaceContigFunctor,
-        remainder_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset,
-                                         res_p, res_offset, depends);
+        remainder_inplace_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2>
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp
index af30711142..4e1d24a8bc 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp
@@ -136,9 +136,13 @@ sycl::event round_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename RoundOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, RoundOutputType, RoundContigFunctor, round_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, RoundOutputType, RoundContigFunctor, round_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct RoundContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
index 5dac6bb30c..fb23f250b9 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
@@ -108,9 +108,13 @@ sycl::event rsqrt_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename RsqrtOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, RsqrtOutputType, RsqrtContigFunctor, rsqrt_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, RsqrtOutputType, RsqrtContigFunctor, rsqrt_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct RsqrtContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp
index 4bfaf33fdd..a5fe5bdf10 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp
@@ -148,9 +148,13 @@ sycl::event sign_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename SignOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, SignOutputType, SignContigFunctor, sign_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, SignOutputType, SignContigFunctor, sign_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct SignContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp
index ee42541631..1950b34f27 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp
@@ -114,9 +114,13 @@ sycl::event signbit_contig_impl(sycl::queue &exec_q,
                                 char *res_p,
                                 const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename SignbitOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, SignbitOutputType, SignbitContigFunctor, signbit_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, SignbitOutputType, SignbitContigFunctor, signbit_contig_kernel,
+        vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct SignbitContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
index e0225f3542..3f02759046 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
@@ -227,9 +227,13 @@ sycl::event sin_contig_impl(sycl::queue &exec_q,
                             char *res_p,
                             const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename SinOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, SinOutputType, SinContigFunctor, sin_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, SinOutputType, SinContigFunctor, sin_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct SinContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
index 9912c54d51..d96ef1a7c9 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
@@ -196,9 +196,13 @@ sycl::event sinh_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename SinhOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, SinhOutputType, SinhContigFunctor, sinh_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, SinhOutputType, SinhContigFunctor, sinh_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct SinhContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
index 2d467d612f..048eec54e2 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
@@ -122,9 +122,13 @@ sycl::event sqrt_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename SqrtOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, SqrtOutputType, SqrtContigFunctor, sqrt_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, SqrtOutputType, SqrtContigFunctor, sqrt_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct SqrtContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp
index 4d0db56313..b2093bfba7 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp
@@ -51,6 +51,7 @@ namespace square
 
 namespace td_ns = dpctl::tensor::type_dispatch;
 
+using dpctl::tensor::kernels::vec_size_utils::VecSize_v;
 using dpctl::tensor::type_utils::is_complex;
 using dpctl::tensor::type_utils::vec_cast;
 
@@ -99,7 +100,7 @@ template <typename argT, typename resT> struct SquareFunctor
 
 template <typename argTy,
           typename resTy = argTy,
-          unsigned int vec_sz = 1,
+          unsigned int vec_sz = VecSize_v<argTy, resTy>,
           unsigned int n_vecs = 1,
           bool enable_sg_loadstore = true>
 using SquareContigFunctor =
@@ -146,9 +147,13 @@ sycl::event square_contig_impl(sycl::queue &exec_q,
                                char *res_p,
                                const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename SquareOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, SquareOutputType, SquareContigFunctor, square_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, SquareOutputType, SquareContigFunctor, square_contig_kernel,
+        vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct SquareContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
index e757e654ca..f3c370c9ca 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
@@ -192,10 +192,15 @@ sycl::event subtract_contig_impl(sycl::queue &exec_q,
                                  ssize_t res_offset,
                                  const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename SubtractOutputType<argTy1, argTy2>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy1, argTy2, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, SubtractOutputType, SubtractContigFunctor,
-        subtract_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                                arg2_offset, res_p, res_offset, depends);
+        subtract_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct SubtractContigFactory
@@ -484,10 +489,13 @@ subtract_inplace_contig_impl(sycl::queue &exec_q,
                              ssize_t res_offset,
                              const std::vector<sycl::event> &depends = {})
 {
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_inplace_contig_impl<
         argTy, resTy, SubtractInplaceContigFunctor,
-        subtract_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset,
-                                        res_p, res_offset, depends);
+        subtract_inplace_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2>
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp
index 6ef7a2449f..0bf21e1d8c 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp
@@ -171,9 +171,13 @@ sycl::event tan_contig_impl(sycl::queue &exec_q,
                             char *res_p,
                             const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename TanOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, TanOutputType, TanContigFunctor, tan_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, TanOutputType, TanContigFunctor, tan_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct TanContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp
index 6c040b47e8..64bfcf504d 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp
@@ -165,9 +165,13 @@ sycl::event tanh_contig_impl(sycl::queue &exec_q,
                              char *res_p,
                              const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename TanhOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, TanhOutputType, TanhContigFunctor, tanh_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, TanhOutputType, TanhContigFunctor, tanh_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct TanhContigFactory
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
index 495e380b79..d63ca45b37 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
@@ -200,10 +200,15 @@ true_divide_contig_impl(sycl::queue &exec_q,
                         ssize_t res_offset,
                         const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename TrueDivideOutputType<argTy1, argTy2>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy1, argTy2, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_contig_impl<
         argTy1, argTy2, TrueDivideOutputType, TrueDivideContigFunctor,
-        true_divide_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                                   arg2_offset, res_p, res_offset, depends);
+        true_divide_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct TrueDivideContigFactory
@@ -513,10 +518,13 @@ true_divide_inplace_contig_impl(sycl::queue &exec_q,
                                 ssize_t res_offset,
                                 const std::vector<sycl::event> &depends = {})
 {
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::binary_inplace_contig_impl<
         argTy, resTy, TrueDivideInplaceContigFunctor,
-        true_divide_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset,
-                                           res_p, res_offset, depends);
+        true_divide_inplace_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2>
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp
index a0ca221837..5728c40b7b 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp
@@ -122,9 +122,13 @@ sycl::event trunc_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
+    using resTy = typename TruncOutputType<argTy>::value_type;
+    constexpr auto vec_sz = VecSize_v<argTy, resTy>;
+    constexpr unsigned int n_vecs = 1u;
+
     return elementwise_common::unary_contig_impl<
-        argTy, TruncOutputType, TruncContigFunctor, trunc_contig_kernel>(
-        exec_q, nelems, arg_p, res_p, depends);
+        argTy, TruncOutputType, TruncContigFunctor, trunc_contig_kernel, vec_sz,
+        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
 }
 
 template <typename fnT, typename T> struct TruncContigFactory

From 581e8bdaeb63a1a0c65cc715a0d744b6d0b56309 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Fri, 25 Oct 2024 15:45:26 -0500
Subject: [PATCH 6/9] Add static assert

---
 .../include/kernels/elementwise_functions/vec_size_util.hpp     | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp
index 1075239eb3..b85c8911dc 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp
@@ -45,6 +45,8 @@ template <typename T, typename... Rest> struct VecSize
 
 template <typename T> struct VecSize<T>
 {
+    static_assert(sizeof(T) > 0, "Vacuous types are not supported");
+
     static constexpr unsigned int value =
         1 + ((sizeof(std::uint32_t) - 1) / (sizeof(T)));
 };

From 63c82fc76296a40be36617f12b8159c6b672831b Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Fri, 25 Oct 2024 15:50:01 -0500
Subject: [PATCH 7/9] Add entry to changelog for improved performance of
 elementwise functions

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 69641d5e05..32c29edffd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 * Improved performance of copying operation to C-/F-contig array, with optimization for batch of square matrices [gh-1850](https://github.com/IntelPython/dpctl/pull/1850)
 * Improved performance of `tensor.argsort` function for all types [gh-1859](https://github.com/IntelPython/dpctl/pull/1859)
 * Improved performance of `tensor.sort` and `tensor.argsort` for short arrays in the range [16, 64] elements [gh-1866](https://github.com/IntelPython/dpctl/pull/1866)
+* Improved pefrormance of element-wise unary and binary functions [gh-1879](https://github.com/IntelPython/dpctl/pull/1879)
 
 ### Fixed
 * Fix for `tensor.result_type` when all inputs are Python built-in scalars [gh-1877](https://github.com/IntelPython/dpctl/pull/1877)

From cd783e07e5548dd3adf61ed95ebb8b4c60ee20eb Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Sun, 27 Oct 2024 21:58:20 -0500
Subject: [PATCH 8/9] Fix for test failure on AMD CPU.

vec operator should also apply isnan for sycl::half
---
 .../kernels/elementwise_functions/maximum.hpp | 24 +++++++++++++------
 .../kernels/elementwise_functions/minimum.hpp | 24 +++++++++++++------
 2 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
index d377e62d93..5231e98682 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
@@ -74,9 +74,13 @@ template <typename argT1, typename argT2, typename resT> struct MaximumFunctor
         }
         else if constexpr (std::is_floating_point_v<argT1> ||
                            std::is_same_v<argT1, sycl::half>)
-            return (std::isnan(in1) || in1 > in2) ? in1 : in2;
-        else
+        {
+            const bool choose_first = (std::isnan(in1) || (in1 > in2));
+            return (choose_first) ? in1 : in2;
+        }
+        else {
             return (in1 > in2) ? in1 : in2;
+        }
     }
 
     template <int vec_sz>
@@ -87,11 +91,17 @@ template <typename argT1, typename argT2, typename resT> struct MaximumFunctor
         sycl::vec<resT, vec_sz> res;
 #pragma unroll
         for (int i = 0; i < vec_sz; ++i) {
-            if constexpr (std::is_floating_point_v<argT1>)
-                res[i] =
-                    (sycl::isnan(in1[i]) || in1[i] > in2[i]) ? in1[i] : in2[i];
-            else
-                res[i] = (in1[i] > in2[i]) ? in1[i] : in2[i];
+            const auto &v1 = in1[i];
+            const auto &v2 = in2[i];
+            if constexpr (std::is_floating_point_v<argT1> ||
+                          std::is_same_v<argT1, sycl::half>)
+            {
+                const bool choose_first = (std::isnan(v1) || (v1 > v2));
+                res[i] = (choose_first) ? v1 : v2;
+            }
+            else {
+                res[i] = (v1 > v2) ? v1 : v2;
+            }
         }
         return res;
     }
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
index b389bb81e4..24bcd9d1bc 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
@@ -74,9 +74,13 @@ template <typename argT1, typename argT2, typename resT> struct MinimumFunctor
         }
         else if constexpr (std::is_floating_point_v<argT1> ||
                            std::is_same_v<argT1, sycl::half>)
-            return (std::isnan(in1) || in1 < in2) ? in1 : in2;
-        else
+        {
+            const bool choose_first = sycl::isnan(in1) || (in1 < in2);
+            return (choose_first) ? in1 : in2;
+        }
+        else {
             return (in1 < in2) ? in1 : in2;
+        }
     }
 
     template <int vec_sz>
@@ -87,11 +91,17 @@ template <typename argT1, typename argT2, typename resT> struct MinimumFunctor
         sycl::vec<resT, vec_sz> res;
 #pragma unroll
         for (int i = 0; i < vec_sz; ++i) {
-            if constexpr (std::is_floating_point_v<argT1>)
-                res[i] =
-                    (sycl::isnan(in1[i]) || in1[i] < in2[i]) ? in1[i] : in2[i];
-            else
-                res[i] = (in1[i] < in2[i]) ? in1[i] : in2[i];
+            const auto &v1 = in1[i];
+            const auto &v2 = in2[i];
+            if constexpr (std::is_floating_point_v<argT1> ||
+                          std::is_same_v<argT1, sycl::half>)
+            {
+                const bool choose_first = sycl::isnan(v1) || (v1 < v2);
+                res[i] = (choose_first) ? v1 : v2;
+            }
+            else {
+                res[i] = (v1 < v2) ? v1 : v2;
+            }
         }
         return res;
     }

From cef4359415d31166ab0a5434e2c4e1ee66064f6b Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Mon, 28 Oct 2024 16:13:58 -0500
Subject: [PATCH 9/9] Make local work-groups size dependent on number of
 elements to process

---
 .../kernels/elementwise_functions/common.hpp  | 88 +++++++++++--------
 1 file changed, 53 insertions(+), 35 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
index 31cebcf219..17ab3b083f 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
@@ -269,6 +269,23 @@ struct UnaryStridedFunctor
     }
 };
 
+template <typename SizeT>
+SizeT select_lws(const sycl::device &, SizeT n_work_items_needed)
+{
+    // TODO: make the decision based on device descriptors
+
+    constexpr SizeT few_threshold = (SizeT(1) << 17);
+    constexpr SizeT med_threshold = (SizeT(1) << 21);
+
+    const SizeT lws =
+        ((n_work_items_needed <= few_threshold)
+             ? SizeT(64)
+             : (n_work_items_needed <= med_threshold ? SizeT(128)
+                                                     : SizeT(256)));
+
+    return lws;
+}
+
 template <typename argTy,
           template <typename T>
           class UnaryOutputType,
@@ -288,26 +305,28 @@ sycl::event unary_contig_impl(sycl::queue &exec_q,
                               char *res_p,
                               const std::vector<sycl::event> &depends = {})
 {
-    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
+    const size_t n_work_items_needed = nelems / (n_vecs * vec_sz);
+    const size_t lws = select_lws(exec_q.get_device(), n_work_items_needed);
 
-        // Choose work-group size to occupy all threads of since vector core
-        // busy (8 threads, simd32)
-        const size_t lws = 256;
-        const size_t n_groups =
-            ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
-        const auto gws_range = sycl::range<1>(n_groups * lws);
-        const auto lws_range = sycl::range<1>(lws);
+    const size_t n_groups =
+        ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
+    const auto gws_range = sycl::range<1>(n_groups * lws);
+    const auto lws_range = sycl::range<1>(lws);
 
-        using resTy = typename UnaryOutputType<argTy>::value_type;
-        const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_p);
-        resTy *res_tp = reinterpret_cast<resTy *>(res_p);
+    using resTy = typename UnaryOutputType<argTy>::value_type;
+    using BaseKernelName = kernel_name<argTy, resTy, vec_sz, n_vecs>;
+
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_p);
+    resTy *res_tp = reinterpret_cast<resTy *>(res_p);
+
+    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
 
         if (is_aligned<required_alignment>(arg_p) &&
             is_aligned<required_alignment>(res_p))
         {
             constexpr bool enable_sg_loadstore = true;
-            using KernelName = kernel_name<argTy, resTy, vec_sz, n_vecs>;
+            using KernelName = BaseKernelName;
 
             cgh.parallel_for<KernelName>(
                 sycl::nd_range<1>(gws_range, lws_range),
@@ -316,9 +335,8 @@ sycl::event unary_contig_impl(sycl::queue &exec_q,
         }
         else {
             constexpr bool disable_sg_loadstore = false;
-            using InnerKernelName = kernel_name<argTy, resTy, vec_sz, n_vecs>;
             using KernelName =
-                disabled_sg_loadstore_wrapper_krn<InnerKernelName>;
+                disabled_sg_loadstore_wrapper_krn<BaseKernelName>;
 
             cgh.parallel_for<KernelName>(
                 sycl::nd_range<1>(gws_range, lws_range),
@@ -326,6 +344,7 @@ sycl::event unary_contig_impl(sycl::queue &exec_q,
                                disable_sg_loadstore>(arg_tp, res_tp, nelems));
         }
     });
+
     return comp_ev;
 }
 
@@ -773,32 +792,33 @@ sycl::event binary_contig_impl(sycl::queue &exec_q,
                                ssize_t res_offset,
                                const std::vector<sycl::event> &depends = {})
 {
-    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
+    const size_t n_work_items_needed = nelems / (n_vecs * vec_sz);
+    const size_t lws = select_lws(exec_q.get_device(), n_work_items_needed);
 
-        // Choose work-group size to occupy all threads of since vector core
-        // busy (8 threads, simd32)
-        const size_t lws = 256;
-        const size_t n_groups =
-            ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
-        const auto gws_range = sycl::range<1>(n_groups * lws);
-        const auto lws_range = sycl::range<1>(lws);
+    const size_t n_groups =
+        ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
+    const auto gws_range = sycl::range<1>(n_groups * lws);
+    const auto lws_range = sycl::range<1>(lws);
 
-        using resTy = typename BinaryOutputType<argTy1, argTy2>::value_type;
+    using resTy = typename BinaryOutputType<argTy1, argTy2>::value_type;
+    using BaseKernelName = kernel_name<argTy1, argTy2, resTy, vec_sz, n_vecs>;
+
+    const argTy1 *arg1_tp =
+        reinterpret_cast<const argTy1 *>(arg1_p) + arg1_offset;
+    const argTy2 *arg2_tp =
+        reinterpret_cast<const argTy2 *>(arg2_p) + arg2_offset;
+    resTy *res_tp = reinterpret_cast<resTy *>(res_p) + res_offset;
 
-        const argTy1 *arg1_tp =
-            reinterpret_cast<const argTy1 *>(arg1_p) + arg1_offset;
-        const argTy2 *arg2_tp =
-            reinterpret_cast<const argTy2 *>(arg2_p) + arg2_offset;
-        resTy *res_tp = reinterpret_cast<resTy *>(res_p) + res_offset;
+    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
 
         if (is_aligned<required_alignment>(arg1_tp) &&
             is_aligned<required_alignment>(arg2_tp) &&
             is_aligned<required_alignment>(res_tp))
         {
             constexpr bool enable_sg_loadstore = true;
-            using KernelName =
-                kernel_name<argTy1, argTy2, resTy, vec_sz, n_vecs>;
+            using KernelName = BaseKernelName;
+
             cgh.parallel_for<KernelName>(
                 sycl::nd_range<1>(gws_range, lws_range),
                 BinaryContigFunctorT<argTy1, argTy2, resTy, vec_sz, n_vecs,
@@ -807,10 +827,8 @@ sycl::event binary_contig_impl(sycl::queue &exec_q,
         }
         else {
             constexpr bool disable_sg_loadstore = false;
-            using InnerKernelName =
-                kernel_name<argTy1, argTy2, resTy, vec_sz, n_vecs>;
             using KernelName =
-                disabled_sg_loadstore_wrapper_krn<InnerKernelName>;
+                disabled_sg_loadstore_wrapper_krn<BaseKernelName>;
             cgh.parallel_for<KernelName>(
                 sycl::nd_range<1>(gws_range, lws_range),
                 BinaryContigFunctorT<argTy1, argTy2, resTy, vec_sz, n_vecs,