Work around sub_group load/store issues #1485


Merged · 4 commits · Dec 9, 2023
45 changes: 45 additions & 0 deletions dpctl/tensor/libtensor/include/kernels/alignment.hpp
@@ -0,0 +1,45 @@
//
// Data Parallel Control (dpctl)
//
// Copyright 2020-2023 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

#pragma once

#include <cstddef>
#include <cstdint>

namespace dpctl
{
namespace tensor
{
namespace kernels
{
namespace alignment_utils
{

static constexpr size_t required_alignment = 64;

template <std::uintptr_t alignment, typename Ptr> bool is_aligned(Ptr p)
{
return !(reinterpret_cast<std::uintptr_t>(p) % alignment);
}

template <typename KernelName> class disabled_sg_loadstore_wrapper_krn;

} // end of namespace alignment_utils
} // end of namespace kernels
} // end of namespace tensor
} // end of namespace dpctl
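For reference, a minimal host-side sketch (not part of this PR; the buffer and values are illustrative) of how the is_aligned predicate above behaves:

#include <cstdint>

// Same check as alignment_utils::is_aligned: true iff the address is a
// multiple of `alignment` bytes.
template <std::uintptr_t alignment, typename Ptr> bool is_aligned(Ptr p)
{
    return !(reinterpret_cast<std::uintptr_t>(p) % alignment);
}

int main()
{
    alignas(64) static float buf[16];       // 64 matches required_alignment above
    bool base_ok = is_aligned<64>(&buf[0]);  // true: buf starts on a 64-byte boundary
    bool off_ok = is_aligned<64>(&buf[1]);   // false: offset by sizeof(float) bytes
    return (base_ok && !off_ok) ? 0 : 1;     // exits with 0
}

In the kernels changed below, this predicate gates the use of sub-group load/store: only when every input and output pointer passes the check does the contiguous kernel keep enable_sg_loadstore = true.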
42 changes: 36 additions & 6 deletions dpctl/tensor/libtensor/include/kernels/clip.hpp
@@ -32,6 +32,7 @@
#include <pybind11/pybind11.h>
#include <type_traits>

#include "kernels/alignment.hpp"
#include "utils/math_utils.hpp"
#include "utils/offset_utils.hpp"
#include "utils/type_dispatch.hpp"
@@ -51,6 +52,11 @@ namespace td_ns = dpctl::tensor::type_dispatch;

using namespace dpctl::tensor::offset_utils;

using dpctl::tensor::kernels::alignment_utils::
disabled_sg_loadstore_wrapper_krn;
using dpctl::tensor::kernels::alignment_utils::is_aligned;
using dpctl::tensor::kernels::alignment_utils::required_alignment;

template <typename T> T clip(const T &x, const T &min, const T &max)
{
using dpctl::tensor::type_utils::is_complex;
@@ -73,7 +79,11 @@ template <typename T> T clip(const T &x, const T &min, const T &max)
}
}

template <typename T, int vec_sz = 4, int n_vecs = 2> class ClipContigFunctor
template <typename T,
int vec_sz = 4,
int n_vecs = 2,
bool enable_sg_loadstore = true>
class ClipContigFunctor
{
private:
size_t nelems = 0;
@@ -96,7 +106,7 @@ template <typename T, int vec_sz = 4, int n_vecs = 2> class ClipContigFunctor
void operator()(sycl::nd_item<1> ndit) const
{
using dpctl::tensor::type_utils::is_complex;
if constexpr (is_complex<T>::value) {
if constexpr (is_complex<T>::value || !enable_sg_loadstore) {
std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0];
size_t base = ndit.get_global_linear_id();

@@ -195,10 +205,30 @@ sycl::event clip_contig_impl(sycl::queue &q,
const auto gws_range = sycl::range<1>(n_groups * lws);
const auto lws_range = sycl::range<1>(lws);

cgh.parallel_for<clip_contig_kernel<T, vec_sz, n_vecs>>(
sycl::nd_range<1>(gws_range, lws_range),
ClipContigFunctor<T, vec_sz, n_vecs>(nelems, x_tp, min_tp, max_tp,
dst_tp));
if (is_aligned<required_alignment>(x_cp) &&
is_aligned<required_alignment>(min_cp) &&
is_aligned<required_alignment>(max_cp) &&
is_aligned<required_alignment>(dst_cp))
{
constexpr bool enable_sg_loadstore = true;
using KernelName = clip_contig_kernel<T, vec_sz, n_vecs>;

cgh.parallel_for<KernelName>(
sycl::nd_range<1>(gws_range, lws_range),
ClipContigFunctor<T, vec_sz, n_vecs, enable_sg_loadstore>(
nelems, x_tp, min_tp, max_tp, dst_tp));
}
else {
constexpr bool disable_sg_loadstore = false;
using InnerKernelName = clip_contig_kernel<T, vec_sz, n_vecs>;
using KernelName =
disabled_sg_loadstore_wrapper_krn<InnerKernelName>;

cgh.parallel_for<KernelName>(
sycl::nd_range<1>(gws_range, lws_range),
ClipContigFunctor<T, vec_sz, n_vecs, disable_sg_loadstore>(
nelems, x_tp, min_tp, max_tp, dst_tp));
}
});

return clip_ev;
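The wrapper kernel name is what keeps the two submissions above distinct: the aligned and unaligned branches use different ClipContigFunctor instantiations, and SYCL requires a kernel name to identify a single kernel body. A standalone sketch of the naming pattern (the class templates here are stand-ins, not the dpctl definitions):

#include <type_traits>

// Stand-ins for clip_contig_kernel and the wrapper from alignment.hpp.
template <typename T, int vec_sz, int n_vecs> class contig_kernel;
template <typename KernelName> class disabled_sg_loadstore_wrapper_krn;

using AlignedName = contig_kernel<float, 4, 2>;
using UnalignedName = disabled_sg_loadstore_wrapper_krn<AlignedName>;

// The unaligned submission gets its own name, so the two parallel_for calls
// never register different kernel bodies under the same name.
static_assert(!std::is_same_v<AlignedName, UnalignedName>, "names must differ");

int main() { return 0; }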
43 changes: 37 additions & 6 deletions dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp
@@ -29,6 +29,7 @@
#include <sycl/sycl.hpp>
#include <type_traits>

#include "kernels/alignment.hpp"
#include "utils/offset_utils.hpp"
#include "utils/type_utils.hpp"

@@ -44,6 +45,11 @@ namespace copy_and_cast
namespace py = pybind11;
using namespace dpctl::tensor::offset_utils;

using dpctl::tensor::kernels::alignment_utils::
disabled_sg_loadstore_wrapper_krn;
using dpctl::tensor::kernels::alignment_utils::is_aligned;
using dpctl::tensor::kernels::alignment_utils::required_alignment;

template <typename srcT, typename dstT, typename IndexerT>
class copy_cast_generic_kernel;

@@ -200,7 +206,8 @@ template <typename srcT,
typename dstT,
typename CastFnT,
int vec_sz = 4,
int n_vecs = 2>
int n_vecs = 2,
bool enable_sg_loadstore = true>
class ContigCopyFunctor
{
private:
@@ -219,7 +226,9 @@ class ContigCopyFunctor
CastFnT fn{};

using dpctl::tensor::type_utils::is_complex;
if constexpr (is_complex<srcT>::value || is_complex<dstT>::value) {
if constexpr (!enable_sg_loadstore || is_complex<srcT>::value ||
is_complex<dstT>::value)
{
std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0];
size_t base = ndit.get_global_linear_id();

@@ -326,10 +335,32 @@ sycl::event copy_and_cast_contig_impl(sycl::queue &q,
const auto gws_range = sycl::range<1>(n_groups * lws);
const auto lws_range = sycl::range<1>(lws);

cgh.parallel_for<copy_cast_contig_kernel<srcTy, dstTy, n_vecs, vec_sz>>(
sycl::nd_range<1>(gws_range, lws_range),
ContigCopyFunctor<srcTy, dstTy, Caster<srcTy, dstTy>, vec_sz,
n_vecs>(nelems, src_tp, dst_tp));
if (is_aligned<required_alignment>(src_cp) &&
is_aligned<required_alignment>(dst_cp))
{
constexpr bool enable_sg_loadstore = true;
using KernelName =
copy_cast_contig_kernel<srcTy, dstTy, vec_sz, n_vecs>;

cgh.parallel_for<KernelName>(
sycl::nd_range<1>(gws_range, lws_range),
ContigCopyFunctor<srcTy, dstTy, Caster<srcTy, dstTy>, vec_sz,
n_vecs, enable_sg_loadstore>(nelems, src_tp,
dst_tp));
}
else {
constexpr bool disable_sg_loadstore = false;
using InnerKernelName =
copy_cast_contig_kernel<srcTy, dstTy, vec_sz, n_vecs>;
using KernelName =
disabled_sg_loadstore_wrapper_krn<InnerKernelName>;

cgh.parallel_for<KernelName>(
sycl::nd_range<1>(gws_range, lws_range),
ContigCopyFunctor<srcTy, dstTy, Caster<srcTy, dstTy>, vec_sz,
n_vecs, disable_sg_loadstore>(nelems, src_tp,
dst_tp));
}
});

return copy_and_cast_ev;
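When enable_sg_loadstore is false (or a complex type is involved), ContigCopyFunctor takes the plain per-work-item path whose first two lines are visible in the hunk above. A hedged sketch of that kind of fallback loop — the names and the exact indexing are illustrative, not the dpctl implementation:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <sycl/sycl.hpp>

// Scalar fallback: no sycl::vec loads/stores, just a strided loop, so no
// particular pointer alignment is assumed.
template <typename T, typename UnaryOp>
void scalar_fallback(sycl::nd_item<1> ndit,
                     const T *src,
                     T *dst,
                     std::size_t nelems,
                     std::uint32_t elems_per_wi, // e.g. vec_sz * n_vecs
                     UnaryOp fn)
{
    const std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0];
    std::size_t base = ndit.get_global_linear_id();
    // Group work-items by sub-group so neighboring lanes touch neighboring
    // elements, then stride by the sub-group size within each slice.
    base = (base / sgSize) * sgSize * elems_per_wi + (base % sgSize);
    const std::size_t upper = std::min(nelems, base + sgSize * elems_per_wi);
    for (std::size_t offset = base; offset < upper; offset += sgSize) {
        dst[offset] = fn(src[offset]);
    }
}

The aligned path, by contrast, keeps the sub-group load/store vectorization; the alignment check in each contig impl decides which variant gets submitted.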
@@ -132,9 +132,15 @@ template <typename argT, typename resT> struct AbsFunctor
template <typename argT,
typename resT = argT,
unsigned int vec_sz = 4,
unsigned int n_vecs = 2>
using AbsContigFunctor = elementwise_common::
UnaryContigFunctor<argT, resT, AbsFunctor<argT, resT>, vec_sz, n_vecs>;
unsigned int n_vecs = 2,
bool enable_sg_loadstore = true>
using AbsContigFunctor =
elementwise_common::UnaryContigFunctor<argT,
resT,
AbsFunctor<argT, resT>,
vec_sz,
n_vecs,
enable_sg_loadstore>;

template <typename T> struct AbsOutputType
{
@@ -145,9 +145,15 @@ template <typename argT, typename resT> struct AcosFunctor
template <typename argTy,
typename resTy = argTy,
unsigned int vec_sz = 4,
unsigned int n_vecs = 2>
using AcosContigFunctor = elementwise_common::
UnaryContigFunctor<argTy, resTy, AcosFunctor<argTy, resTy>, vec_sz, n_vecs>;
unsigned int n_vecs = 2,
bool enable_sg_loadstore = true>
using AcosContigFunctor =
elementwise_common::UnaryContigFunctor<argTy,
resTy,
AcosFunctor<argTy, resTy>,
vec_sz,
n_vecs,
enable_sg_loadstore>;

template <typename argTy, typename resTy, typename IndexerT>
using AcosStridedFunctor = elementwise_common::
@@ -167,13 +167,15 @@ template <typename argT, typename resT> struct AcoshFunctor
template <typename argTy,
typename resTy = argTy,
unsigned int vec_sz = 4,
unsigned int n_vecs = 2>
unsigned int n_vecs = 2,
bool enable_sg_loadstore = true>
using AcoshContigFunctor =
elementwise_common::UnaryContigFunctor<argTy,
resTy,
AcoshFunctor<argTy, resTy>,
vec_sz,
n_vecs>;
n_vecs,
enable_sg_loadstore>;

template <typename argTy, typename resTy, typename IndexerT>
using AcoshStridedFunctor = elementwise_common::
@@ -123,14 +123,16 @@ template <typename argT1,
typename argT2,
typename resT,
unsigned int vec_sz = 4,
unsigned int n_vecs = 2>
unsigned int n_vecs = 2,
bool enable_sg_loadstore = true>
using AddContigFunctor =
elementwise_common::BinaryContigFunctor<argT1,
argT2,
resT,
AddFunctor<argT1, argT2, resT>,
vec_sz,
n_vecs>;
n_vecs,
enable_sg_loadstore>;

template <typename argT1, typename argT2, typename resT, typename IndexerT>
using AddStridedFunctor =
@@ -425,13 +427,15 @@ template <typename argT, typename resT> struct AddInplaceFunctor
template <typename argT,
typename resT,
unsigned int vec_sz = 4,
unsigned int n_vecs = 2>
unsigned int n_vecs = 2,
bool enable_sg_loadstore = true>
using AddInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor<
argT,
resT,
AddInplaceFunctor<argT, resT>,
vec_sz,
n_vecs>;
n_vecs,
enable_sg_loadstore>;

template <typename argT, typename resT, typename IndexerT>
using AddInplaceStridedFunctor =
@@ -80,13 +80,15 @@ template <typename argT, typename resT> struct AngleFunctor
template <typename argTy,
typename resTy = argTy,
unsigned int vec_sz = 4,
unsigned int n_vecs = 2>
unsigned int n_vecs = 2,
bool enable_sg_loadstore = true>
using AngleContigFunctor =
elementwise_common::UnaryContigFunctor<argTy,
resTy,
AngleFunctor<argTy, resTy>,
vec_sz,
n_vecs>;
n_vecs,
enable_sg_loadstore>;

template <typename argTy, typename resTy, typename IndexerT>
using AngleStridedFunctor = elementwise_common::
@@ -169,9 +169,15 @@ template <typename argT, typename resT> struct AsinFunctor
template <typename argTy,
typename resTy = argTy,
unsigned int vec_sz = 4,
unsigned int n_vecs = 2>
using AsinContigFunctor = elementwise_common::
UnaryContigFunctor<argTy, resTy, AsinFunctor<argTy, resTy>, vec_sz, n_vecs>;
unsigned int n_vecs = 2,
bool enable_sg_loadstore = true>
using AsinContigFunctor =
elementwise_common::UnaryContigFunctor<argTy,
resTy,
AsinFunctor<argTy, resTy>,
vec_sz,
n_vecs,
enable_sg_loadstore>;

template <typename argTy, typename resTy, typename IndexerT>
using AsinStridedFunctor = elementwise_common::
@@ -143,13 +143,15 @@ template <typename argT, typename resT> struct AsinhFunctor
template <typename argTy,
typename resTy = argTy,
unsigned int vec_sz = 4,
unsigned int n_vecs = 2>
unsigned int n_vecs = 2,
bool enable_sg_loadstore = true>
using AsinhContigFunctor =
elementwise_common::UnaryContigFunctor<argTy,
resTy,
AsinhFunctor<argTy, resTy>,
vec_sz,
n_vecs>;
n_vecs,
enable_sg_loadstore>;

template <typename argTy, typename resTy, typename IndexerT>
using AsinhStridedFunctor = elementwise_common::
@@ -145,9 +145,15 @@ template <typename argT, typename resT> struct AtanFunctor
template <typename argTy,
typename resTy = argTy,
unsigned int vec_sz = 4,
unsigned int n_vecs = 2>
using AtanContigFunctor = elementwise_common::
UnaryContigFunctor<argTy, resTy, AtanFunctor<argTy, resTy>, vec_sz, n_vecs>;
unsigned int n_vecs = 2,
bool enable_sg_loadstore = true>
using AtanContigFunctor =
elementwise_common::UnaryContigFunctor<argTy,
resTy,
AtanFunctor<argTy, resTy>,
vec_sz,
n_vecs,
enable_sg_loadstore>;

template <typename argTy, typename resTy, typename IndexerT>
using AtanStridedFunctor = elementwise_common::
@@ -70,14 +70,16 @@ template <typename argT1,
typename argT2,
typename resT,
unsigned int vec_sz = 4,
unsigned int n_vecs = 2>
unsigned int n_vecs = 2,
bool enable_sg_loadstore = true>
using Atan2ContigFunctor =
elementwise_common::BinaryContigFunctor<argT1,
argT2,
resT,
Atan2Functor<argT1, argT2, resT>,
vec_sz,
n_vecs>;
n_vecs,
enable_sg_loadstore>;

template <typename argT1, typename argT2, typename resT, typename IndexerT>
using Atan2StridedFunctor =