From e2d2b3f2686122961cbbf9e096cafb9a52b9b048 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Tue, 25 Feb 2025 09:21:38 -0800 Subject: [PATCH 1/6] Add missing kernels to arithmetic accumulators bool->bool and int8->int8 overloads were missing in cumulative_sum and cumulative_prod --- .../libtensor/include/kernels/accumulators.hpp | 10 +++++++++- .../source/accumulators/cumulative_prod.cpp | 18 ++++++++++++++---- .../source/accumulators/cumulative_sum.cpp | 18 ++++++++++++++---- 3 files changed, 37 insertions(+), 9 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp index 731807c7d0..22229b3d9d 100644 --- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -84,9 +84,17 @@ template struct CastTransformer } }; +template struct needs_workaround +{ + static constexpr bool value = + std::is_same_v> || + std::is_same_v>; +}; + template struct can_use_inclusive_scan_over_group { - static constexpr bool value = sycl::has_known_identity::value; + static constexpr bool value = sycl::has_known_identity::value && + !needs_workaround::value; }; namespace detail diff --git a/dpctl/tensor/libtensor/source/accumulators/cumulative_prod.cpp b/dpctl/tensor/libtensor/source/accumulators/cumulative_prod.cpp index b9b3b684f6..234d10c0d2 100644 --- a/dpctl/tensor/libtensor/source/accumulators/cumulative_prod.cpp +++ b/dpctl/tensor/libtensor/source/accumulators/cumulative_prod.cpp @@ -70,10 +70,12 @@ template struct TypePairSupportDataForProdAccumulation { static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, // input int8_t + td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, @@ -138,7 +140,9 @@ struct CumProd1DContigFactory if constexpr (TypePairSupportDataForProdAccumulation::is_defined) { - using ScanOpT = sycl::multiplies; + using ScanOpT = std::conditional_t, + sycl::logical_and, + sycl::multiplies>; constexpr bool include_initial = false; if constexpr (std::is_same_v) { using dpctl::tensor::kernels::accumulators::NoOpTransformer; @@ -171,7 +175,9 @@ struct CumProd1DIncludeInitialContigFactory if constexpr (TypePairSupportDataForProdAccumulation::is_defined) { - using ScanOpT = sycl::multiplies; + using ScanOpT = std::conditional_t, + sycl::logical_and, + sycl::multiplies>; constexpr bool include_initial = true; if constexpr (std::is_same_v) { using dpctl::tensor::kernels::accumulators::NoOpTransformer; @@ -204,7 +210,9 @@ struct CumProdStridedFactory if constexpr (TypePairSupportDataForProdAccumulation::is_defined) { - using ScanOpT = sycl::multiplies; + using ScanOpT = std::conditional_t, + sycl::logical_and, + sycl::multiplies>; constexpr bool include_initial = false; if constexpr (std::is_same_v) { using dpctl::tensor::kernels::accumulators::NoOpTransformer; @@ -237,7 +245,9 @@ struct CumProdIncludeInitialStridedFactory if constexpr (TypePairSupportDataForProdAccumulation::is_defined) { - using ScanOpT = sycl::multiplies; + using ScanOpT = std::conditional_t, + sycl::logical_and, + sycl::multiplies>; constexpr bool include_initial = true; if constexpr (std::is_same_v) { using dpctl::tensor::kernels::accumulators::NoOpTransformer; diff --git a/dpctl/tensor/libtensor/source/accumulators/cumulative_sum.cpp b/dpctl/tensor/libtensor/source/accumulators/cumulative_sum.cpp index 2e6cfddfb6..831c35d808 100644 --- a/dpctl/tensor/libtensor/source/accumulators/cumulative_sum.cpp +++ b/dpctl/tensor/libtensor/source/accumulators/cumulative_sum.cpp @@ -70,10 +70,12 @@ template struct TypePairSupportDataForSumAccumulation { static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, // input int8_t + td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, @@ -138,7 +140,9 @@ struct CumSum1DContigFactory if constexpr (TypePairSupportDataForSumAccumulation::is_defined) { - using ScanOpT = sycl::plus; + using ScanOpT = + std::conditional_t, + sycl::logical_or, sycl::plus>; constexpr bool include_initial = false; if constexpr (std::is_same_v) { using dpctl::tensor::kernels::accumulators::NoOpTransformer; @@ -171,7 +175,9 @@ struct CumSum1DIncludeInitialContigFactory if constexpr (TypePairSupportDataForSumAccumulation::is_defined) { - using ScanOpT = sycl::plus; + using ScanOpT = + std::conditional_t, + sycl::logical_or, sycl::plus>; constexpr bool include_initial = true; if constexpr (std::is_same_v) { using dpctl::tensor::kernels::accumulators::NoOpTransformer; @@ -204,7 +210,9 @@ struct CumSumStridedFactory if constexpr (TypePairSupportDataForSumAccumulation::is_defined) { - using ScanOpT = sycl::plus; + using ScanOpT = + std::conditional_t, + sycl::logical_or, sycl::plus>; constexpr bool include_initial = false; if constexpr (std::is_same_v) { using dpctl::tensor::kernels::accumulators::NoOpTransformer; @@ -237,7 +245,9 @@ struct CumSumIncludeInitialStridedFactory if constexpr (TypePairSupportDataForSumAccumulation::is_defined) { - using ScanOpT = sycl::plus; + using ScanOpT = + std::conditional_t, + sycl::logical_or, sycl::plus>; constexpr bool include_initial = true; if constexpr (std::is_same_v) { using dpctl::tensor::kernels::accumulators::NoOpTransformer; From 47239cdd6f01bb09e2f010eb82ce2a56c3253776 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Tue, 25 Feb 2025 09:21:48 -0800 Subject: [PATCH 2/6] Add tests based on gh-2017 --- dpctl/tests/test_tensor_accumulation.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/dpctl/tests/test_tensor_accumulation.py b/dpctl/tests/test_tensor_accumulation.py index 962d2742a0..9c8eec91d1 100644 --- a/dpctl/tests/test_tensor_accumulation.py +++ b/dpctl/tests/test_tensor_accumulation.py @@ -421,3 +421,15 @@ def test_cumulative_sum_gh_1901(p): inp = dpt.ones(n, dtype=dt) r = dpt.cumulative_sum(inp, dtype=dt) assert dpt.all(r == dpt.arange(1, n + 1, dtype=dt)) + + +@pytest.mark.parametrize( + "dt", ["i1", "i2", "i4", "i8", "f2", "f4", "f8", "c8", "c16"] +) +def test_gh_2017(dt): + "See https://github.com/IntelPython/dpctl/issues/2017" + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + x = dpt.asarray([-1, 1], dtype=dpt.dtype(dt), sycl_queue=q) + r = dpt.cumulative_sum(x, dtype="?") + assert dpt.all(r) From 48b0e663a61f4156e4d3c6af9ef00fb64b7fe000 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Tue, 25 Feb 2025 19:33:29 -0800 Subject: [PATCH 3/6] Add comment specifying need for work-around in accumulators --- dpctl/tensor/libtensor/include/kernels/accumulators.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp index 22229b3d9d..627dfb602b 100644 --- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -86,6 +86,7 @@ template struct CastTransformer template struct needs_workaround { + // work-around needed due to crash in JITing on CPU static constexpr bool value = std::is_same_v> || std::is_same_v>; From c58d77508a60ad6d4705d3c1cbf4559eef774d17 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 26 Feb 2025 11:10:07 -0800 Subject: [PATCH 4/6] Reuse SYCL utils in workaround for logical operators in accumulators --- dpctl/tensor/libtensor/include/kernels/accumulators.hpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp index 627dfb602b..eb4258ea8a 100644 --- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -47,6 +47,8 @@ namespace kernels namespace accumulators { +namespace su_ns = dpctl::tensor::sycl_utils; + using dpctl::tensor::ssize_t; using namespace dpctl::tensor::offset_utils; @@ -87,9 +89,8 @@ template struct CastTransformer template struct needs_workaround { // work-around needed due to crash in JITing on CPU - static constexpr bool value = - std::is_same_v> || - std::is_same_v>; + static constexpr bool value = su_ns::IsSyclLogicalAnd::value || + su_ns::IsSyclLogicalOr::value; }; template struct can_use_inclusive_scan_over_group @@ -153,8 +154,6 @@ template class stack_strided_t // Iterative cumulative summation -namespace su_ns = dpctl::tensor::sycl_utils; - using nwiT = std::uint32_t; template Date: Wed, 26 Feb 2025 14:34:35 -0800 Subject: [PATCH 5/6] Alias conditional binary op type for cumulative_sum and cumulative_prod Reduces code repetition --- .../source/accumulators/cumulative_prod.cpp | 21 ++++++++----------- .../source/accumulators/cumulative_sum.cpp | 20 +++++++----------- 2 files changed, 17 insertions(+), 24 deletions(-) diff --git a/dpctl/tensor/libtensor/source/accumulators/cumulative_prod.cpp b/dpctl/tensor/libtensor/source/accumulators/cumulative_prod.cpp index 234d10c0d2..045b1b330e 100644 --- a/dpctl/tensor/libtensor/source/accumulators/cumulative_prod.cpp +++ b/dpctl/tensor/libtensor/source/accumulators/cumulative_prod.cpp @@ -132,6 +132,11 @@ struct TypePairSupportDataForProdAccumulation td_ns::NotDefinedEntry>::is_defined; }; +template +using CumProdScanOpT = std::conditional_t, + sycl::logical_and, + sycl::multiplies>; + template struct CumProd1DContigFactory { @@ -140,9 +145,7 @@ struct CumProd1DContigFactory if constexpr (TypePairSupportDataForProdAccumulation::is_defined) { - using ScanOpT = std::conditional_t, - sycl::logical_and, - sycl::multiplies>; + using ScanOpT = CumProdScanOpT; constexpr bool include_initial = false; if constexpr (std::is_same_v) { using dpctl::tensor::kernels::accumulators::NoOpTransformer; @@ -175,9 +178,7 @@ struct CumProd1DIncludeInitialContigFactory if constexpr (TypePairSupportDataForProdAccumulation::is_defined) { - using ScanOpT = std::conditional_t, - sycl::logical_and, - sycl::multiplies>; + using ScanOpT = CumProdScanOpT; constexpr bool include_initial = true; if constexpr (std::is_same_v) { using dpctl::tensor::kernels::accumulators::NoOpTransformer; @@ -210,9 +211,7 @@ struct CumProdStridedFactory if constexpr (TypePairSupportDataForProdAccumulation::is_defined) { - using ScanOpT = std::conditional_t, - sycl::logical_and, - sycl::multiplies>; + using ScanOpT = CumProdScanOpT; constexpr bool include_initial = false; if constexpr (std::is_same_v) { using dpctl::tensor::kernels::accumulators::NoOpTransformer; @@ -245,9 +244,7 @@ struct CumProdIncludeInitialStridedFactory if constexpr (TypePairSupportDataForProdAccumulation::is_defined) { - using ScanOpT = std::conditional_t, - sycl::logical_and, - sycl::multiplies>; + using ScanOpT = CumProdScanOpT; constexpr bool include_initial = true; if constexpr (std::is_same_v) { using dpctl::tensor::kernels::accumulators::NoOpTransformer; diff --git a/dpctl/tensor/libtensor/source/accumulators/cumulative_sum.cpp b/dpctl/tensor/libtensor/source/accumulators/cumulative_sum.cpp index 831c35d808..e44678e15f 100644 --- a/dpctl/tensor/libtensor/source/accumulators/cumulative_sum.cpp +++ b/dpctl/tensor/libtensor/source/accumulators/cumulative_sum.cpp @@ -132,6 +132,10 @@ struct TypePairSupportDataForSumAccumulation td_ns::NotDefinedEntry>::is_defined; }; +template +using CumSumScanOpT = std:: + conditional_t, sycl::logical_or, sycl::plus>; + template struct CumSum1DContigFactory { @@ -140,9 +144,7 @@ struct CumSum1DContigFactory if constexpr (TypePairSupportDataForSumAccumulation::is_defined) { - using ScanOpT = - std::conditional_t, - sycl::logical_or, sycl::plus>; + using ScanOpT = CumSumScanOpT; constexpr bool include_initial = false; if constexpr (std::is_same_v) { using dpctl::tensor::kernels::accumulators::NoOpTransformer; @@ -175,9 +177,7 @@ struct CumSum1DIncludeInitialContigFactory if constexpr (TypePairSupportDataForSumAccumulation::is_defined) { - using ScanOpT = - std::conditional_t, - sycl::logical_or, sycl::plus>; + using ScanOpT = CumSumScanOpT; constexpr bool include_initial = true; if constexpr (std::is_same_v) { using dpctl::tensor::kernels::accumulators::NoOpTransformer; @@ -210,9 +210,7 @@ struct CumSumStridedFactory if constexpr (TypePairSupportDataForSumAccumulation::is_defined) { - using ScanOpT = - std::conditional_t, - sycl::logical_or, sycl::plus>; + using ScanOpT = CumSumScanOpT; constexpr bool include_initial = false; if constexpr (std::is_same_v) { using dpctl::tensor::kernels::accumulators::NoOpTransformer; @@ -245,9 +243,7 @@ struct CumSumIncludeInitialStridedFactory if constexpr (TypePairSupportDataForSumAccumulation::is_defined) { - using ScanOpT = - std::conditional_t, - sycl::logical_or, sycl::plus>; + using ScanOpT = CumSumScanOpT; constexpr bool include_initial = true; if constexpr (std::is_same_v) { using dpctl::tensor::kernels::accumulators::NoOpTransformer; From fa525a70426168a781df1cc0790e37645f509fa4 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 27 Feb 2025 18:47:22 -0800 Subject: [PATCH 6/6] Add comment to needs_workaround in accumulators.hpp Documents the ticket reported for the JIT failure on CPU --- dpctl/tensor/libtensor/include/kernels/accumulators.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp index eb4258ea8a..f0f72aee82 100644 --- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -88,7 +88,8 @@ template struct CastTransformer template struct needs_workaround { - // work-around needed due to crash in JITing on CPU + // workaround needed due to crash in JITing on CPU + // remove when CMPLRLLVM-65813 is resolved static constexpr bool value = su_ns::IsSyclLogicalAnd::value || su_ns::IsSyclLogicalOr::value; };