From ee7d6fb7a136cf2ead23ad2c2e2250625f999928 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 20 Nov 2024 14:28:12 -0600 Subject: [PATCH 1/4] Stylistic changes in accumulators.hpp Use explicit type instead of auto for readability, save common sub-expression to a temporary variable. Move std::array variable declaration closer to its first use. --- .../libtensor/include/kernels/accumulators.hpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp index f4a97c20b9..c364e78fb1 100644 --- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -215,16 +215,17 @@ inclusive_scan_base_step(sycl::queue &exec_q, const size_t gid = it.get_global_id(0); const size_t lid = it.get_local_id(0); - const size_t iter_gid = gid / (acc_groups * wg_size); - const size_t chunk_gid = gid - (iter_gid * acc_groups * wg_size); + const size_t reduce_chunks = acc_groups * wg_size; + const size_t iter_gid = gid / reduce_chunks; + const size_t chunk_gid = gid - (iter_gid * reduce_chunks); - std::array local_iscan; - - size_t i = chunk_gid * n_wi; + const size_t i = chunk_gid * n_wi; const auto &iter_offsets = iter_indexer(iter_gid); const auto &inp_iter_offset = iter_offsets.get_first_offset(); const auto &out_iter_offset = iter_offsets.get_second_offset(); + std::array local_iscan; + #pragma unroll for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { const size_t i_m_wi = i + m_wi; @@ -324,7 +325,7 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q, std::vector &host_tasks, const std::vector &depends = {}) { - ScanOpT scan_op = ScanOpT(); + ScanOpT scan_op{}; constexpr outputT identity = su_ns::Identity::value; constexpr size_t _iter_nelems = 1; @@ -352,9 +353,9 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q, size_t n_groups_ = n_groups; size_t temp_size = 0; while (n_groups_ > 1) { - const auto this_size = (n_groups_ - 1); + const size_t this_size = (n_groups_ - 1); temp_size += this_size; - n_groups_ = ceiling_quotient(this_size, chunk_size); + n_groups_ = ceiling_quotient(this_size, chunk_size); } // allocate From d79831e03d93df2a3978d97b2621a8a1c34d5339 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 20 Nov 2024 14:29:39 -0600 Subject: [PATCH 2/4] Fix a lapse in logic for computing upper bound of iteration This fixes gh-1901 Ensure that both start and end indices are within reduction bounds --- dpctl/tensor/libtensor/include/kernels/accumulators.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp index c364e78fb1..64ed566151 100644 --- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -280,8 +280,9 @@ inclusive_scan_base_step(sycl::queue &exec_q, local_iscan[m_wi] = scan_op(local_iscan[m_wi], addand); } - const nwiT m_max = - std::min(n_wi, std::max(i, acc_nelems) - i); + const size_t start = std::min(i, acc_nelems); + const size_t end = std::min(i + n_wi, acc_nelems); + const nwiT m_max = static_cast(end - start); for (nwiT m_wi = 0; m_wi < m_max; ++m_wi) { output[out_iter_offset + out_indexer(i + m_wi)] = local_iscan[m_wi]; From abb157298242f610d64ea619119c64f0396796a7 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 20 Nov 2024 14:33:33 -0600 Subject: [PATCH 3/4] Add changelog entry for fixing of gh-1901 --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9b50f2e640..5523c684ca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * Fix for `tensor.result_type` when all inputs are Python built-in scalars [gh-1877](https://github.com/IntelPython/dpctl/pull/1877) * Improved error in constructors `tensor.full` and `tensor.full_like` when provided a non-numeric fill value [gh-1878](https://github.com/IntelPython/dpctl/pull/1878) * Added a check for pointer alignment when copying to C-contiguous memory [gh-1890](https://github.com/IntelPython/dpctl/pull/1890) +* Fixed incorrect result (issue [gh-1901](https://github.com/IntelPython/dpctl/issues/1901)) in `tensor.cumulative_sum` and in advanced indexing [gh-1902](https://github.com/IntelPython/dpctl/pull/1902) ### Maintenance From d133a1137bfa31d8ae30ad85960a075ad09ad1c6 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 20 Nov 2024 15:50:32 -0600 Subject: [PATCH 4/4] Add a test based on example from gh-1901 --- dpctl/tests/test_tensor_accumulation.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/dpctl/tests/test_tensor_accumulation.py b/dpctl/tests/test_tensor_accumulation.py index 557cd3b3ff..4ce519ffe4 100644 --- a/dpctl/tests/test_tensor_accumulation.py +++ b/dpctl/tests/test_tensor_accumulation.py @@ -410,3 +410,14 @@ def test_cumulative_logsumexp_closed_form(fpdt): tol = 4 * dpt.finfo(fpdt).eps assert dpt.allclose(r, expected, atol=tol, rtol=tol) + + +@pytest.mark.parametrize("p", [257, 260, 273, 280, 509, 512]) +def test_cumulative_sum_gh_1901(p): + get_queue_or_skip() + + n = p * p + dt = dpt.int32 + inp = dpt.ones(n, dtype=dt) + r = dpt.cumulative_sum(inp, dtype=dt) + assert dpt.all(r == dpt.arange(1, n + 1, dtype=dt))