Skip to content

Commit 869dfb2

Browse files
Merge pull request #1902 from IntelPython/bugfix/gh-1901-scan-algo
Bugfix/gh 1901 scan algo
2 parents 9b3341a + d133a11 commit 869dfb2

File tree

3 files changed, with 24 additions and 10 deletions.

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
2121
* Fix for `tensor.result_type` when all inputs are Python built-in scalars [gh-1877](https://github.com/IntelPython/dpctl/pull/1877)
2222
* Improved error in constructors `tensor.full` and `tensor.full_like` when provided a non-numeric fill value [gh-1878](https://github.com/IntelPython/dpctl/pull/1878)
2323
* Added a check for pointer alignment when copying to C-contiguous memory [gh-1890](https://github.com/IntelPython/dpctl/pull/1890)
24+
* Fixed incorrect results in `tensor.cumulative_sum` and in advanced indexing (issue [gh-1901](https://github.com/IntelPython/dpctl/issues/1901)) [gh-1902](https://github.com/IntelPython/dpctl/pull/1902)
2425

2526
### Maintenance
2627

dpctl/tensor/libtensor/include/kernels/accumulators.hpp

Lines changed: 12 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -215,16 +215,17 @@ inclusive_scan_base_step(sycl::queue &exec_q,
215215
const size_t gid = it.get_global_id(0);
216216
const size_t lid = it.get_local_id(0);
217217

218-
const size_t iter_gid = gid / (acc_groups * wg_size);
219-
const size_t chunk_gid = gid - (iter_gid * acc_groups * wg_size);
218+
const size_t reduce_chunks = acc_groups * wg_size;
219+
const size_t iter_gid = gid / reduce_chunks;
220+
const size_t chunk_gid = gid - (iter_gid * reduce_chunks);
220221

221-
std::array<outputT, n_wi> local_iscan;
222-
223-
size_t i = chunk_gid * n_wi;
222+
const size_t i = chunk_gid * n_wi;
224223
const auto &iter_offsets = iter_indexer(iter_gid);
225224
const auto &inp_iter_offset = iter_offsets.get_first_offset();
226225
const auto &out_iter_offset = iter_offsets.get_second_offset();
227226

227+
std::array<outputT, n_wi> local_iscan;
228+
228229
#pragma unroll
229230
for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) {
230231
const size_t i_m_wi = i + m_wi;
@@ -279,8 +280,9 @@ inclusive_scan_base_step(sycl::queue &exec_q,
279280
local_iscan[m_wi] = scan_op(local_iscan[m_wi], addand);
280281
}
281282

282-
const nwiT m_max =
283-
std::min<nwiT>(n_wi, std::max(i, acc_nelems) - i);
283+
const size_t start = std::min(i, acc_nelems);
284+
const size_t end = std::min(i + n_wi, acc_nelems);
285+
const nwiT m_max = static_cast<nwiT>(end - start);
284286
for (nwiT m_wi = 0; m_wi < m_max; ++m_wi) {
285287
output[out_iter_offset + out_indexer(i + m_wi)] =
286288
local_iscan[m_wi];
@@ -324,7 +326,7 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q,
324326
std::vector<sycl::event> &host_tasks,
325327
const std::vector<sycl::event> &depends = {})
326328
{
327-
ScanOpT scan_op = ScanOpT();
329+
ScanOpT scan_op{};
328330
constexpr outputT identity = su_ns::Identity<ScanOpT, outputT>::value;
329331

330332
constexpr size_t _iter_nelems = 1;
@@ -352,9 +354,9 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q,
352354
size_t n_groups_ = n_groups;
353355
size_t temp_size = 0;
354356
while (n_groups_ > 1) {
355-
const auto this_size = (n_groups_ - 1);
357+
const size_t this_size = (n_groups_ - 1);
356358
temp_size += this_size;
357-
n_groups_ = ceiling_quotient<size_t>(this_size, chunk_size);
359+
n_groups_ = ceiling_quotient(this_size, chunk_size);
358360
}
359361

360362
// allocate

dpctl/tests/test_tensor_accumulation.py

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -410,3 +410,14 @@ def test_cumulative_logsumexp_closed_form(fpdt):
410410

411411
tol = 4 * dpt.finfo(fpdt).eps
412412
assert dpt.allclose(r, expected, atol=tol, rtol=tol)
413+
414+
415+
@pytest.mark.parametrize("p", [257, 260, 273, 280, 509, 512])
416+
def test_cumulative_sum_gh_1901(p):
417+
get_queue_or_skip()
418+
419+
n = p * p
420+
dt = dpt.int32
421+
inp = dpt.ones(n, dtype=dt)
422+
r = dpt.cumulative_sum(inp, dtype=dt)
423+
assert dpt.all(r == dpt.arange(1, n + 1, dtype=dt))

0 commit comments

Comments
 (0)