From ef747f3a94c13f7d87f44014e7f1641bc2ea1dd9 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 31 Mar 2024 23:59:24 -0500 Subject: [PATCH] Need barrier after call to custom inclusive scan to avoid race condition added comments explaining why barriers are needed --- .../libtensor/include/kernels/accumulators.hpp | 3 +++ .../libtensor/include/utils/sycl_utils.hpp | 16 +++++++++------- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp index aba0a89dab..18e0e1bc8a 100644 --- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -290,6 +290,9 @@ inclusive_scan_base_step(sycl::queue &exec_q, else { wg_iscan_val = su_ns::custom_inclusive_scan_over_group( it.get_group(), slm_iscan_tmp, local_iscan.back(), scan_op); + // ensure all finished reading from SLM, to avoid race condition + // with subsequent writes into SLM + it.barrier(sycl::access::fence_space::local_space); } slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val; diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp index cbc96f7fb7..75ab1d9341 100644 --- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp @@ -25,6 +25,7 @@ #pragma once #include #include +#include #include #include #include @@ -160,22 +161,23 @@ T custom_inclusive_scan_over_group(const GroupT &wg, const T local_val, const OpT &op) { - auto local_id = wg.get_local_id(0); - auto wgs = wg.get_local_range(0); + const std::uint32_t local_id = wg.get_local_id(0); + const std::uint32_t wgs = wg.get_local_range(0); local_mem_acc[local_id] = local_val; sycl::group_barrier(wg, sycl::memory_scope::work_group); if (wg.leader()) { - for (size_t i = 1; i < wgs; ++i) { - local_mem_acc[i] = op(local_mem_acc[i], local_mem_acc[i - 1]); + T scan_val = local_mem_acc[0]; + for (std::uint32_t i = 1; i < wgs; ++i) { + scan_val = op(local_mem_acc[i], scan_val); + local_mem_acc[i] = scan_val; } } - T accumulated_local_val = local_mem_acc[local_id]; + // ensure all work-items see the same SLM that leader updated sycl::group_barrier(wg, sycl::memory_scope::work_group); - - return accumulated_local_val; + return local_mem_acc[local_id]; } // Reduction functors