Skip to content

Commit 0d59c8f

Browse files
For HIP devices use smaller wg-size parameter
single_step_scan_striped does not produce correct results for wg_size > 64, and tests fail.
1 parent ce4cf5d commit 0d59c8f

File tree

1 file changed

+16
-4
lines changed

1 file changed

+16
-4
lines changed

dpctl/tensor/libtensor/include/kernels/accumulators.hpp

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -777,7 +777,10 @@ accumulate_1d_contig_impl(sycl::queue &q,
777777
}
778778
else {
779779
constexpr nwiT n_wi_for_gpu = 4;
780-
const std::size_t wg_size = 256;
780+
// base_scan_striped algorithm does not execute correctly
781+
// on HIP device with wg_size > 64
782+
const std::size_t wg_size =
783+
(q.get_backend() == sycl::backend::ext_oneapi_hip) ? 64 : 256;
781784
comp_ev = inclusive_scan_iter_1d<srcT, dstT, n_wi_for_gpu, NoOpIndexerT,
782785
transformerT, AccumulateOpT,
783786
include_initial>(
@@ -1181,7 +1184,10 @@ accumulate_strided_impl(sycl::queue &q,
11811184
}
11821185
else {
11831186
constexpr nwiT n_wi_for_gpu = 4;
1184-
const std::size_t wg_size = 256;
1187+
// base_scan_striped algorithm does not execute correctly
1188+
// on HIP device with wg_size > 64
1189+
const std::size_t wg_size =
1190+
(q.get_backend() == sycl::backend::ext_oneapi_hip) ? 64 : 256;
11851191
comp_ev =
11861192
inclusive_scan_iter<srcT, dstT, n_wi_for_gpu, InpIndexerT,
11871193
OutIndexerT, InpIndexerT, OutIndexerT,
@@ -1235,7 +1241,10 @@ std::size_t cumsum_val_contig_impl(sycl::queue &q,
12351241
}
12361242
else {
12371243
constexpr nwiT n_wi_for_gpu = 4;
1238-
const std::size_t wg_size = 256;
1244+
// base_scan_striped algorithm does not execute correctly
1245+
// on HIP device with wg_size > 64
1246+
const std::size_t wg_size =
1247+
(q.get_backend() == sycl::backend::ext_oneapi_hip) ? 64 : 256;
12391248
comp_ev = inclusive_scan_iter_1d<maskT, cumsumT, n_wi_for_gpu,
12401249
NoOpIndexerT, transformerT,
12411250
AccumulateOpT, include_initial>(
@@ -1346,7 +1355,10 @@ cumsum_val_strided_impl(sycl::queue &q,
13461355
}
13471356
else {
13481357
constexpr nwiT n_wi_for_gpu = 4;
1349-
const std::size_t wg_size = 256;
1358+
// base_scan_striped algorithm does not execute correctly
1359+
// on HIP device with wg_size > 64
1360+
const std::size_t wg_size =
1361+
(q.get_backend() == sycl::backend::ext_oneapi_hip) ? 64 : 256;
13501362
comp_ev = inclusive_scan_iter_1d<maskT, cumsumT, n_wi_for_gpu,
13511363
StridedIndexerT, transformerT,
13521364
AccumulateOpT, include_initial>(

0 commit comments

Comments
 (0)