@@ -777,7 +777,10 @@ accumulate_1d_contig_impl(sycl::queue &q,
777
777
}
778
778
else {
779
779
constexpr nwiT n_wi_for_gpu = 4 ;
780
- const std::size_t wg_size = 256 ;
780
+ // base_scan_striped algorithm does not execute correctly
781
+ // on HIP device with wg_size > 64
782
+ const std::size_t wg_size =
783
+ (q.get_backend () == sycl::backend::ext_oneapi_hip) ? 64 : 256 ;
781
784
comp_ev = inclusive_scan_iter_1d<srcT, dstT, n_wi_for_gpu, NoOpIndexerT,
782
785
transformerT, AccumulateOpT,
783
786
include_initial>(
@@ -1181,7 +1184,10 @@ accumulate_strided_impl(sycl::queue &q,
1181
1184
}
1182
1185
else {
1183
1186
constexpr nwiT n_wi_for_gpu = 4 ;
1184
- const std::size_t wg_size = 256 ;
1187
+ // base_scan_striped algorithm does not execute correctly
1188
+ // on HIP device with wg_size > 64
1189
+ const std::size_t wg_size =
1190
+ (q.get_backend () == sycl::backend::ext_oneapi_hip) ? 64 : 256 ;
1185
1191
comp_ev =
1186
1192
inclusive_scan_iter<srcT, dstT, n_wi_for_gpu, InpIndexerT,
1187
1193
OutIndexerT, InpIndexerT, OutIndexerT,
@@ -1235,7 +1241,10 @@ std::size_t cumsum_val_contig_impl(sycl::queue &q,
1235
1241
}
1236
1242
else {
1237
1243
constexpr nwiT n_wi_for_gpu = 4 ;
1238
- const std::size_t wg_size = 256 ;
1244
+ // base_scan_striped algorithm does not execute correctly
1245
+ // on HIP device with wg_size > 64
1246
+ const std::size_t wg_size =
1247
+ (q.get_backend () == sycl::backend::ext_oneapi_hip) ? 64 : 256 ;
1239
1248
comp_ev = inclusive_scan_iter_1d<maskT, cumsumT, n_wi_for_gpu,
1240
1249
NoOpIndexerT, transformerT,
1241
1250
AccumulateOpT, include_initial>(
@@ -1346,7 +1355,10 @@ cumsum_val_strided_impl(sycl::queue &q,
1346
1355
}
1347
1356
else {
1348
1357
constexpr nwiT n_wi_for_gpu = 4 ;
1349
- const std::size_t wg_size = 256 ;
1358
+ // base_scan_striped algorithm does not execute correctly
1359
+ // on HIP device with wg_size > 64
1360
+ const std::size_t wg_size =
1361
+ (q.get_backend () == sycl::backend::ext_oneapi_hip) ? 64 : 256 ;
1350
1362
comp_ev = inclusive_scan_iter_1d<maskT, cumsumT, n_wi_for_gpu,
1351
1363
StridedIndexerT, transformerT,
1352
1364
AccumulateOpT, include_initial>(
0 commit comments