@@ -113,11 +113,6 @@ template <typename T> class stack_t
         return src_;
     }
 
-    const T *get_src_const_ptr() const
-    {
-        return src_;
-    }
-
     size_t get_size() const
     {
         return size_;
@@ -150,11 +145,6 @@ template <typename T> class stack_strided_t
         return src_;
     }
 
-    const T *get_src_const_ptr() const
-    {
-        return src_;
-    }
-
     size_t get_size() const
     {
         return size_;
@@ -247,16 +237,16 @@ inclusive_scan_base_step(sycl::queue &exec_q,
         cgh.parallel_for<KernelName>(ndRange, [=, slm_iscan_tmp =
                                                       std::move(slm_iscan_tmp)](
                                                   sycl::nd_item<1> it) {
-            size_t gid = it.get_global_id(0);
-            size_t lid = it.get_local_id(0);
+            const size_t gid = it.get_global_id(0);
+            const size_t lid = it.get_local_id(0);
 
-            size_t iter_gid = gid / (acc_groups * wg_size);
-            size_t chunk_gid = gid - (iter_gid * acc_groups * wg_size);
+            const size_t iter_gid = gid / (acc_groups * wg_size);
+            const size_t chunk_gid = gid - (iter_gid * acc_groups * wg_size);
 
             std::array<outputT, n_wi> local_iscan;
 
             size_t i = chunk_gid * n_wi;
-            auto iter_offsets = iter_indexer(iter_gid);
+            const auto &iter_offsets = iter_indexer(iter_gid);
             const auto &inp_iter_offset = iter_offsets.get_first_offset();
             const auto &out_iter_offset = iter_offsets.get_second_offset();
 
@@ -377,7 +367,7 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q,
 
     sycl::event dependent_event = inc_scan_phase1_ev;
     if (n_groups > 1) {
-        auto chunk_size = wg_size * n_wi;
+        const size_t chunk_size = wg_size * n_wi;
 
         // how much of temporary allocation do we need
         size_t n_groups_ = n_groups;
@@ -407,7 +397,7 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q,
         size_t size_to_update = n_elems;
         while (n_groups_ > 1) {
 
-            size_t src_size = n_groups_ - 1;
+            const size_t src_size = n_groups_ - 1;
             dependent_event =
                 inclusive_scan_base_step<outputT, outputT, n_wi, IterIndexerT,
                                          NoOpIndexerT, NoOpIndexerT,
@@ -426,19 +416,19 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q,
         for (size_t reverse_stack_id = 0; reverse_stack_id < stack.size();
              ++reverse_stack_id)
         {
-            auto stack_id = stack.size() - 1 - reverse_stack_id;
+            const size_t stack_id = stack.size() - 1 - reverse_stack_id;
 
-            auto stack_elem = stack[stack_id];
+            const auto &stack_elem = stack[stack_id];
             outputT *src = stack_elem.get_src_ptr();
-            size_t src_size = stack_elem.get_size();
+            const size_t src_size = stack_elem.get_size();
             outputT *local_scans = stack_elem.get_local_scans_ptr();
 
             // output[ chunk_size * (i + 1) + j] += temp[i]
             dependent_event = exec_q.submit([&](sycl::handler &cgh) {
                 cgh.depends_on(dependent_event);
 
                 constexpr nwiT updates_per_wi = n_wi;
-                size_t n_items = ceiling_quotient<size_t>(src_size, n_wi);
+                const size_t n_items = ceiling_quotient<size_t>(src_size, n_wi);
 
                 using UpdateKernelName =
                     class inclusive_scan_1d_iter_chunk_update_krn<
@@ -448,12 +438,12 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q,
                 cgh.parallel_for<UpdateKernelName>(
                     {n_items}, [chunk_size, src, src_size, local_scans, scan_op,
                                 identity](auto wiid) {
-                        auto gid = n_wi * wiid[0];
+                        const size_t gid = n_wi * wiid[0];
 #pragma unroll
-                        for (auto i = 0; i < updates_per_wi; ++i) {
-                            auto src_id = gid + i;
+                        for (size_t i = 0; i < updates_per_wi; ++i) {
+                            const size_t src_id = gid + i;
                             if (src_id < src_size) {
-                                auto scan_id = (src_id / chunk_size);
+                                const size_t scan_id = (src_id / chunk_size);
                                 src[src_id] =
                                     (scan_id > 0)
                                         ? scan_op(src[src_id],
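Note (editorial): the update kernel above is the second phase of a blocked inclusive scan — each work-group first scans its own chunk, and this kernel then folds the scanned per-chunk totals back into the chunks that follow. Below is a minimal serial sketch of that step, assuming `temp` already holds the scanned chunk totals; the function and variable names in the sketch are illustrative and are not part of the diff.

#include <cstddef>
#include <vector>

// out[chunk_size * (i + 1) + j] = op(out[chunk_size * (i + 1) + j], temp[i]):
// once each chunk of `out` has been scanned locally and the per-chunk totals
// have been scanned into `temp`, every element of chunk c >= 1 is combined
// with temp[c - 1], turning the local scans into one global inclusive scan.
template <typename T, typename BinOp>
void chunk_update(std::vector<T> &out, const std::vector<T> &temp,
                  std::size_t chunk_size, BinOp op)
{
    for (std::size_t id = 0; id < out.size(); ++id) {
        const std::size_t chunk = id / chunk_size;
        if (chunk > 0) {
            out[id] = op(out[id], temp[chunk - 1]);
        }
    }
}

The SYCL kernel distributes the same loop over work-items, `updates_per_wi` elements per item, which is why the bound check `src_id < src_size` appears inside the unrolled loop.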
@@ -511,7 +501,7 @@ accumulate_1d_contig_impl(sycl::queue &q,
     const sycl::device &dev = q.get_device();
     if (dev.has(sycl::aspect::cpu)) {
         constexpr nwiT n_wi_for_cpu = 8;
-        size_t wg_size = 256;
+        const size_t wg_size = 256;
         comp_ev = inclusive_scan_iter_1d<srcT, dstT, n_wi_for_cpu, NoOpIndexerT,
                                          transformerT, AccumulateOpT,
                                          include_initial>(
@@ -520,7 +510,7 @@ accumulate_1d_contig_impl(sycl::queue &q,
     }
     else {
         constexpr nwiT n_wi_for_gpu = 4;
-        size_t wg_size = 256;
+        const size_t wg_size = 256;
         comp_ev = inclusive_scan_iter_1d<srcT, dstT, n_wi_for_gpu, NoOpIndexerT,
                                          transformerT, AccumulateOpT,
                                          include_initial>(
@@ -586,13 +576,13 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q,
 
     sycl::event dependent_event = inc_scan_phase1_ev;
     if (acc_groups > 1) {
-        auto chunk_size = wg_size * n_wi;
+        const size_t chunk_size = wg_size * n_wi;
 
         // how much of temporary allocation do we need
         size_t acc_groups_ = acc_groups;
         size_t temp_size = 0;
         while (acc_groups_ > 1) {
-            const auto this_size = (acc_groups_ - 1);
+            const size_t this_size = (acc_groups_ - 1);
             temp_size += this_size;
             acc_groups_ = ceiling_quotient<size_t>(this_size, chunk_size);
         }
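Note (editorial): the `while` loop above sizes the temporary allocation for the multi-level scan. Below is a self-contained sketch of the same computation, assuming `ceiling_quotient` is round-up integer division; only the helper's name comes from the diff, its definition here is an assumption.

#include <cstddef>

// Round-up integer division (assumed behavior of ceiling_quotient).
static std::size_t ceiling_quotient(std::size_t n, std::size_t m)
{
    return (n + m - 1) / m;
}

// Each level of the scan hierarchy needs room for the carries of the level
// below (acc_groups - 1 values); those carries are then scanned again with
// the same chunk_size = wg_size * n_wi until a single group remains.
static std::size_t temp_elems_needed(std::size_t acc_groups, std::size_t chunk_size)
{
    std::size_t temp_size = 0;
    while (acc_groups > 1) {
        const std::size_t this_size = acc_groups - 1;
        temp_size += this_size;
        acc_groups = ceiling_quotient(this_size, chunk_size);
    }
    return temp_size;
}

For example, with acc_groups = 10000 and chunk_size = 1024 this yields 9999 elements for the first level plus 9 for the second, 10008 in total.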
@@ -622,14 +612,15 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q,
         size_t src_size = acc_groups - 1;
         using LocalScanIndexerT =
             dpctl::tensor::offset_utils::Strided1DIndexer;
-        LocalScanIndexerT scan_iter_indexer{
+        const LocalScanIndexerT scan_iter_indexer{
             0, static_cast<ssize_t>(iter_nelems),
             static_cast<ssize_t>(src_size)};
 
         using IterIndexerT =
             dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
                 OutIterIndexerT, LocalScanIndexerT>;
-        IterIndexerT iter_indexer_{out_iter_indexer, scan_iter_indexer};
+        const IterIndexerT iter_indexer_{out_iter_indexer,
+                                         scan_iter_indexer};
 
         dependent_event =
             inclusive_scan_base_step<outputT, outputT, n_wi, IterIndexerT,
@@ -651,17 +642,18 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q,
 
             using LocalScanIndexerT =
                 dpctl::tensor::offset_utils::Strided1DIndexer;
-            LocalScanIndexerT scan1_iter_indexer{
+            const LocalScanIndexerT scan1_iter_indexer{
                 0, static_cast<ssize_t>(iter_nelems),
                 static_cast<ssize_t>(size_to_update)};
-            LocalScanIndexerT scan2_iter_indexer{
+            const LocalScanIndexerT scan2_iter_indexer{
                 0, static_cast<ssize_t>(iter_nelems),
                 static_cast<ssize_t>(src_size)};
 
             using IterIndexerT =
                 dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
                     LocalScanIndexerT, LocalScanIndexerT>;
-            IterIndexerT iter_indexer_{scan1_iter_indexer, scan2_iter_indexer};
+            const IterIndexerT iter_indexer_{scan1_iter_indexer,
+                                             scan2_iter_indexer};
 
             dependent_event =
                 inclusive_scan_base_step<outputT, outputT, n_wi, IterIndexerT,
@@ -681,16 +673,16 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q,
         for (size_t reverse_stack_id = 0; reverse_stack_id < stack.size() - 1;
              ++reverse_stack_id)
         {
-            auto stack_id = stack.size() - 1 - reverse_stack_id;
+            const size_t stack_id = stack.size() - 1 - reverse_stack_id;
 
-            auto stack_elem = stack[stack_id];
+            const auto &stack_elem = stack[stack_id];
             outputT *src = stack_elem.get_src_ptr();
             size_t src_size = stack_elem.get_size();
             outputT *local_scans = stack_elem.get_local_scans_ptr();
             size_t local_stride = stack_elem.get_local_stride();
 
             constexpr nwiT updates_per_wi = n_wi;
-            size_t update_nelems =
+            const size_t update_nelems =
                 ceiling_quotient<size_t>(src_size, updates_per_wi);
 
             dependent_event = exec_q.submit([&](sycl::handler &cgh) {
@@ -705,21 +697,23 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q,
                     {iter_nelems * update_nelems},
                     [chunk_size, update_nelems, src_size, local_stride, src,
                      local_scans, scan_op, identity](auto wiid) {
-                        size_t gid = wiid[0];
+                        const size_t gid = wiid[0];
 
-                        size_t iter_gid = gid / update_nelems;
-                        size_t axis_gid = gid - (iter_gid * update_nelems);
+                        const size_t iter_gid = gid / update_nelems;
+                        const size_t axis_gid =
+                            gid - (iter_gid * update_nelems);
 
-                        size_t src_axis_id0 = axis_gid * updates_per_wi;
-                        size_t src_iter_id = iter_gid * src_size;
+                        const size_t src_axis_id0 = axis_gid * updates_per_wi;
+                        const size_t src_iter_id = iter_gid * src_size;
 #pragma unroll
                         for (nwiT i = 0; i < updates_per_wi; ++i) {
-                            size_t src_axis_id = src_axis_id0 + i;
-                            size_t src_id = src_axis_id + src_iter_id;
+                            const size_t src_axis_id = src_axis_id0 + i;
+                            const size_t src_id = src_axis_id + src_iter_id;
 
                             if (src_axis_id < src_size) {
-                                size_t scan_axis_id = src_axis_id / chunk_size;
-                                size_t scan_id =
+                                const size_t scan_axis_id =
+                                    src_axis_id / chunk_size;
+                                const size_t scan_id =
                                     scan_axis_id + iter_gid * local_stride;
 
                                 src[src_id] =
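Note (editorial): the update kernels in this routine decompose a flat work-item id over `iter_nelems * update_nelems` items into an (iteration, axis) pair with one division and a multiply-subtract rather than a separate modulo. A small standalone sketch with illustrative values:

#include <cassert>
#include <cstddef>

int main()
{
    const std::size_t update_nelems = 5; // work-items per iteration (example value)
    const std::size_t gid = 12;          // flat id in [0, iter_nelems * update_nelems)

    const std::size_t iter_gid = gid / update_nelems;              // 2
    const std::size_t axis_gid = gid - (iter_gid * update_nelems); // 2, same as gid % update_nelems

    assert(axis_gid == gid % update_nelems);
    return 0;
}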
@@ -735,14 +729,14 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q,
 
         // last stack element is always directly to output
         {
-            auto stack_elem = stack[0];
+            const auto &stack_elem = stack[0];
             outputT *src = stack_elem.get_src_ptr();
-            size_t src_size = stack_elem.get_size();
+            const size_t src_size = stack_elem.get_size();
             outputT *local_scans = stack_elem.get_local_scans_ptr();
-            size_t local_stride = stack_elem.get_local_stride();
+            const size_t local_stride = stack_elem.get_local_stride();
 
             constexpr nwiT updates_per_wi = n_wi;
-            size_t update_nelems =
+            const size_t update_nelems =
                 ceiling_quotient<size_t>(src_size, updates_per_wi);
 
             dependent_event = exec_q.submit([&](sycl::handler &cgh) {
@@ -759,22 +753,24 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q,
                     [chunk_size, update_nelems, src_size, local_stride, src,
                      local_scans, scan_op, identity, out_iter_indexer,
                      out_indexer](auto wiid) {
-                        size_t gid = wiid[0];
+                        const size_t gid = wiid[0];
 
-                        size_t iter_gid = gid / update_nelems;
-                        size_t axis_gid = gid - (iter_gid * update_nelems);
+                        const size_t iter_gid = gid / update_nelems;
+                        const size_t axis_gid =
+                            gid - (iter_gid * update_nelems);
 
-                        size_t src_axis_id0 = axis_gid * updates_per_wi;
-                        size_t src_iter_id = out_iter_indexer(iter_gid);
+                        const size_t src_axis_id0 = axis_gid * updates_per_wi;
+                        const size_t src_iter_id = out_iter_indexer(iter_gid);
 #pragma unroll
                         for (nwiT i = 0; i < updates_per_wi; ++i) {
-                            size_t src_axis_id = src_axis_id0 + i;
-                            size_t src_id =
+                            const size_t src_axis_id = src_axis_id0 + i;
+                            const size_t src_id =
                                 out_indexer(src_axis_id) + src_iter_id;
 
                             if (src_axis_id < src_size) {
-                                size_t scan_axis_id = src_axis_id / chunk_size;
-                                size_t scan_id =
+                                const size_t scan_axis_id =
+                                    src_axis_id / chunk_size;
+                                const size_t scan_id =
                                     scan_axis_id + iter_gid * local_stride;
 
                                 src[src_id] =
@@ -858,7 +854,7 @@ accumulate_strided_impl(sycl::queue &q,
     sycl::event comp_ev;
     if (dev.has(sycl::aspect::cpu)) {
        constexpr nwiT n_wi_for_cpu = 8;
-        size_t wg_size = 256;
+        const size_t wg_size = 256;
         comp_ev =
             inclusive_scan_iter<srcT, dstT, n_wi_for_cpu, InpIndexerT,
                                 OutIndexerT, InpIndexerT, OutIndexerT,
@@ -869,7 +865,7 @@ accumulate_strided_impl(sycl::queue &q,
     }
     else {
        constexpr nwiT n_wi_for_gpu = 4;
-        size_t wg_size = 256;
+        const size_t wg_size = 256;
         comp_ev =
             inclusive_scan_iter<srcT, dstT, n_wi_for_gpu, InpIndexerT,
                                 OutIndexerT, InpIndexerT, OutIndexerT,
@@ -914,7 +910,7 @@ size_t cumsum_val_contig_impl(sycl::queue &q,
     const sycl::device &dev = q.get_device();
     if (dev.has(sycl::aspect::cpu)) {
         constexpr nwiT n_wi_for_cpu = 8;
-        size_t wg_size = 256;
+        const size_t wg_size = 256;
         comp_ev = inclusive_scan_iter_1d<maskT, cumsumT, n_wi_for_cpu,
                                          NoOpIndexerT, transformerT,
                                          AccumulateOpT, include_initial>(
@@ -923,7 +919,7 @@ size_t cumsum_val_contig_impl(sycl::queue &q,
     }
     else {
         constexpr nwiT n_wi_for_gpu = 4;
-        size_t wg_size = 256;
+        const size_t wg_size = 256;
         comp_ev = inclusive_scan_iter_1d<maskT, cumsumT, n_wi_for_gpu,
                                          NoOpIndexerT, transformerT,
                                          AccumulateOpT, include_initial>(
@@ -1022,7 +1018,7 @@ size_t cumsum_val_strided_impl(sycl::queue &q,
     sycl::event comp_ev;
     if (dev.has(sycl::aspect::cpu)) {
         constexpr nwiT n_wi_for_cpu = 8;
-        size_t wg_size = 256;
+        const size_t wg_size = 256;
         comp_ev = inclusive_scan_iter_1d<maskT, cumsumT, n_wi_for_cpu,
                                          StridedIndexerT, transformerT,
                                          AccumulateOpT, include_initial>(
@@ -1031,7 +1027,7 @@ size_t cumsum_val_strided_impl(sycl::queue &q,
     }
    else {
         constexpr nwiT n_wi_for_gpu = 4;
-        size_t wg_size = 256;
+        const size_t wg_size = 256;
         comp_ev = inclusive_scan_iter_1d<maskT, cumsumT, n_wi_for_gpu,
                                          StridedIndexerT, transformerT,
                                          AccumulateOpT, include_initial>(