
Commit 6520b8b

Use const qualifiers to make compiler's job easier
Indexers are made const, and integral variables in kernels are made const as well. Two-offset instances are now bound as const references to avoid copying. Got rid of the unused get_src_const_ptr methods in the stack_t structs. Replaced auto with size_t where appropriate. Added const to make compiler analysis easier (and faster).
1 parent d947ba8 commit 6520b8b
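
For illustration only (not part of the commit): a minimal, self-contained C++ sketch of the pattern the commit applies. TwoOffsetsExample and IterIndexerExample are hypothetical stand-ins for the real indexer types; the point is that integral locals are declared const size_t rather than auto, and the two-offset result is bound to a const reference instead of being copied into a local.

// Minimal sketch of the commit's pattern; TwoOffsetsExample and
// IterIndexerExample are hypothetical stand-ins for the real indexer types.
#include <cstddef>

struct TwoOffsetsExample
{
    std::size_t first_offset_;
    std::size_t second_offset_;

    std::size_t get_first_offset() const { return first_offset_; }
    std::size_t get_second_offset() const { return second_offset_; }
};

struct IterIndexerExample
{
    // const call operator: returns the pair of offsets for one iteration
    TwoOffsetsExample operator()(std::size_t iter_gid) const
    {
        return TwoOffsetsExample{iter_gid, 2 * iter_gid};
    }
};

int main()
{
    const IterIndexerExample iter_indexer{};

    // integral kernel-style variables declared const size_t instead of auto
    const std::size_t gid = 42;
    const std::size_t wg_size = 256;
    const std::size_t iter_gid = gid / wg_size;

    // bind the two-offset instance to a const reference rather than copying;
    // binding the returned temporary to a const reference extends its lifetime
    const auto &iter_offsets = iter_indexer(iter_gid);
    const std::size_t inp_offset = iter_offsets.get_first_offset();
    const std::size_t out_offset = iter_offsets.get_second_offset();

    return static_cast<int>(inp_offset + out_offset);
}

Marking values const does not change behavior here; it simply records that they never change after initialization, which is what the commit message means by making the compiler's analysis easier.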

dpctl/tensor/libtensor/include/kernels/accumulators.hpp

Lines changed: 61 additions & 65 deletions
@@ -113,11 +113,6 @@ template <typename T> class stack_t
         return src_;
     }

-    const T *get_src_const_ptr() const
-    {
-        return src_;
-    }
-
     size_t get_size() const
     {
         return size_;
@@ -150,11 +145,6 @@ template <typename T> class stack_strided_t
         return src_;
     }

-    const T *get_src_const_ptr() const
-    {
-        return src_;
-    }
-
     size_t get_size() const
     {
         return size_;
@@ -247,16 +237,16 @@ inclusive_scan_base_step(sycl::queue &exec_q,
         cgh.parallel_for<KernelName>(ndRange, [=, slm_iscan_tmp =
                                                       std::move(slm_iscan_tmp)](
                                                   sycl::nd_item<1> it) {
-            size_t gid = it.get_global_id(0);
-            size_t lid = it.get_local_id(0);
+            const size_t gid = it.get_global_id(0);
+            const size_t lid = it.get_local_id(0);

-            size_t iter_gid = gid / (acc_groups * wg_size);
-            size_t chunk_gid = gid - (iter_gid * acc_groups * wg_size);
+            const size_t iter_gid = gid / (acc_groups * wg_size);
+            const size_t chunk_gid = gid - (iter_gid * acc_groups * wg_size);

             std::array<outputT, n_wi> local_iscan;

             size_t i = chunk_gid * n_wi;
-            auto iter_offsets = iter_indexer(iter_gid);
+            const auto &iter_offsets = iter_indexer(iter_gid);
             const auto &inp_iter_offset = iter_offsets.get_first_offset();
             const auto &out_iter_offset = iter_offsets.get_second_offset();

@@ -377,7 +367,7 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q,

     sycl::event dependent_event = inc_scan_phase1_ev;
     if (n_groups > 1) {
-        auto chunk_size = wg_size * n_wi;
+        const size_t chunk_size = wg_size * n_wi;

         // how much of temporary allocation do we need
         size_t n_groups_ = n_groups;
@@ -407,7 +397,7 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q,
         size_t size_to_update = n_elems;
         while (n_groups_ > 1) {

-            size_t src_size = n_groups_ - 1;
+            const size_t src_size = n_groups_ - 1;
             dependent_event =
                 inclusive_scan_base_step<outputT, outputT, n_wi, IterIndexerT,
                                          NoOpIndexerT, NoOpIndexerT,
@@ -426,19 +416,19 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q,
         for (size_t reverse_stack_id = 0; reverse_stack_id < stack.size();
              ++reverse_stack_id)
         {
-            auto stack_id = stack.size() - 1 - reverse_stack_id;
+            const size_t stack_id = stack.size() - 1 - reverse_stack_id;

-            auto stack_elem = stack[stack_id];
+            const auto &stack_elem = stack[stack_id];
             outputT *src = stack_elem.get_src_ptr();
-            size_t src_size = stack_elem.get_size();
+            const size_t src_size = stack_elem.get_size();
             outputT *local_scans = stack_elem.get_local_scans_ptr();

             // output[ chunk_size * (i + 1) + j] += temp[i]
             dependent_event = exec_q.submit([&](sycl::handler &cgh) {
                 cgh.depends_on(dependent_event);

                 constexpr nwiT updates_per_wi = n_wi;
-                size_t n_items = ceiling_quotient<size_t>(src_size, n_wi);
+                const size_t n_items = ceiling_quotient<size_t>(src_size, n_wi);

                 using UpdateKernelName =
                     class inclusive_scan_1d_iter_chunk_update_krn<
@@ -448,12 +438,12 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q,
                 cgh.parallel_for<UpdateKernelName>(
                     {n_items}, [chunk_size, src, src_size, local_scans, scan_op,
                                 identity](auto wiid) {
-                        auto gid = n_wi * wiid[0];
+                        const size_t gid = n_wi * wiid[0];
 #pragma unroll
-                        for (auto i = 0; i < updates_per_wi; ++i) {
-                            auto src_id = gid + i;
+                        for (size_t i = 0; i < updates_per_wi; ++i) {
+                            const size_t src_id = gid + i;
                             if (src_id < src_size) {
-                                auto scan_id = (src_id / chunk_size);
+                                const size_t scan_id = (src_id / chunk_size);
                                 src[src_id] =
                                     (scan_id > 0)
                                         ? scan_op(src[src_id],
@@ -511,7 +501,7 @@ accumulate_1d_contig_impl(sycl::queue &q,
     const sycl::device &dev = q.get_device();
     if (dev.has(sycl::aspect::cpu)) {
         constexpr nwiT n_wi_for_cpu = 8;
-        size_t wg_size = 256;
+        const size_t wg_size = 256;
         comp_ev = inclusive_scan_iter_1d<srcT, dstT, n_wi_for_cpu, NoOpIndexerT,
                                          transformerT, AccumulateOpT,
                                          include_initial>(
@@ -520,7 +510,7 @@ accumulate_1d_contig_impl(sycl::queue &q,
     }
     else {
         constexpr nwiT n_wi_for_gpu = 4;
-        size_t wg_size = 256;
+        const size_t wg_size = 256;
         comp_ev = inclusive_scan_iter_1d<srcT, dstT, n_wi_for_gpu, NoOpIndexerT,
                                          transformerT, AccumulateOpT,
                                          include_initial>(
@@ -586,13 +576,13 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q,

     sycl::event dependent_event = inc_scan_phase1_ev;
     if (acc_groups > 1) {
-        auto chunk_size = wg_size * n_wi;
+        const size_t chunk_size = wg_size * n_wi;

         // how much of temporary allocation do we need
         size_t acc_groups_ = acc_groups;
         size_t temp_size = 0;
         while (acc_groups_ > 1) {
-            const auto this_size = (acc_groups_ - 1);
+            const size_t this_size = (acc_groups_ - 1);
             temp_size += this_size;
             acc_groups_ = ceiling_quotient<size_t>(this_size, chunk_size);
         }
@@ -622,14 +612,15 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q,
         size_t src_size = acc_groups - 1;
         using LocalScanIndexerT =
             dpctl::tensor::offset_utils::Strided1DIndexer;
-        LocalScanIndexerT scan_iter_indexer{
+        const LocalScanIndexerT scan_iter_indexer{
             0, static_cast<ssize_t>(iter_nelems),
             static_cast<ssize_t>(src_size)};

         using IterIndexerT =
             dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
                 OutIterIndexerT, LocalScanIndexerT>;
-        IterIndexerT iter_indexer_{out_iter_indexer, scan_iter_indexer};
+        const IterIndexerT iter_indexer_{out_iter_indexer,
+                                         scan_iter_indexer};

         dependent_event =
             inclusive_scan_base_step<outputT, outputT, n_wi, IterIndexerT,
@@ -651,17 +642,18 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q,

             using LocalScanIndexerT =
                 dpctl::tensor::offset_utils::Strided1DIndexer;
-            LocalScanIndexerT scan1_iter_indexer{
+            const LocalScanIndexerT scan1_iter_indexer{
                 0, static_cast<ssize_t>(iter_nelems),
                 static_cast<ssize_t>(size_to_update)};
-            LocalScanIndexerT scan2_iter_indexer{
+            const LocalScanIndexerT scan2_iter_indexer{
                 0, static_cast<ssize_t>(iter_nelems),
                 static_cast<ssize_t>(src_size)};

             using IterIndexerT =
                 dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
                     LocalScanIndexerT, LocalScanIndexerT>;
-            IterIndexerT iter_indexer_{scan1_iter_indexer, scan2_iter_indexer};
+            const IterIndexerT iter_indexer_{scan1_iter_indexer,
+                                             scan2_iter_indexer};

             dependent_event =
                 inclusive_scan_base_step<outputT, outputT, n_wi, IterIndexerT,
@@ -681,16 +673,16 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q,
         for (size_t reverse_stack_id = 0; reverse_stack_id < stack.size() - 1;
              ++reverse_stack_id)
         {
-            auto stack_id = stack.size() - 1 - reverse_stack_id;
+            const size_t stack_id = stack.size() - 1 - reverse_stack_id;

-            auto stack_elem = stack[stack_id];
+            const auto &stack_elem = stack[stack_id];
             outputT *src = stack_elem.get_src_ptr();
             size_t src_size = stack_elem.get_size();
             outputT *local_scans = stack_elem.get_local_scans_ptr();
             size_t local_stride = stack_elem.get_local_stride();

             constexpr nwiT updates_per_wi = n_wi;
-            size_t update_nelems =
+            const size_t update_nelems =
                 ceiling_quotient<size_t>(src_size, updates_per_wi);

             dependent_event = exec_q.submit([&](sycl::handler &cgh) {
@@ -705,21 +697,23 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q,
                     {iter_nelems * update_nelems},
                     [chunk_size, update_nelems, src_size, local_stride, src,
                      local_scans, scan_op, identity](auto wiid) {
-                        size_t gid = wiid[0];
+                        const size_t gid = wiid[0];

-                        size_t iter_gid = gid / update_nelems;
-                        size_t axis_gid = gid - (iter_gid * update_nelems);
+                        const size_t iter_gid = gid / update_nelems;
+                        const size_t axis_gid =
+                            gid - (iter_gid * update_nelems);

-                        size_t src_axis_id0 = axis_gid * updates_per_wi;
-                        size_t src_iter_id = iter_gid * src_size;
+                        const size_t src_axis_id0 = axis_gid * updates_per_wi;
+                        const size_t src_iter_id = iter_gid * src_size;
 #pragma unroll
                         for (nwiT i = 0; i < updates_per_wi; ++i) {
-                            size_t src_axis_id = src_axis_id0 + i;
-                            size_t src_id = src_axis_id + src_iter_id;
+                            const size_t src_axis_id = src_axis_id0 + i;
+                            const size_t src_id = src_axis_id + src_iter_id;

                             if (src_axis_id < src_size) {
-                                size_t scan_axis_id = src_axis_id / chunk_size;
-                                size_t scan_id =
+                                const size_t scan_axis_id =
+                                    src_axis_id / chunk_size;
+                                const size_t scan_id =
                                     scan_axis_id + iter_gid * local_stride;

                                 src[src_id] =
@@ -735,14 +729,14 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q,

         // last stack element is always directly to output
         {
-            auto stack_elem = stack[0];
+            const auto &stack_elem = stack[0];
             outputT *src = stack_elem.get_src_ptr();
-            size_t src_size = stack_elem.get_size();
+            const size_t src_size = stack_elem.get_size();
             outputT *local_scans = stack_elem.get_local_scans_ptr();
-            size_t local_stride = stack_elem.get_local_stride();
+            const size_t local_stride = stack_elem.get_local_stride();

             constexpr nwiT updates_per_wi = n_wi;
-            size_t update_nelems =
+            const size_t update_nelems =
                 ceiling_quotient<size_t>(src_size, updates_per_wi);

             dependent_event = exec_q.submit([&](sycl::handler &cgh) {
@@ -759,22 +753,24 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q,
                     [chunk_size, update_nelems, src_size, local_stride, src,
                      local_scans, scan_op, identity, out_iter_indexer,
                      out_indexer](auto wiid) {
-                        size_t gid = wiid[0];
+                        const size_t gid = wiid[0];

-                        size_t iter_gid = gid / update_nelems;
-                        size_t axis_gid = gid - (iter_gid * update_nelems);
+                        const size_t iter_gid = gid / update_nelems;
+                        const size_t axis_gid =
+                            gid - (iter_gid * update_nelems);

-                        size_t src_axis_id0 = axis_gid * updates_per_wi;
-                        size_t src_iter_id = out_iter_indexer(iter_gid);
+                        const size_t src_axis_id0 = axis_gid * updates_per_wi;
+                        const size_t src_iter_id = out_iter_indexer(iter_gid);
 #pragma unroll
                         for (nwiT i = 0; i < updates_per_wi; ++i) {
-                            size_t src_axis_id = src_axis_id0 + i;
-                            size_t src_id =
+                            const size_t src_axis_id = src_axis_id0 + i;
+                            const size_t src_id =
                                 out_indexer(src_axis_id) + src_iter_id;

                             if (src_axis_id < src_size) {
-                                size_t scan_axis_id = src_axis_id / chunk_size;
-                                size_t scan_id =
+                                const size_t scan_axis_id =
+                                    src_axis_id / chunk_size;
+                                const size_t scan_id =
                                     scan_axis_id + iter_gid * local_stride;

                                 src[src_id] =
@@ -858,7 +854,7 @@ accumulate_strided_impl(sycl::queue &q,
     sycl::event comp_ev;
     if (dev.has(sycl::aspect::cpu)) {
         constexpr nwiT n_wi_for_cpu = 8;
-        size_t wg_size = 256;
+        const size_t wg_size = 256;
         comp_ev =
             inclusive_scan_iter<srcT, dstT, n_wi_for_cpu, InpIndexerT,
                                 OutIndexerT, InpIndexerT, OutIndexerT,
@@ -869,7 +865,7 @@ accumulate_strided_impl(sycl::queue &q,
     }
     else {
         constexpr nwiT n_wi_for_gpu = 4;
-        size_t wg_size = 256;
+        const size_t wg_size = 256;
         comp_ev =
             inclusive_scan_iter<srcT, dstT, n_wi_for_gpu, InpIndexerT,
                                 OutIndexerT, InpIndexerT, OutIndexerT,
@@ -914,7 +910,7 @@ size_t cumsum_val_contig_impl(sycl::queue &q,
     const sycl::device &dev = q.get_device();
     if (dev.has(sycl::aspect::cpu)) {
         constexpr nwiT n_wi_for_cpu = 8;
-        size_t wg_size = 256;
+        const size_t wg_size = 256;
         comp_ev = inclusive_scan_iter_1d<maskT, cumsumT, n_wi_for_cpu,
                                          NoOpIndexerT, transformerT,
                                          AccumulateOpT, include_initial>(
@@ -923,7 +919,7 @@ size_t cumsum_val_contig_impl(sycl::queue &q,
     }
     else {
         constexpr nwiT n_wi_for_gpu = 4;
-        size_t wg_size = 256;
+        const size_t wg_size = 256;
         comp_ev = inclusive_scan_iter_1d<maskT, cumsumT, n_wi_for_gpu,
                                          NoOpIndexerT, transformerT,
                                          AccumulateOpT, include_initial>(
@@ -1022,7 +1018,7 @@ size_t cumsum_val_strided_impl(sycl::queue &q,
     sycl::event comp_ev;
    if (dev.has(sycl::aspect::cpu)) {
         constexpr nwiT n_wi_for_cpu = 8;
-        size_t wg_size = 256;
+        const size_t wg_size = 256;
         comp_ev = inclusive_scan_iter_1d<maskT, cumsumT, n_wi_for_cpu,
                                          StridedIndexerT, transformerT,
                                          AccumulateOpT, include_initial>(
@@ -1031,7 +1027,7 @@ size_t cumsum_val_strided_impl(sycl::queue &q,
     }
     else {
         constexpr nwiT n_wi_for_gpu = 4;
-        size_t wg_size = 256;
+        const size_t wg_size = 256;
         comp_ev = inclusive_scan_iter_1d<maskT, cumsumT, n_wi_for_gpu,
                                          StridedIndexerT, transformerT,
                                          AccumulateOpT, include_initial>(

0 commit comments
