Skip to content

Commit 66b16ee

Browse files
Merge pull request #1421 from IntelPython/avoid-needless-copy
Avoid needless copies
2 parents 8de5201 + 15f9320 commit 66b16ee

File tree

119 files changed

+891
-964
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

119 files changed

+891
-964
lines changed

dpctl/apis/include/dpctl4pybind11.hpp

Lines changed: 9 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,7 @@
3030
#include <complex>
3131
#include <memory>
3232
#include <pybind11/pybind11.h>
33+
#include <utility>
3334
#include <vector>
3435

3536
namespace py = pybind11;
@@ -369,19 +370,19 @@ class dpctl_capi
369370
sycl::queue q_{};
370371
PySyclQueueObject *py_q_tmp =
371372
SyclQueue_Make(reinterpret_cast<DPCTLSyclQueueRef>(&q_));
372-
py::object py_sycl_queue = py::reinterpret_steal<py::object>(
373+
const py::object &py_sycl_queue = py::reinterpret_steal<py::object>(
373374
reinterpret_cast<PyObject *>(py_q_tmp));
374375

375376
default_sycl_queue_ = std::shared_ptr<py::object>(
376377
new py::object(py_sycl_queue), Deleter{});
377378

378379
py::module_ mod_memory = py::module_::import("dpctl.memory");
379-
py::object py_as_usm_memory = mod_memory.attr("as_usm_memory");
380+
const py::object &py_as_usm_memory = mod_memory.attr("as_usm_memory");
380381
as_usm_memory_ = std::shared_ptr<py::object>(
381382
new py::object{py_as_usm_memory}, Deleter{});
382383

383384
auto mem_kl = mod_memory.attr("MemoryUSMHost");
384-
py::object py_default_usm_memory =
385+
const py::object &py_default_usm_memory =
385386
mem_kl(1, py::arg("queue") = py_sycl_queue);
386387
default_usm_memory_ = std::shared_ptr<py::object>(
387388
new py::object{py_default_usm_memory}, Deleter{});
@@ -390,7 +391,7 @@ class dpctl_capi
390391
py::module_::import("dpctl.tensor._usmarray");
391392
auto tensor_kl = mod_usmarray.attr("usm_ndarray");
392393

393-
py::object py_default_usm_ndarray =
394+
const py::object &py_default_usm_ndarray =
394395
tensor_kl(py::tuple(), py::arg("dtype") = py::str("u1"),
395396
py::arg("buffer") = py_default_usm_memory);
396397

@@ -1032,7 +1033,7 @@ namespace utils
10321033
{
10331034

10341035
template <std::size_t num>
1035-
sycl::event keep_args_alive(sycl::queue q,
1036+
sycl::event keep_args_alive(sycl::queue &q,
10361037
const py::object (&py_objs)[num],
10371038
const std::vector<sycl::event> &depends = {})
10381039
{
@@ -1043,7 +1044,7 @@ sycl::event keep_args_alive(sycl::queue q,
10431044
shp_arr[i] = std::make_shared<py::handle>(py_objs[i]);
10441045
shp_arr[i]->inc_ref();
10451046
}
1046-
cgh.host_task([=]() {
1047+
cgh.host_task([shp_arr = std::move(shp_arr)]() {
10471048
py::gil_scoped_acquire acquire;
10481049

10491050
for (std::size_t i = 0; i < num; ++i) {
@@ -1058,7 +1059,7 @@ sycl::event keep_args_alive(sycl::queue q,
10581059
/*! @brief Check if all allocation queues are the same as the
10591060
execution queue */
10601061
template <std::size_t num>
1061-
bool queues_are_compatible(sycl::queue exec_q,
1062+
bool queues_are_compatible(const sycl::queue &exec_q,
10621063
const sycl::queue (&alloc_qs)[num])
10631064
{
10641065
for (std::size_t i = 0; i < num; ++i) {
@@ -1073,7 +1074,7 @@ bool queues_are_compatible(sycl::queue exec_q,
10731074
/*! @brief Check if all allocation queues of usm_ndarrays are the same as
10741075
the execution queue */
10751076
template <std::size_t num>
1076-
bool queues_are_compatible(sycl::queue exec_q,
1077+
bool queues_are_compatible(const sycl::queue &exec_q,
10771078
const ::dpctl::tensor::usm_ndarray (&arrs)[num])
10781079
{
10791080
for (std::size_t i = 0; i < num; ++i) {

dpctl/tensor/libtensor/include/kernels/accumulators.hpp

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ template <typename inputT,
103103
size_t n_wi,
104104
typename IndexerT,
105105
typename TransformerT>
106-
sycl::event inclusive_scan_rec(sycl::queue exec_q,
106+
sycl::event inclusive_scan_rec(sycl::queue &exec_q,
107107
size_t n_elems,
108108
size_t wg_size,
109109
const inputT *input,
@@ -116,19 +116,20 @@ sycl::event inclusive_scan_rec(sycl::queue exec_q,
116116
{
117117
size_t n_groups = ceiling_quotient(n_elems, n_wi * wg_size);
118118

119-
sycl::event inc_scan_phase1_ev = exec_q.submit([&](sycl::handler &cgh) {
120-
cgh.depends_on(depends);
119+
const sycl::event &inc_scan_phase1_ev =
120+
exec_q.submit([&](sycl::handler &cgh) {
121+
cgh.depends_on(depends);
121122

122-
using slmT = sycl::local_accessor<size_t, 1>;
123+
using slmT = sycl::local_accessor<size_t, 1>;
123124

124-
auto lws = sycl::range<1>(wg_size);
125-
auto gws = sycl::range<1>(n_groups * wg_size);
125+
auto lws = sycl::range<1>(wg_size);
126+
auto gws = sycl::range<1>(n_groups * wg_size);
126127

127-
slmT slm_iscan_tmp(lws, cgh);
128+
slmT slm_iscan_tmp(lws, cgh);
128129

129130
cgh.parallel_for<class inclusive_scan_rec_local_scan_krn<
130131
inputT, outputT, n_wi, IndexerT, decltype(transformer)>>(
131-
sycl::nd_range<1>(gws, lws), [=](sycl::nd_item<1> it)
132+
sycl::nd_range<1>(gws, lws), [=, slm_iscan_tmp = std::move(slm_iscan_tmp)](sycl::nd_item<1> it)
132133
{
133134
auto chunk_gid = it.get_global_id(0);
134135
auto lid = it.get_local_id(0);
@@ -172,7 +173,7 @@ sycl::event inclusive_scan_rec(sycl::queue exec_q,
172173
output[i + m_wi] = local_isum[m_wi];
173174
}
174175
});
175-
});
176+
});
176177

177178
sycl::event out_event = inc_scan_phase1_ev;
178179
if (n_groups > 1) {
@@ -203,25 +204,25 @@ sycl::event inclusive_scan_rec(sycl::queue exec_q,
203204

204205
sycl::event e4 = exec_q.submit([&](sycl::handler &cgh) {
205206
cgh.depends_on(e3);
206-
auto ctx = exec_q.get_context();
207+
const auto &ctx = exec_q.get_context();
207208
cgh.host_task([ctx, temp]() { sycl::free(temp, ctx); });
208209
});
209210

210-
out_event = e4;
211+
out_event = std::move(e4);
211212
}
212213

213214
return out_event;
214215
}
215216

216217
typedef size_t (*accumulate_contig_impl_fn_ptr_t)(
217-
sycl::queue,
218+
sycl::queue &,
218219
size_t,
219220
const char *,
220221
char *,
221222
std::vector<sycl::event> const &);
222223

223224
template <typename maskT, typename cumsumT, typename transformerT>
224-
size_t accumulate_contig_impl(sycl::queue q,
225+
size_t accumulate_contig_impl(sycl::queue &q,
225226
size_t n_elems,
226227
const char *mask,
227228
char *cumsum,
@@ -235,7 +236,7 @@ size_t accumulate_contig_impl(sycl::queue q,
235236
NoOpIndexer flat_indexer{};
236237
transformerT non_zero_indicator{};
237238

238-
sycl::event comp_ev =
239+
const sycl::event &comp_ev =
239240
inclusive_scan_rec<maskT, cumsumT, n_wi, decltype(flat_indexer),
240241
decltype(non_zero_indicator)>(
241242
q, n_elems, wg_size, mask_data_ptr, cumsum_data_ptr, 0, 1,
@@ -296,7 +297,7 @@ template <typename fnT, typename T> struct Cumsum1DContigFactory
296297
};
297298

298299
typedef size_t (*accumulate_strided_impl_fn_ptr_t)(
299-
sycl::queue,
300+
sycl::queue &,
300301
size_t,
301302
const char *,
302303
int,
@@ -305,7 +306,7 @@ typedef size_t (*accumulate_strided_impl_fn_ptr_t)(
305306
std::vector<sycl::event> const &);
306307

307308
template <typename maskT, typename cumsumT, typename transformerT>
308-
size_t accumulate_strided_impl(sycl::queue q,
309+
size_t accumulate_strided_impl(sycl::queue &q,
309310
size_t n_elems,
310311
const char *mask,
311312
int nd,
@@ -321,7 +322,7 @@ size_t accumulate_strided_impl(sycl::queue q,
321322
StridedIndexer strided_indexer{nd, 0, shape_strides};
322323
transformerT non_zero_indicator{};
323324

324-
sycl::event comp_ev =
325+
const sycl::event &comp_ev =
325326
inclusive_scan_rec<maskT, cumsumT, n_wi, decltype(strided_indexer),
326327
decltype(non_zero_indicator)>(
327328
q, n_elems, wg_size, mask_data_ptr, cumsum_data_ptr, 0, 1,

dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ template <typename OrthoIndexerT,
198198
class masked_extract_all_slices_strided_impl_krn;
199199

200200
typedef sycl::event (*masked_extract_all_slices_strided_impl_fn_ptr_t)(
201-
sycl::queue,
201+
sycl::queue &,
202202
py::ssize_t,
203203
const char *,
204204
const char *,
@@ -211,7 +211,7 @@ typedef sycl::event (*masked_extract_all_slices_strided_impl_fn_ptr_t)(
211211

212212
template <typename dataT, typename indT>
213213
sycl::event masked_extract_all_slices_strided_impl(
214-
sycl::queue exec_q,
214+
sycl::queue &exec_q,
215215
py::ssize_t iteration_size,
216216
const char *src_p,
217217
const char *cumsum_p,
@@ -253,7 +253,7 @@ sycl::event masked_extract_all_slices_strided_impl(
253253
}
254254

255255
typedef sycl::event (*masked_extract_some_slices_strided_impl_fn_ptr_t)(
256-
sycl::queue,
256+
sycl::queue &,
257257
py::ssize_t,
258258
py::ssize_t,
259259
const char *,
@@ -278,7 +278,7 @@ class masked_extract_some_slices_strided_impl_krn;
278278

279279
template <typename dataT, typename indT>
280280
sycl::event masked_extract_some_slices_strided_impl(
281-
sycl::queue exec_q,
281+
sycl::queue &exec_q,
282282
py::ssize_t orthog_nelems,
283283
py::ssize_t masked_nelems,
284284
const char *src_p,
@@ -380,7 +380,7 @@ template <typename OrthoIndexerT,
380380
class masked_place_all_slices_strided_impl_krn;
381381

382382
typedef sycl::event (*masked_place_all_slices_strided_impl_fn_ptr_t)(
383-
sycl::queue,
383+
sycl::queue &,
384384
py::ssize_t,
385385
char *,
386386
const char *,
@@ -393,7 +393,7 @@ typedef sycl::event (*masked_place_all_slices_strided_impl_fn_ptr_t)(
393393

394394
template <typename dataT, typename indT>
395395
sycl::event masked_place_all_slices_strided_impl(
396-
sycl::queue exec_q,
396+
sycl::queue &exec_q,
397397
py::ssize_t iteration_size,
398398
char *dst_p,
399399
const char *cumsum_p,
@@ -430,7 +430,7 @@ sycl::event masked_place_all_slices_strided_impl(
430430
}
431431

432432
typedef sycl::event (*masked_place_some_slices_strided_impl_fn_ptr_t)(
433-
sycl::queue,
433+
sycl::queue &,
434434
py::ssize_t,
435435
py::ssize_t,
436436
char *,
@@ -455,7 +455,7 @@ class masked_place_some_slices_strided_impl_krn;
455455

456456
template <typename dataT, typename indT>
457457
sycl::event masked_place_some_slices_strided_impl(
458-
sycl::queue exec_q,
458+
sycl::queue &exec_q,
459459
py::ssize_t orthog_nelems,
460460
py::ssize_t masked_nelems,
461461
char *dst_p,
@@ -549,7 +549,7 @@ struct MaskPlaceSomeSlicesStridedFactoryForInt64
549549
template <typename T1, typename T2> class non_zero_indexes_krn;
550550

551551
typedef sycl::event (*non_zero_indexes_fn_ptr_t)(
552-
sycl::queue,
552+
sycl::queue &,
553553
py::ssize_t,
554554
py::ssize_t,
555555
int,
@@ -559,7 +559,7 @@ typedef sycl::event (*non_zero_indexes_fn_ptr_t)(
559559
std::vector<sycl::event> const &);
560560

561561
template <typename indT1, typename indT2>
562-
sycl::event non_zero_indexes_impl(sycl::queue exec_q,
562+
sycl::event non_zero_indexes_impl(sycl::queue &exec_q,
563563
py::ssize_t iter_size,
564564
py::ssize_t nz_elems,
565565
int nd,

dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -244,7 +244,7 @@ struct ContigBooleanReduction
244244
};
245245

246246
typedef sycl::event (*boolean_reduction_contig_impl_fn_ptr)(
247-
sycl::queue,
247+
sycl::queue &,
248248
size_t,
249249
size_t,
250250
const char *,
@@ -264,7 +264,7 @@ using dpctl::tensor::sycl_utils::choose_workgroup_size;
264264

265265
template <typename argTy, typename resTy, typename RedOpT, typename GroupOpT>
266266
sycl::event
267-
boolean_reduction_axis1_contig_impl(sycl::queue exec_q,
267+
boolean_reduction_axis1_contig_impl(sycl::queue &exec_q,
268268
size_t iter_nelems,
269269
size_t reduction_nelems,
270270
const char *arg_cp,
@@ -463,7 +463,7 @@ class boolean_reduction_axis0_contig_krn;
463463

464464
template <typename argTy, typename resTy, typename RedOpT, typename GroupOpT>
465465
sycl::event
466-
boolean_reduction_axis0_contig_impl(sycl::queue exec_q,
466+
boolean_reduction_axis0_contig_impl(sycl::queue &exec_q,
467467
size_t iter_nelems,
468468
size_t reduction_nelems,
469469
const char *arg_cp,
@@ -572,7 +572,7 @@ template <typename T1, typename T2, typename T3, typename T4, typename T5>
572572
class boolean_reduction_seq_strided_krn;
573573

574574
typedef sycl::event (*boolean_reduction_strided_impl_fn_ptr)(
575-
sycl::queue,
575+
sycl::queue &,
576576
size_t,
577577
size_t,
578578
const char *,
@@ -588,7 +588,7 @@ typedef sycl::event (*boolean_reduction_strided_impl_fn_ptr)(
588588

589589
template <typename argTy, typename resTy, typename RedOpT, typename GroupOpT>
590590
sycl::event
591-
boolean_reduction_strided_impl(sycl::queue exec_q,
591+
boolean_reduction_strided_impl(sycl::queue &exec_q,
592592
size_t iter_nelems,
593593
size_t reduction_nelems,
594594
const char *arg_cp,

0 commit comments

Comments
 (0)