Skip to content

Commit 9d77faf

Browse files
Change device_allocate_and_pack to return a unique_ptr
The unique_ptr owns the allocation, ensuring no leaks during exception handling. This also allows async_smart_free to be used to schedule asynchronous deallocation of USM temporaries.
1 parent 9841f9e commit 9d77faf

21 files changed

+392
-745
lines changed

dpctl/tensor/libtensor/include/utils/offset_utils.hpp

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,13 @@
2828

2929
#include <algorithm>
3030
#include <cstddef>
31-
#include <sycl/sycl.hpp>
31+
#include <memory> // for std::make_shared, std::unique_ptr
3232
#include <tuple>
33+
#include <utility> // for std::move, std::forward
3334
#include <vector>
3435

36+
#include <sycl/sycl.hpp>
37+
3538
#include "kernels/dpctl_tensor_types.hpp"
3639
#include "utils/strided_iters.hpp"
3740
#include "utils/sycl_alloc_utils.hpp"
@@ -84,7 +87,9 @@ std::vector<T, A> concat(std::vector<T, A> lhs, Vs &&...vs)
8487
} // namespace detail
8588

8689
template <typename indT, typename... Vs>
87-
std::tuple<indT *, std::size_t, sycl::event>
90+
std::tuple<std::unique_ptr<indT, dpctl::tensor::alloc_utils::USMDeleter>,
91+
std::size_t,
92+
sycl::event>
8893
device_allocate_and_pack(sycl::queue &q,
8994
std::vector<sycl::event> &host_task_events,
9095
Vs &&...vs)
@@ -105,25 +110,24 @@ device_allocate_and_pack(sycl::queue &q,
105110
std::make_shared<shT>(std::move(packed_shape_strides));
106111

107112
auto sz = packed_shape_strides_owner->size();
108-
indT *shape_strides = sycl::malloc_device<indT>(sz, q);
109-
110-
if (shape_strides == nullptr) {
111-
return std::make_tuple(shape_strides, 0, sycl::event());
112-
}
113+
auto shape_strides_owner =
114+
dpctl::tensor::alloc_utils::smart_malloc_device<indT>(sz, q);
115+
indT *shape_strides = shape_strides_owner.get();
113116

114117
sycl::event copy_ev =
115118
q.copy<indT>(packed_shape_strides_owner->data(), shape_strides, sz);
116119

117120
sycl::event cleanup_host_task_ev = q.submit([&](sycl::handler &cgh) {
118121
cgh.depends_on(copy_ev);
119-
cgh.host_task([packed_shape_strides_owner] {
122+
cgh.host_task([packed_shape_strides_owner =
123+
std::move(packed_shape_strides_owner)] {
120124
// increment shared pointer ref-count to keep it alive
121125
// till copy operation completes;
122126
});
123127
});
124128
host_task_events.push_back(cleanup_host_task_ev);
125129

126-
return std::make_tuple(shape_strides, sz, copy_ev);
130+
return std::make_tuple(std::move(shape_strides_owner), sz, copy_ev);
127131
}
128132

129133
struct NoOpIndexer

dpctl/tensor/libtensor/source/accumulators.cpp

Lines changed: 15 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -196,14 +196,11 @@ std::size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask,
196196
: mask_positions_strided_i64_dispatch_vector[mask_typeid];
197197

198198
using dpctl::tensor::offset_utils::device_allocate_and_pack;
199-
const auto &ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
199+
auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
200200
exec_q, host_task_events, compact_shape, compact_strides);
201-
py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple);
202-
if (shape_strides == nullptr) {
203-
sycl::event::wait(host_task_events);
204-
throw std::runtime_error("Unexpected error");
205-
}
201+
auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple));
206202
sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple);
203+
const py::ssize_t *shape_strides = shape_strides_owner.get();
207204

208205
if (2 * static_cast<std::size_t>(nd) != std::get<1>(ptr_size_event_tuple)) {
209206
{
@@ -212,8 +209,8 @@ std::size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask,
212209
copy_shape_ev.wait();
213210
sycl::event::wait(host_task_events);
214211

215-
using dpctl::tensor::alloc_utils::sycl_free_noexcept;
216-
sycl_free_noexcept(shape_strides, exec_q);
212+
// ensure deleter of smart pointer is invoked with GIL released
// NOTE(review): unique_ptr::release() relinquishes ownership WITHOUT
// invoking the deleter — the USM allocation leaks; use reset() here
213+
shape_strides_owner.release();
217214
}
218215
throw std::runtime_error("Unexpected error");
219216
}
@@ -233,8 +230,8 @@ std::size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask,
233230
cumsum_data, host_task_events, dependent_events);
234231

235232
sycl::event::wait(host_task_events);
236-
using dpctl::tensor::alloc_utils::sycl_free_noexcept;
237-
sycl_free_noexcept(shape_strides, exec_q);
233+
// ensure deleter of smart pointer is invoked with GIL released
// NOTE(review): .release() does not call the deleter — this leaks the
// USM temporary; .reset() is needed to actually free it
234+
shape_strides_owner.release();
238235
}
239236

240237
return total_set;
@@ -356,24 +353,22 @@ std::size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src,
356353
}
357354

358355
using dpctl::tensor::offset_utils::device_allocate_and_pack;
359-
const auto &ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
356+
auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
360357
exec_q, host_task_events, compact_shape, compact_strides);
361-
py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple);
362-
if (shape_strides == nullptr) {
363-
sycl::event::wait(host_task_events);
364-
throw std::runtime_error("Unexpected error");
365-
}
358+
auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple));
366359
sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple);
360+
const py::ssize_t *shape_strides = shape_strides_owner.get();
367361

368362
if (2 * static_cast<std::size_t>(nd) != std::get<1>(ptr_size_event_tuple)) {
369363
{
370364
py::gil_scoped_release release;
371365

372366
copy_shape_ev.wait();
373367
sycl::event::wait(host_task_events);
368+
369+
// ensure USM deleter is called with GIL released
// NOTE(review): .release() detaches the pointer without calling the
// USM deleter — allocation leaks; should be .reset()
370+
shape_strides_owner.release();
374371
}
375-
using dpctl::tensor::alloc_utils::sycl_free_noexcept;
376-
sycl_free_noexcept(shape_strides, exec_q);
377372
throw std::runtime_error("Unexpected error");
378373
}
379374

@@ -391,8 +386,8 @@ std::size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src,
391386
py::gil_scoped_release release;
392387
sycl::event::wait(host_task_events);
393388

394-
using dpctl::tensor::alloc_utils::sycl_free_noexcept;
395-
sycl_free_noexcept(shape_strides, exec_q);
389+
// ensure USM deleter is called with GIL released
// NOTE(review): .release() detaches the pointer without calling the
// USM deleter — allocation leaks; should be .reset()
390+
shape_strides_owner.release();
396391
}
397392

398393
return total;

dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp

Lines changed: 18 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -200,18 +200,18 @@ py_accumulate_over_axis(const dpctl::tensor::usm_ndarray &src,
200200
}
201201

202202
using dpctl::tensor::offset_utils::device_allocate_and_pack;
203-
const auto &ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
203+
auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
204204
exec_q, host_task_events, simplified_iter_shape,
205205
simplified_iter_src_strides, simplified_iter_dst_strides, acc_shape,
206206
acc_src_strides, acc_dst_strides);
207-
py::ssize_t *packed_shapes_and_strides = std::get<0>(ptr_size_event_tuple);
208-
if (packed_shapes_and_strides == nullptr) {
209-
throw std::runtime_error("Unexpected error");
210-
}
207+
auto packed_shapes_and_strides_owner =
208+
std::move(std::get<0>(ptr_size_event_tuple));
211209
const auto &copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple);
210+
const py::ssize_t *packed_shapes_and_strides =
211+
packed_shapes_and_strides_owner.get();
212212

213-
py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides;
214-
py::ssize_t *acc_shapes_and_strides =
213+
const py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides;
214+
const py::ssize_t *acc_shapes_and_strides =
215215
packed_shapes_and_strides + 3 * simplified_iter_shape.size();
216216

217217
std::vector<sycl::event> all_deps;
@@ -224,14 +224,8 @@ py_accumulate_over_axis(const dpctl::tensor::usm_ndarray &src,
224224
iter_shape_and_strides, iter_src_offset, iter_dst_offset, acc_nd,
225225
acc_shapes_and_strides, dst_data, host_task_events, all_deps);
226226

227-
sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
228-
cgh.depends_on(acc_ev);
229-
const auto &ctx = exec_q.get_context();
230-
using dpctl::tensor::alloc_utils::sycl_free_noexcept;
231-
cgh.host_task([ctx, packed_shapes_and_strides] {
232-
sycl_free_noexcept(packed_shapes_and_strides, ctx);
233-
});
234-
});
227+
sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
228+
exec_q, {acc_ev}, packed_shapes_and_strides_owner);
235229
host_task_events.push_back(temp_cleanup_ev);
236230

237231
return std::make_pair(
@@ -384,18 +378,18 @@ std::pair<sycl::event, sycl::event> py_accumulate_final_axis_include_initial(
384378
}
385379

386380
using dpctl::tensor::offset_utils::device_allocate_and_pack;
387-
const auto &ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
381+
auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
388382
exec_q, host_task_events, simplified_iter_shape,
389383
simplified_iter_src_strides, simplified_iter_dst_strides, acc_shape,
390384
acc_src_strides, acc_dst_strides);
391-
py::ssize_t *packed_shapes_and_strides = std::get<0>(ptr_size_event_tuple);
392-
if (packed_shapes_and_strides == nullptr) {
393-
throw std::runtime_error("Unexpected error");
394-
}
385+
auto packed_shapes_and_strides_owner =
386+
std::move(std::get<0>(ptr_size_event_tuple));
395387
const auto &copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple);
388+
const py::ssize_t *packed_shapes_and_strides =
389+
packed_shapes_and_strides_owner.get();
396390

397-
py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides;
398-
py::ssize_t *acc_shapes_and_strides =
391+
const py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides;
392+
const py::ssize_t *acc_shapes_and_strides =
399393
packed_shapes_and_strides + 3 * simplified_iter_shape.size();
400394

401395
std::vector<sycl::event> all_deps;
@@ -408,14 +402,8 @@ std::pair<sycl::event, sycl::event> py_accumulate_final_axis_include_initial(
408402
iter_shape_and_strides, iter_src_offset, iter_dst_offset, acc_nd,
409403
acc_shapes_and_strides, dst_data, host_task_events, all_deps);
410404

411-
sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
412-
cgh.depends_on(acc_ev);
413-
const auto &ctx = exec_q.get_context();
414-
using dpctl::tensor::alloc_utils::sycl_free_noexcept;
415-
cgh.host_task([ctx, packed_shapes_and_strides] {
416-
sycl_free_noexcept(packed_shapes_and_strides, ctx);
417-
});
418-
});
405+
sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
406+
exec_q, {acc_ev}, packed_shapes_and_strides_owner);
419407
host_task_events.push_back(temp_cleanup_ev);
420408

421409
return std::make_pair(

0 commit comments

Comments
 (0)