Skip to content

Commit fea54b6

Browse files
Replace sycl::malloc_device with smart_malloc_device
Direct calls to host_task that asynchronously deallocate USM temporaries are replaced with calls to async_smart_free, which submits the host_task for us and transfers allocation ownership from the smart pointer to the host task.
1 parent ce02c6c commit fea54b6

File tree

5 files changed

+256
-497
lines changed

5 files changed

+256
-497
lines changed

dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp

Lines changed: 14 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -912,11 +912,11 @@ sycl::event binary_contig_matrix_contig_row_broadcast_impl(
912912
*(std::max_element(std::begin(sg_sizes), std::end(sg_sizes)));
913913

914914
std::size_t n1_padded = n1 + max_sgSize;
915-
argT2 *padded_vec = sycl::malloc_device<argT2>(n1_padded, exec_q);
915+
auto padded_vec_owner =
916+
dpctl::tensor::alloc_utils::smart_malloc_device<argT2>(n1_padded,
917+
exec_q);
918+
argT2 *padded_vec = padded_vec_owner.get();
916919

917-
if (padded_vec == nullptr) {
918-
throw std::runtime_error("Could not allocate memory on the device");
919-
}
920920
sycl::event make_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) {
921921
cgh.depends_on(depends); // ensure vec contains actual data
922922
cgh.parallel_for({n1_padded}, [=](sycl::id<1> id) {
@@ -948,13 +948,9 @@ sycl::event binary_contig_matrix_contig_row_broadcast_impl(
948948
mat, padded_vec, res, n_elems, n1));
949949
});
950950

951-
sycl::event tmp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
952-
cgh.depends_on(comp_ev);
953-
const sycl::context &ctx = exec_q.get_context();
954-
using dpctl::tensor::alloc_utils::sycl_free_noexcept;
955-
cgh.host_task(
956-
[ctx, padded_vec]() { sycl_free_noexcept(padded_vec, ctx); });
957-
});
951+
sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
952+
exec_q, {comp_ev}, padded_vec_owner);
953+
958954
host_tasks.push_back(tmp_cleanup_ev);
959955

960956
return comp_ev;
@@ -992,11 +988,10 @@ sycl::event binary_contig_row_contig_matrix_broadcast_impl(
992988
*(std::max_element(std::begin(sg_sizes), std::end(sg_sizes)));
993989

994990
std::size_t n1_padded = n1 + max_sgSize;
995-
argT2 *padded_vec = sycl::malloc_device<argT2>(n1_padded, exec_q);
996-
997-
if (padded_vec == nullptr) {
998-
throw std::runtime_error("Could not allocate memory on the device");
999-
}
991+
auto padded_vec_owner =
992+
dpctl::tensor::alloc_utils::smart_malloc_device<argT2>(n1_padded,
993+
exec_q);
994+
argT2 *padded_vec = padded_vec_owner.get();
1000995

1001996
sycl::event make_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) {
1002997
cgh.depends_on(depends); // ensure vec contains actual data
@@ -1029,13 +1024,9 @@ sycl::event binary_contig_row_contig_matrix_broadcast_impl(
10291024
padded_vec, mat, res, n_elems, n1));
10301025
});
10311026

1032-
sycl::event tmp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
1033-
cgh.depends_on(comp_ev);
1034-
const sycl::context &ctx = exec_q.get_context();
1035-
using dpctl::tensor::alloc_utils::sycl_free_noexcept;
1036-
cgh.host_task(
1037-
[ctx, padded_vec]() { sycl_free_noexcept(padded_vec, ctx); });
1038-
});
1027+
sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
1028+
exec_q, {comp_ev}, padded_vec_owner);
1029+
10391030
host_tasks.push_back(tmp_cleanup_ev);
10401031

10411032
return comp_ev;

dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -423,11 +423,11 @@ sycl::event binary_inplace_row_matrix_broadcast_impl(
423423
*(std::max_element(std::begin(sg_sizes), std::end(sg_sizes)));
424424

425425
std::size_t n1_padded = n1 + max_sgSize;
426-
argT *padded_vec = sycl::malloc_device<argT>(n1_padded, exec_q);
426+
auto padded_vec_owner =
427+
dpctl::tensor::alloc_utils::smart_malloc_device<argT>(n1_padded,
428+
exec_q);
429+
argT *padded_vec = padded_vec_owner.get();
427430

428-
if (padded_vec == nullptr) {
429-
throw std::runtime_error("Could not allocate memory on the device");
430-
}
431431
sycl::event make_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) {
432432
cgh.depends_on(depends); // ensure vec contains actual data
433433
cgh.parallel_for({n1_padded}, [=](sycl::id<1> id) {
@@ -459,13 +459,8 @@ sycl::event binary_inplace_row_matrix_broadcast_impl(
459459
n_elems, n1));
460460
});
461461

462-
sycl::event tmp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
463-
cgh.depends_on(comp_ev);
464-
const sycl::context &ctx = exec_q.get_context();
465-
using dpctl::tensor::alloc_utils::sycl_free_noexcept;
466-
cgh.host_task(
467-
[ctx, padded_vec]() { sycl_free_noexcept(padded_vec, ctx); });
468-
});
462+
sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
463+
exec_q, {comp_ev}, padded_vec_owner);
469464
host_tasks.push_back(tmp_cleanup_ev);
470465

471466
return comp_ev;

dpctl/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp

Lines changed: 22 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1026,18 +1026,15 @@ sycl::event dot_product_tree_impl(sycl::queue &exec_q,
10261026
(reduction_groups + preferred_reductions_per_wi * wg - 1) /
10271027
(preferred_reductions_per_wi * wg);
10281028

1029-
resTy *partially_reduced_tmp = sycl::malloc_device<resTy>(
1030-
batches * (reduction_groups + second_iter_reduction_groups_),
1031-
exec_q);
1032-
resTy *partially_reduced_tmp2 = nullptr;
1029+
// returns unique_ptr
1030+
auto partially_reduced_tmp_owner =
1031+
dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
1032+
batches * (reduction_groups + second_iter_reduction_groups_),
1033+
exec_q);
10331034

1034-
if (partially_reduced_tmp == nullptr) {
1035-
throw std::runtime_error("Unable to allocate device_memory");
1036-
}
1037-
else {
1038-
partially_reduced_tmp2 =
1039-
partially_reduced_tmp + reduction_groups * batches;
1040-
}
1035+
resTy *partially_reduced_tmp = partially_reduced_tmp_owner.get();
1036+
resTy *partially_reduced_tmp2 =
1037+
partially_reduced_tmp + reduction_groups * batches;
10411038

10421039
sycl::event first_reduction_ev;
10431040
{
@@ -1152,16 +1149,10 @@ sycl::event dot_product_tree_impl(sycl::queue &exec_q,
11521149
remaining_reduction_nelems, reductions_per_wi, reduction_groups,
11531150
in_out_iter_indexer, reduction_indexer, {dependent_ev});
11541151

1152+
// transfer ownership of USM allocation to host_task
11551153
sycl::event cleanup_host_task_event =
1156-
exec_q.submit([&](sycl::handler &cgh) {
1157-
cgh.depends_on(final_reduction_ev);
1158-
const sycl::context &ctx = exec_q.get_context();
1159-
1160-
using dpctl::tensor::alloc_utils::sycl_free_noexcept;
1161-
cgh.host_task([ctx, partially_reduced_tmp] {
1162-
sycl_free_noexcept(partially_reduced_tmp, ctx);
1163-
});
1164-
});
1154+
dpctl::tensor::alloc_utils::async_smart_free(
1155+
exec_q, {final_reduction_ev}, partially_reduced_tmp_owner);
11651156

11661157
return cleanup_host_task_event;
11671158
}
@@ -1282,18 +1273,15 @@ dot_product_contig_tree_impl(sycl::queue &exec_q,
12821273
(reduction_groups + preferred_reductions_per_wi * wg - 1) /
12831274
(preferred_reductions_per_wi * wg);
12841275

1285-
resTy *partially_reduced_tmp = sycl::malloc_device<resTy>(
1286-
batches * (reduction_groups + second_iter_reduction_groups_),
1287-
exec_q);
1288-
resTy *partially_reduced_tmp2 = nullptr;
1289-
1290-
if (partially_reduced_tmp == nullptr) {
1291-
throw std::runtime_error("Unable to allocate device_memory");
1292-
}
1293-
else {
1294-
partially_reduced_tmp2 =
1295-
partially_reduced_tmp + reduction_groups * batches;
1296-
}
1276+
// unique_ptr that owns temporary allocation for partial reductions
1277+
auto partially_reduced_tmp_owner =
1278+
dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
1279+
batches * (reduction_groups + second_iter_reduction_groups_),
1280+
exec_q);
1281+
// get raw pointers
1282+
resTy *partially_reduced_tmp = partially_reduced_tmp_owner.get();
1283+
resTy *partially_reduced_tmp2 =
1284+
partially_reduced_tmp + reduction_groups * batches;
12971285

12981286
sycl::event first_reduction_ev;
12991287
{
@@ -1401,15 +1389,8 @@ dot_product_contig_tree_impl(sycl::queue &exec_q,
14011389
in_out_iter_indexer, reduction_indexer, {dependent_ev});
14021390

14031391
sycl::event cleanup_host_task_event =
1404-
exec_q.submit([&](sycl::handler &cgh) {
1405-
cgh.depends_on(final_reduction_ev);
1406-
const sycl::context &ctx = exec_q.get_context();
1407-
1408-
using dpctl::tensor::alloc_utils::sycl_free_noexcept;
1409-
cgh.host_task([ctx, partially_reduced_tmp] {
1410-
sycl_free_noexcept(partially_reduced_tmp, ctx);
1411-
});
1412-
});
1392+
dpctl::tensor::alloc_utils::async_smart_free(
1393+
exec_q, {final_reduction_ev}, partially_reduced_tmp_owner);
14131394

14141395
return cleanup_host_task_event;
14151396
}

0 commit comments

Comments
 (0)