Skip to content

Commit fea54b6

Browse files
Replace sycl::malloc_device with smart_malloc_device
Direct calls to host_task that asynchronously deallocate USM temporaries are replaced with calls to async_smart_free, which submits the host_task for us and transfers allocation ownership from the smart pointer to the host task.
1 parent ce02c6c commit fea54b6

File tree

5 files changed

+256
-497
lines changed

5 files changed

+256
-497
lines changed

dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp

Lines changed: 14 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -912,11 +912,11 @@ sycl::event binary_contig_matrix_contig_row_broadcast_impl(
912912
*(std::max_element(std::begin(sg_sizes), std::end(sg_sizes)));
913913

914914
std::size_t n1_padded = n1 + max_sgSize;
915-
argT2 *padded_vec = sycl::malloc_device<argT2>(n1_padded, exec_q);
915+
auto padded_vec_owner =
916+
dpctl::tensor::alloc_utils::smart_malloc_device<argT2>(n1_padded,
917+
exec_q);
918+
argT2 *padded_vec = padded_vec_owner.get();
916919

917-
if (padded_vec == nullptr) {
918-
throw std::runtime_error("Could not allocate memory on the device");
919-
}
920920
sycl::event make_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) {
921921
cgh.depends_on(depends); // ensure vec contains actual data
922922
cgh.parallel_for({n1_padded}, [=](sycl::id<1> id) {
@@ -948,13 +948,9 @@ sycl::event binary_contig_matrix_contig_row_broadcast_impl(
948948
mat, padded_vec, res, n_elems, n1));
949949
});
950950

951-
sycl::event tmp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
952-
cgh.depends_on(comp_ev);
953-
const sycl::context &ctx = exec_q.get_context();
954-
using dpctl::tensor::alloc_utils::sycl_free_noexcept;
955-
cgh.host_task(
956-
[ctx, padded_vec]() { sycl_free_noexcept(padded_vec, ctx); });
957-
});
951+
sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
952+
exec_q, {comp_ev}, padded_vec_owner);
953+
958954
host_tasks.push_back(tmp_cleanup_ev);
959955

960956
return comp_ev;
@@ -992,11 +988,10 @@ sycl::event binary_contig_row_contig_matrix_broadcast_impl(
992988
*(std::max_element(std::begin(sg_sizes), std::end(sg_sizes)));
993989

994990
std::size_t n1_padded = n1 + max_sgSize;
995-
argT2 *padded_vec = sycl::malloc_device<argT2>(n1_padded, exec_q);
996-
997-
if (padded_vec == nullptr) {
998-
throw std::runtime_error("Could not allocate memory on the device");
999-
}
991+
auto padded_vec_owner =
992+
dpctl::tensor::alloc_utils::smart_malloc_device<argT2>(n1_padded,
993+
exec_q);
994+
argT2 *padded_vec = padded_vec_owner.get();
1000995

1001996
sycl::event make_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) {
1002997
cgh.depends_on(depends); // ensure vec contains actual data
@@ -1029,13 +1024,9 @@ sycl::event binary_contig_row_contig_matrix_broadcast_impl(
10291024
padded_vec, mat, res, n_elems, n1));
10301025
});
10311026

1032-
sycl::event tmp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
1033-
cgh.depends_on(comp_ev);
1034-
const sycl::context &ctx = exec_q.get_context();
1035-
using dpctl::tensor::alloc_utils::sycl_free_noexcept;
1036-
cgh.host_task(
1037-
[ctx, padded_vec]() { sycl_free_noexcept(padded_vec, ctx); });
1038-
});
1027+
sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
1028+
exec_q, {comp_ev}, padded_vec_owner);
1029+
10391030
host_tasks.push_back(tmp_cleanup_ev);
10401031

10411032
return comp_ev;

dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -423,11 +423,11 @@ sycl::event binary_inplace_row_matrix_broadcast_impl(
423423
*(std::max_element(std::begin(sg_sizes), std::end(sg_sizes)));
424424

425425
std::size_t n1_padded = n1 + max_sgSize;
426-
argT *padded_vec = sycl::malloc_device<argT>(n1_padded, exec_q);
426+
auto padded_vec_owner =
427+
dpctl::tensor::alloc_utils::smart_malloc_device<argT>(n1_padded,
428+
exec_q);
429+
argT *padded_vec = padded_vec_owner.get();
427430

428-
if (padded_vec == nullptr) {
429-
throw std::runtime_error("Could not allocate memory on the device");
430-
}
431431
sycl::event make_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) {
432432
cgh.depends_on(depends); // ensure vec contains actual data
433433
cgh.parallel_for({n1_padded}, [=](sycl::id<1> id) {
@@ -459,13 +459,8 @@ sycl::event binary_inplace_row_matrix_broadcast_impl(
459459
n_elems, n1));
460460
});
461461

462-
sycl::event tmp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
463-
cgh.depends_on(comp_ev);
464-
const sycl::context &ctx = exec_q.get_context();
465-
using dpctl::tensor::alloc_utils::sycl_free_noexcept;
466-
cgh.host_task(
467-
[ctx, padded_vec]() { sycl_free_noexcept(padded_vec, ctx); });
468-
});
462+
sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
463+
exec_q, {comp_ev}, padded_vec_owner);
469464
host_tasks.push_back(tmp_cleanup_ev);
470465

471466
return comp_ev;

dpctl/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp

Lines changed: 22 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1026,18 +1026,15 @@ sycl::event dot_product_tree_impl(sycl::queue &exec_q,
10261026
(reduction_groups + preferred_reductions_per_wi * wg - 1) /
10271027
(preferred_reductions_per_wi * wg);
10281028

1029-
resTy *partially_reduced_tmp = sycl::malloc_device<resTy>(
1030-
batches * (reduction_groups + second_iter_reduction_groups_),
1031-
exec_q);
1032-
resTy *partially_reduced_tmp2 = nullptr;
1029+
// returns unique_ptr
1030+
auto partially_reduced_tmp_owner =
1031+
dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
1032+
batches * (reduction_groups + second_iter_reduction_groups_),
1033+
exec_q);
10331034

1034-
if (partially_reduced_tmp == nullptr) {
1035-
throw std::runtime_error("Unable to allocate device_memory");
1036-
}
1037-
else {
1038-
partially_reduced_tmp2 =
1039-
partially_reduced_tmp + reduction_groups * batches;
1040-
}
1035+
resTy *partially_reduced_tmp = partially_reduced_tmp_owner.get();
1036+
resTy *partially_reduced_tmp2 =
1037+
partially_reduced_tmp + reduction_groups * batches;
10411038

10421039
sycl::event first_reduction_ev;
10431040
{
@@ -1152,16 +1149,10 @@ sycl::event dot_product_tree_impl(sycl::queue &exec_q,
11521149
remaining_reduction_nelems, reductions_per_wi, reduction_groups,
11531150
in_out_iter_indexer, reduction_indexer, {dependent_ev});
11541151

1152+
// transfer ownership of USM allocation to host_task
11551153
sycl::event cleanup_host_task_event =
1156-
exec_q.submit([&](sycl::handler &cgh) {
1157-
cgh.depends_on(final_reduction_ev);
1158-
const sycl::context &ctx = exec_q.get_context();
1159-
1160-
using dpctl::tensor::alloc_utils::sycl_free_noexcept;
1161-
cgh.host_task([ctx, partially_reduced_tmp] {
1162-
sycl_free_noexcept(partially_reduced_tmp, ctx);
1163-
});
1164-
});
1154+
dpctl::tensor::alloc_utils::async_smart_free(
1155+
exec_q, {final_reduction_ev}, partially_reduced_tmp_owner);
11651156

11661157
return cleanup_host_task_event;
11671158
}
@@ -1282,18 +1273,15 @@ dot_product_contig_tree_impl(sycl::queue &exec_q,
12821273
(reduction_groups + preferred_reductions_per_wi * wg - 1) /
12831274
(preferred_reductions_per_wi * wg);
12841275

1285-
resTy *partially_reduced_tmp = sycl::malloc_device<resTy>(
1286-
batches * (reduction_groups + second_iter_reduction_groups_),
1287-
exec_q);
1288-
resTy *partially_reduced_tmp2 = nullptr;
1289-
1290-
if (partially_reduced_tmp == nullptr) {
1291-
throw std::runtime_error("Unable to allocate device_memory");
1292-
}
1293-
else {
1294-
partially_reduced_tmp2 =
1295-
partially_reduced_tmp + reduction_groups * batches;
1296-
}
1276+
// unique_ptr that owns temporary allocation for partial reductions
1277+
auto partially_reduced_tmp_owner =
1278+
dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
1279+
batches * (reduction_groups + second_iter_reduction_groups_),
1280+
exec_q);
1281+
// get raw pointers
1282+
resTy *partially_reduced_tmp = partially_reduced_tmp_owner.get();
1283+
resTy *partially_reduced_tmp2 =
1284+
partially_reduced_tmp + reduction_groups * batches;
12971285

12981286
sycl::event first_reduction_ev;
12991287
{
@@ -1401,15 +1389,8 @@ dot_product_contig_tree_impl(sycl::queue &exec_q,
14011389
in_out_iter_indexer, reduction_indexer, {dependent_ev});
14021390

14031391
sycl::event cleanup_host_task_event =
1404-
exec_q.submit([&](sycl::handler &cgh) {
1405-
cgh.depends_on(final_reduction_ev);
1406-
const sycl::context &ctx = exec_q.get_context();
1407-
1408-
using dpctl::tensor::alloc_utils::sycl_free_noexcept;
1409-
cgh.host_task([ctx, partially_reduced_tmp] {
1410-
sycl_free_noexcept(partially_reduced_tmp, ctx);
1411-
});
1412-
});
1392+
dpctl::tensor::alloc_utils::async_smart_free(
1393+
exec_q, {final_reduction_ev}, partially_reduced_tmp_owner);
14131394

14141395
return cleanup_host_task_event;
14151396
}

0 commit comments

Comments
 (0)