From 620bdd0068b20e8fc2191bf0078dbb915c474216 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 8 Jan 2025 09:02:43 -0600 Subject: [PATCH 01/11] Replace use of anonymous namespace in kernels/elementwise_functions headers Using anonymous namespace in header files is against best C++ practices, since entities in anonymous namespace have internal linkage, and every translation unit that includes the header file would have its own copy, increasing compilation time and bloating the binary size. --- .../kernels/elementwise_functions/abs.hpp | 9 ++++---- .../kernels/elementwise_functions/acos.hpp | 9 ++++---- .../kernels/elementwise_functions/acosh.hpp | 9 ++++---- .../kernels/elementwise_functions/add.hpp | 15 +++++++----- .../kernels/elementwise_functions/angle.hpp | 9 ++++---- .../kernels/elementwise_functions/asin.hpp | 9 ++++---- .../kernels/elementwise_functions/asinh.hpp | 9 ++++---- .../kernels/elementwise_functions/atan.hpp | 9 ++++---- .../kernels/elementwise_functions/atan2.hpp | 12 +++++----- .../kernels/elementwise_functions/atanh.hpp | 9 ++++---- .../elementwise_functions/bitwise_and.hpp | 20 ++++++++-------- .../elementwise_functions/bitwise_invert.hpp | 12 +++++----- .../bitwise_left_shift.hpp | 22 ++++++++++-------- .../elementwise_functions/bitwise_or.hpp | 21 +++++++++-------- .../bitwise_right_shift.hpp | 23 +++++++++++-------- .../elementwise_functions/bitwise_xor.hpp | 21 +++++++++-------- .../kernels/elementwise_functions/cbrt.hpp | 9 ++++---- .../kernels/elementwise_functions/ceil.hpp | 9 ++++---- .../kernels/elementwise_functions/conj.hpp | 9 ++++---- .../elementwise_functions/copysign.hpp | 12 +++++----- .../kernels/elementwise_functions/cos.hpp | 9 ++++---- .../kernels/elementwise_functions/cosh.hpp | 9 ++++---- .../kernels/elementwise_functions/equal.hpp | 12 +++++----- .../kernels/elementwise_functions/exp.hpp | 9 ++++---- .../kernels/elementwise_functions/exp2.hpp | 10 ++++---- 
.../kernels/elementwise_functions/expm1.hpp | 9 ++++---- .../kernels/elementwise_functions/floor.hpp | 9 ++++---- .../elementwise_functions/floor_divide.hpp | 21 +++++++++-------- .../kernels/elementwise_functions/greater.hpp | 13 ++++++----- .../elementwise_functions/greater_equal.hpp | 12 +++++----- .../kernels/elementwise_functions/hypot.hpp | 12 +++++----- .../kernels/elementwise_functions/imag.hpp | 9 ++++---- .../elementwise_functions/isfinite.hpp | 12 +++++----- .../kernels/elementwise_functions/isinf.hpp | 9 ++++---- .../kernels/elementwise_functions/isnan.hpp | 9 ++++---- .../kernels/elementwise_functions/less.hpp | 12 +++++----- .../elementwise_functions/less_equal.hpp | 12 +++++----- .../kernels/elementwise_functions/log.hpp | 9 ++++---- .../kernels/elementwise_functions/log10.hpp | 9 ++++---- .../kernels/elementwise_functions/log1p.hpp | 9 ++++---- .../kernels/elementwise_functions/log2.hpp | 9 ++++---- .../elementwise_functions/logaddexp.hpp | 12 +++++----- .../elementwise_functions/logical_and.hpp | 12 +++++----- .../elementwise_functions/logical_not.hpp | 12 +++++----- .../elementwise_functions/logical_or.hpp | 12 +++++----- .../elementwise_functions/logical_xor.hpp | 12 +++++----- .../kernels/elementwise_functions/maximum.hpp | 12 +++++----- .../kernels/elementwise_functions/minimum.hpp | 12 +++++----- .../elementwise_functions/multiply.hpp | 20 ++++++++-------- .../elementwise_functions/negative.hpp | 11 ++++----- .../elementwise_functions/nextafter.hpp | 12 +++++----- .../elementwise_functions/not_equal.hpp | 12 +++++----- .../elementwise_functions/positive.hpp | 11 ++++----- .../kernels/elementwise_functions/pow.hpp | 18 +++++++-------- .../kernels/elementwise_functions/proj.hpp | 9 ++++---- .../kernels/elementwise_functions/real.hpp | 9 ++++---- .../elementwise_functions/reciprocal.hpp | 11 ++++----- .../elementwise_functions/remainder.hpp | 20 ++++++++-------- .../kernels/elementwise_functions/round.hpp | 9 ++++---- 
.../kernels/elementwise_functions/rsqrt.hpp | 9 ++++---- .../kernels/elementwise_functions/sign.hpp | 9 ++++---- .../kernels/elementwise_functions/signbit.hpp | 11 ++++----- .../kernels/elementwise_functions/sin.hpp | 9 ++++---- .../kernels/elementwise_functions/sinh.hpp | 9 ++++---- .../kernels/elementwise_functions/sqrt.hpp | 9 ++++---- .../kernels/elementwise_functions/square.hpp | 11 ++++----- .../elementwise_functions/subtract.hpp | 20 ++++++++-------- .../kernels/elementwise_functions/tan.hpp | 9 ++++---- .../kernels/elementwise_functions/tanh.hpp | 9 ++++---- .../elementwise_functions/true_divide.hpp | 20 ++++++++-------- .../kernels/elementwise_functions/trunc.hpp | 9 ++++---- 71 files changed, 441 insertions(+), 400 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp index af3c014ec9..48c8e3e4dd 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp @@ -125,7 +125,7 @@ template struct AbsOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -141,7 +141,7 @@ template struct AbsContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // namespace +} // namespace hyperparam_detail template class abs_contig_kernel; @@ -153,8 +153,9 @@ sycl::event abs_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = AbsContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vec = AbsContigHyperparameterSet::n_vecs; + using AbsHS = hyperparam_detail::AbsContigHyperparameterSet; + constexpr std::uint8_t vec_sz = AbsHS::vec_sz; + constexpr std::uint8_t n_vec = AbsHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, AbsOutputType, AbsContigFunctor, 
abs_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp index e0b9e57339..7dbfb6618c 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp @@ -159,7 +159,7 @@ template struct AcosOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -175,7 +175,7 @@ template struct AcosContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // namespace +} // end of namespace hyperparam_detail template class acos_contig_kernel; @@ -187,8 +187,9 @@ sycl::event acos_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = AcosContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vec = AcosContigHyperparameterSet::n_vecs; + using AcosHS = hyperparam_detail::AcosContigHyperparameterSet; + constexpr std::uint8_t vec_sz = AcosHS::vec_sz; + constexpr std::uint8_t n_vec = AcosHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, AcosOutputType, AcosContigFunctor, acos_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp index b0e33e0ac5..a81ff3da99 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp @@ -186,7 +186,7 @@ template struct AcoshOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -203,7 +203,7 @@ template struct AcoshContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; 
-} // namespace +} // end of namespace hyperparam_detail template class acosh_contig_kernel; @@ -215,8 +215,9 @@ sycl::event acosh_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = AcoshContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vec = AcoshContigHyperparameterSet::n_vecs; + using AcoshHS = hyperparam_detail::AcoshContigHyperparameterSet; + constexpr std::uint8_t vec_sz = AcoshHS::vec_sz; + constexpr std::uint8_t n_vec = AcoshHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, AcoshOutputType, AcoshContigFunctor, acosh_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp index 157234bb1f..476e7b52b9 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp @@ -199,7 +199,7 @@ template struct AddOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -252,7 +252,7 @@ template struct AddContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template &depends = {}) { - constexpr auto vec_sz = AddContigHyperparameterSet::vec_sz; - constexpr auto n_vecs = AddContigHyperparameterSet::n_vecs; + using AddHS = hyperparam_detail::AddContigHyperparameterSet; + constexpr auto vec_sz = AddHS::vec_sz; + constexpr auto n_vecs = AddHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, AddOutputType, AddContigFunctor, add_contig_kernel, @@ -550,8 +551,10 @@ add_inplace_contig_impl(sycl::queue &exec_q, ssize_t res_offset, const std::vector &depends = {}) { - constexpr auto vec_sz = AddContigHyperparameterSet::vec_sz; - constexpr auto n_vecs = 
AddContigHyperparameterSet::n_vecs; + constexpr auto vec_sz = + hyperparam_detail::AddContigHyperparameterSet::vec_sz; + constexpr auto n_vecs = + hyperparam_detail::AddContigHyperparameterSet::n_vecs; return elementwise_common::binary_inplace_contig_impl< argTy, resTy, AddInplaceContigFunctor, add_inplace_contig_kernel, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp index 546012d86e..726f90ba81 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp @@ -102,7 +102,7 @@ template struct AngleOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -119,7 +119,7 @@ template struct AngleContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class angle_contig_kernel; @@ -131,8 +131,9 @@ sycl::event angle_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = AngleContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vec = AngleContigHyperparameterSet::n_vecs; + using AngleHS = hyperparam_detail::AngleContigHyperparameterSet; + constexpr std::uint8_t vec_sz = AngleHS::vec_sz; + constexpr std::uint8_t n_vec = AngleHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, AngleOutputType, AngleContigFunctor, angle_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp index 804fb58a44..70b48895b4 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp 
@@ -179,7 +179,7 @@ template struct AsinOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -196,7 +196,7 @@ template struct AsinContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class asin_contig_kernel; @@ -208,8 +208,9 @@ sycl::event asin_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = AsinContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vec = AsinContigHyperparameterSet::n_vecs; + using AddHS = hyperparam_detail::AsinContigHyperparameterSet; + constexpr std::uint8_t vec_sz = AddHS::vec_sz; + constexpr std::uint8_t n_vec = AddHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, AsinOutputType, AsinContigFunctor, asin_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp index 78d72b7ece..420ba3246c 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp @@ -162,7 +162,7 @@ template struct AsinhOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -179,7 +179,7 @@ template struct AsinhContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class asinh_contig_kernel; @@ -191,8 +191,9 @@ sycl::event asinh_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = AsinhContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vec = 
AsinhContigHyperparameterSet::n_vecs; + using AsinhHS = hyperparam_detail::AsinhContigHyperparameterSet; + constexpr std::uint8_t vec_sz = AsinhHS::vec_sz; + constexpr std::uint8_t n_vec = AsinhHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, AsinhOutputType, AsinhContigFunctor, asinh_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp index 7e0e1b9b18..29c4941d76 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp @@ -172,7 +172,7 @@ template struct AtanOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -189,7 +189,7 @@ template struct AtanContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class atan_contig_kernel; @@ -201,8 +201,9 @@ sycl::event atan_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = AtanContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vec = AtanContigHyperparameterSet::n_vecs; + using AtanHS = hyperparam_detail::AtanContigHyperparameterSet; + constexpr std::uint8_t vec_sz = AtanHS::vec_sz; + constexpr std::uint8_t n_vec = AtanHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, AtanOutputType, AtanContigFunctor, atan_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp index 42503e98a6..32384fc8a9 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp @@ -106,7 
+106,7 @@ template struct Atan2OutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -123,7 +123,7 @@ template struct Atan2ContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template &depends = {}) { - constexpr std::uint8_t vec_sz = - Atan2ContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - Atan2ContigHyperparameterSet::n_vecs; + using Atan2HS = + hyperparam_detail::Atan2ContigHyperparameterSet; + constexpr std::uint8_t vec_sz = Atan2HS::vec_sz; + constexpr std::uint8_t n_vecs = Atan2HS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, Atan2OutputType, Atan2ContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp index eb23ded724..39f11e0f90 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp @@ -163,7 +163,7 @@ template struct AtanhOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -180,7 +180,7 @@ template struct AtanhContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class atanh_contig_kernel; @@ -192,8 +192,9 @@ sycl::event atanh_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = AtanhContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vec = AtanhContigHyperparameterSet::n_vecs; + using AtanhHS = hyperparam_detail::AtanhContigHyperparameterSet; + constexpr std::uint8_t vec_sz = AtanhHS::vec_sz; 
+ constexpr std::uint8_t n_vec = AtanhHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, AtanhOutputType, AtanhContigFunctor, atanh_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp index 310de9a9fb..cdea67f080 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp @@ -163,7 +163,7 @@ template struct BitwiseAndOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -180,7 +180,7 @@ struct BitwiseAndContigHyperparameterSet constexpr static auto vec_sz = value_type::vec_sz; constexpr static auto n_vecs = value_type::n_vecs; }; -} // namespace +} // end of namespace hyperparam_detail template &depends = {}) { - constexpr std::uint8_t vec_sz = - BitwiseAndContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vec = - BitwiseAndContigHyperparameterSet::n_vecs; + using BitwiseAndHS = + hyperparam_detail::BitwiseAndContigHyperparameterSet; + constexpr std::uint8_t vec_sz = BitwiseAndHS::vec_sz; + constexpr std::uint8_t n_vec = BitwiseAndHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, BitwiseAndOutputType, BitwiseAndContigFunctor, @@ -389,10 +389,10 @@ bitwise_and_inplace_contig_impl(sycl::queue &exec_q, ssize_t res_offset, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = - BitwiseAndContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - BitwiseAndContigHyperparameterSet::n_vecs; + using BitwiseAndHS = + hyperparam_detail::BitwiseAndContigHyperparameterSet; + constexpr std::uint8_t vec_sz = BitwiseAndHS::vec_sz; + constexpr std::uint8_t n_vecs = BitwiseAndHS::n_vecs; return elementwise_common::binary_inplace_contig_impl< argTy, 
resTy, BitwiseAndInplaceContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp index 52cd0ddcb8..d96bf35afe 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp @@ -118,7 +118,7 @@ template struct BitwiseInvertOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -135,7 +135,7 @@ template struct BitwiseInvertContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class bitwise_invert_contig_kernel; @@ -148,10 +148,10 @@ bitwise_invert_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = - BitwiseInvertContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vec = - BitwiseInvertContigHyperparameterSet::n_vecs; + using BitwiseInvertHS = + hyperparam_detail::BitwiseInvertContigHyperparameterSet; + constexpr std::uint8_t vec_sz = BitwiseInvertHS::vec_sz; + constexpr std::uint8_t n_vec = BitwiseInvertHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, BitwiseInvertOutputType, BitwiseInvertContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp index b735025c6b..3ff2d5e315 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp @@ -172,7 +172,7 @@ template struct BitwiseLeftShiftOutputType static constexpr bool is_defined = !std::is_same_v; }; 
-namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -190,7 +190,7 @@ struct BitwiseLeftShiftContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template &depends = {}) { - constexpr std::uint8_t vec_sz = - BitwiseLeftShiftContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - BitwiseLeftShiftContigHyperparameterSet::n_vecs; + using BitwiseLSHS = + hyperparam_detail::BitwiseLeftShiftContigHyperparameterSet; + constexpr std::uint8_t vec_sz = BitwiseLSHS::vec_sz; + constexpr std::uint8_t n_vecs = BitwiseLSHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, BitwiseLeftShiftOutputType, @@ -403,10 +404,11 @@ sycl::event bitwise_left_shift_inplace_contig_impl( ssize_t res_offset, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = - BitwiseLeftShiftContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - BitwiseLeftShiftContigHyperparameterSet::n_vecs; + using BitwiseLSHS = + hyperparam_detail::BitwiseLeftShiftContigHyperparameterSet; + constexpr std::uint8_t vec_sz = BitwiseLSHS::vec_sz; + constexpr std::uint8_t n_vecs = BitwiseLSHS::n_vecs; return elementwise_common::binary_inplace_contig_impl< argTy, resTy, BitwiseLeftShiftInplaceContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp index df062fe58a..f1cbb63afc 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp @@ -162,7 +162,7 @@ template struct BitwiseOrOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -180,7 +180,7 @@ struct 
BitwiseOrContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template &depends = {}) { - constexpr std::uint8_t vec_sz = - BitwiseOrContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - BitwiseOrContigHyperparameterSet::n_vecs; + using BitwiseOrHS = + hyperparam_detail::BitwiseOrContigHyperparameterSet; + constexpr std::uint8_t vec_sz = BitwiseOrHS::vec_sz; + constexpr std::uint8_t n_vecs = BitwiseOrHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, BitwiseOrOutputType, BitwiseOrContigFunctor, @@ -384,10 +384,11 @@ bitwise_or_inplace_contig_impl(sycl::queue &exec_q, ssize_t res_offset, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = - BitwiseOrContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - BitwiseOrContigHyperparameterSet::n_vecs; + using BitwiseOrHS = + hyperparam_detail::BitwiseOrContigHyperparameterSet; + + constexpr std::uint8_t vec_sz = BitwiseOrHS::vec_sz; + constexpr std::uint8_t n_vecs = BitwiseOrHS::n_vecs; return elementwise_common::binary_inplace_contig_impl< argTy, resTy, BitwiseOrInplaceContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp index 6e5df0411a..580ee7d828 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp @@ -174,7 +174,7 @@ template struct BitwiseRightShiftOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -192,7 +192,7 @@ struct BitwiseRightShiftContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // 
namespace hyperparam_detail template &depends = {}) { - constexpr std::uint8_t vec_sz = - BitwiseRightShiftContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - BitwiseRightShiftContigHyperparameterSet::n_vecs; + using BitwiseRSHS = + hyperparam_detail::BitwiseRightShiftContigHyperparameterSet; + constexpr std::uint8_t vec_sz = BitwiseRSHS::vec_sz; + constexpr std::uint8_t n_vecs = BitwiseRSHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, BitwiseRightShiftOutputType, @@ -407,11 +408,13 @@ sycl::event bitwise_right_shift_inplace_contig_impl( ssize_t res_offset, const std::vector &depends = {}) { + using BitwiseRSHS = + hyperparam_detail::BitwiseRightShiftContigHyperparameterSet; + // res = OP(res, arg) - constexpr std::uint8_t vec_sz = - BitwiseRightShiftContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - BitwiseRightShiftContigHyperparameterSet::n_vecs; + constexpr std::uint8_t vec_sz = BitwiseRSHS::vec_sz; + constexpr std::uint8_t n_vecs = BitwiseRSHS::n_vecs; return elementwise_common::binary_inplace_contig_impl< argTy, resTy, BitwiseRightShiftInplaceContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp index 29032b016b..84d1c8ff64 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp @@ -163,7 +163,7 @@ template struct BitwiseXorOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -181,7 +181,7 @@ struct BitwiseXorContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template &depends = {}) { - constexpr std::uint8_t vec_sz = - 
BitwiseXorContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - BitwiseXorContigHyperparameterSet::n_vecs; + using BitwiseXorHS = + hyperparam_detail::BitwiseXorContigHyperparameterSet; + constexpr std::uint8_t vec_sz = BitwiseXorHS::vec_sz; + constexpr std::uint8_t n_vecs = BitwiseXorHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, BitwiseXorOutputType, BitwiseXorContigFunctor, @@ -390,10 +390,11 @@ bitwise_xor_inplace_contig_impl(sycl::queue &exec_q, ssize_t res_offset, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = - BitwiseXorContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - BitwiseXorContigHyperparameterSet::n_vecs; + using BitwiseXorHS = + hyperparam_detail::BitwiseXorContigHyperparameterSet; + + constexpr std::uint8_t vec_sz = BitwiseXorHS::vec_sz; + constexpr std::uint8_t n_vecs = BitwiseXorHS::n_vecs; return elementwise_common::binary_inplace_contig_impl< argTy, resTy, BitwiseXorInplaceContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp index 73261b1cee..d857e1563c 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp @@ -94,7 +94,7 @@ template struct CbrtOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -111,7 +111,7 @@ template struct CbrtContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class cbrt_contig_kernel; @@ -123,8 +123,9 @@ sycl::event cbrt_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = CbrtContigHyperparameterSet::vec_sz; - constexpr std::uint8_t 
n_vecs = CbrtContigHyperparameterSet::n_vecs; + using CbrtHS = hyperparam_detail::CbrtContigHyperparameterSet; + constexpr std::uint8_t vec_sz = CbrtHS::vec_sz; + constexpr std::uint8_t n_vecs = CbrtHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, CbrtOutputType, CbrtContigFunctor, cbrt_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp index e3fae8287c..1328df3f4b 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp @@ -115,7 +115,7 @@ template struct CeilOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -132,7 +132,7 @@ template struct CeilContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class ceil_contig_kernel; @@ -144,8 +144,9 @@ sycl::event ceil_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = CeilContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = CeilContigHyperparameterSet::n_vecs; + using CeilHS = hyperparam_detail::CeilContigHyperparameterSet; + constexpr std::uint8_t vec_sz = CeilHS::vec_sz; + constexpr std::uint8_t n_vecs = CeilHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, CeilOutputType, CeilContigFunctor, ceil_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp index 0fdd7e995f..19a95df5a1 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp @@ -122,7 
+122,7 @@ template struct ConjOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -139,7 +139,7 @@ template struct ConjContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class conj_contig_kernel; @@ -151,8 +151,9 @@ sycl::event conj_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = ConjContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = ConjContigHyperparameterSet::n_vecs; + using ConjHS = hyperparam_detail::ConjContigHyperparameterSet; + constexpr std::uint8_t vec_sz = ConjHS::vec_sz; + constexpr std::uint8_t n_vecs = ConjHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, ConjOutputType, ConjContigFunctor, conj_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp index 4cbcfdaa81..ecfbf99c28 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp @@ -120,7 +120,7 @@ template struct CopysignOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -138,7 +138,7 @@ struct CopysignContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template &depends = {}) { - constexpr std::uint8_t vec_sz = - CopysignContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - CopysignContigHyperparameterSet::n_vecs; + using CopySignHS = + hyperparam_detail::CopysignContigHyperparameterSet; + constexpr 
std::uint8_t vec_sz = CopySignHS::vec_sz; + constexpr std::uint8_t n_vecs = CopySignHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, CopysignOutputType, CopysignContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp index f80a4850d5..5940315c62 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp @@ -195,7 +195,7 @@ template struct CosOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -212,7 +212,7 @@ template struct CosContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class cos_contig_kernel; @@ -224,8 +224,9 @@ sycl::event cos_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = CosContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = CosContigHyperparameterSet::n_vecs; + using CosHS = hyperparam_detail::CosContigHyperparameterSet; + constexpr std::uint8_t vec_sz = CosHS::vec_sz; + constexpr std::uint8_t n_vecs = CosHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, CosOutputType, CosContigFunctor, cos_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp index 1c5887e1ef..59468428d1 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp @@ -184,7 +184,7 @@ template struct CoshOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { 
namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -201,7 +201,7 @@ template struct CoshContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // namespace hyperparam_detail template class cosh_contig_kernel; @@ -213,8 +213,9 @@ sycl::event cosh_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = CoshContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = CoshContigHyperparameterSet::n_vecs; + using CoshHS = hyperparam_detail::CoshContigHyperparameterSet; + constexpr std::uint8_t vec_sz = CoshHS::vec_sz; + constexpr std::uint8_t n_vecs = CoshHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, CoshOutputType, CoshContigFunctor, cosh_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp index c3e5ea04a8..a53f6412de 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp @@ -192,7 +192,7 @@ template struct EqualOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -209,7 +209,7 @@ template struct EqualContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template &depends = {}) { - constexpr std::uint8_t vec_sz = - EqualContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - EqualContigHyperparameterSet::n_vecs; + using EqualHS = + hyperparam_detail::EqualContigHyperparameterSet; + constexpr std::uint8_t vec_sz = EqualHS::vec_sz; + constexpr std::uint8_t n_vecs = EqualHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, EqualOutputType, 
EqualContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp index 6b8756d1bf..00f8213251 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp @@ -153,7 +153,7 @@ template struct ExpOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -170,7 +170,7 @@ template struct ExpContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class exp_contig_kernel; @@ -182,8 +182,9 @@ sycl::event exp_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = ExpContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = ExpContigHyperparameterSet::n_vecs; + using ExpHS = hyperparam_detail::ExpContigHyperparameterSet; + constexpr std::uint8_t vec_sz = ExpHS::vec_sz; + constexpr std::uint8_t n_vecs = ExpHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, ExpOutputType, ExpContigFunctor, exp_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp index c88e927525..22291101ca 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp @@ -155,7 +155,7 @@ template struct Exp2OutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -172,7 +172,7 @@ template struct Exp2ContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end 
of anonymous namespace +} // end of namespace hyperparam_detail template class exp2_contig_kernel; @@ -184,8 +184,10 @@ sycl::event exp2_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = Exp2ContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = Exp2ContigHyperparameterSet::n_vecs; + using Exp2HS = hyperparam_detail::Exp2ContigHyperparameterSet; + + constexpr std::uint8_t vec_sz = Exp2HS::vec_sz; + constexpr std::uint8_t n_vecs = Exp2HS::n_vecs; return elementwise_common::unary_contig_impl< argTy, Exp2OutputType, Exp2ContigFunctor, exp2_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp index be9aeb42c3..d1d64f4904 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp @@ -168,7 +168,7 @@ template struct Expm1OutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -185,7 +185,7 @@ template struct Expm1ContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class expm1_contig_kernel; @@ -197,8 +197,9 @@ sycl::event expm1_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = Expm1ContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = Expm1ContigHyperparameterSet::n_vecs; + using Expm1HS = hyperparam_detail::Expm1ContigHyperparameterSet; + constexpr std::uint8_t vec_sz = Expm1HS::vec_sz; + constexpr std::uint8_t n_vecs = Expm1HS::n_vecs; return elementwise_common::unary_contig_impl< argTy, Expm1OutputType, Expm1ContigFunctor, expm1_contig_kernel, vec_sz, diff --git 
a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp index cd0f4f698c..aaa81b77b9 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp @@ -115,7 +115,7 @@ template struct FloorOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -132,7 +132,7 @@ template struct FloorContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class floor_contig_kernel; @@ -144,8 +144,9 @@ sycl::event floor_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = FloorContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = FloorContigHyperparameterSet::n_vecs; + using FloorHS = hyperparam_detail::FloorContigHyperparameterSet; + constexpr std::uint8_t vec_sz = FloorHS::vec_sz; + constexpr std::uint8_t n_vecs = FloorHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, FloorOutputType, FloorContigFunctor, floor_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp index d574ef97e3..d290aa66a6 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp @@ -204,7 +204,7 @@ template struct FloorDivideOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -222,7 +222,7 @@ struct FloorDivideContigHyperparameterSet constexpr static auto 
n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template &depends = {}) { - constexpr std::uint8_t vec_sz = - FloorDivideContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - FloorDivideContigHyperparameterSet::n_vecs; + using FloorDivideHS = + hyperparam_detail::FloorDivideContigHyperparameterSet; + constexpr std::uint8_t vec_sz = FloorDivideHS::vec_sz; + constexpr std::uint8_t n_vecs = FloorDivideHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, FloorDivideOutputType, FloorDivideContigFunctor, @@ -469,10 +469,11 @@ floor_divide_inplace_contig_impl(sycl::queue &exec_q, ssize_t res_offset, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = - FloorDivideContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - FloorDivideContigHyperparameterSet::n_vecs; + using FloorDivideHS = + hyperparam_detail::FloorDivideContigHyperparameterSet; + + constexpr std::uint8_t vec_sz = FloorDivideHS::vec_sz; + constexpr std::uint8_t n_vecs = FloorDivideHS::n_vecs; return elementwise_common::binary_inplace_contig_impl< argTy, resTy, FloorDivideInplaceContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp index 42469c77b4..4d0b7fb94f 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp @@ -193,7 +193,7 @@ template struct GreaterOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -211,7 +211,7 @@ struct GreaterContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template &depends = {}) { - constexpr std::uint8_t vec_sz = 
- GreaterContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - GreaterContigHyperparameterSet::n_vecs; + using GreaterHS = + hyperparam_detail::GreaterContigHyperparameterSet; + + constexpr std::uint8_t vec_sz = GreaterHS::vec_sz; + constexpr std::uint8_t n_vecs = GreaterHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, GreaterOutputType, GreaterContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp index 0079f9b1ba..b149158ee0 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp @@ -194,7 +194,7 @@ template struct GreaterEqualOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -212,7 +212,7 @@ struct GreaterEqualContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template &depends = {}) { - constexpr std::uint8_t vec_sz = - GreaterEqualContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - GreaterEqualContigHyperparameterSet::n_vecs; + using GreaterEqHS = + hyperparam_detail::GreaterEqualContigHyperparameterSet; + constexpr std::uint8_t vec_sz = GreaterEqHS::vec_sz; + constexpr std::uint8_t n_vecs = GreaterEqHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, GreaterEqualOutputType, GreaterEqualContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp index 70f0667dd5..faa92c2f2d 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp +++ 
b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp @@ -122,7 +122,7 @@ template struct HypotOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -139,7 +139,7 @@ template struct HypotContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template &depends = {}) { - constexpr std::uint8_t vec_sz = - HypotContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - HypotContigHyperparameterSet::n_vecs; + using HypotHS = + hyperparam_detail::HypotContigHyperparameterSet; + constexpr std::uint8_t vec_sz = HypotHS::vec_sz; + constexpr std::uint8_t n_vecs = HypotHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, HypotOutputType, HypotContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp index 2435356e95..89adabff41 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp @@ -118,7 +118,7 @@ template struct ImagOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -135,7 +135,7 @@ template struct ImagContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class imag_contig_kernel; @@ -147,8 +147,9 @@ sycl::event imag_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = ImagContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = ImagContigHyperparameterSet::n_vecs; + using ImagHS = 
hyperparam_detail::ImagContigHyperparameterSet; + constexpr std::uint8_t vec_sz = ImagHS::vec_sz; + constexpr std::uint8_t n_vecs = ImagHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, ImagOutputType, ImagContigFunctor, imag_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp index 7f2d89d4e5..b0651a4d8b 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp @@ -120,7 +120,7 @@ template struct IsFiniteOutputType using value_type = bool; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -137,7 +137,7 @@ template struct IsFiniteContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class isfinite_contig_kernel; @@ -149,10 +149,10 @@ sycl::event isfinite_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = - IsFiniteContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - IsFiniteContigHyperparameterSet::n_vecs; + using IsFiniteHS = + hyperparam_detail::IsFiniteContigHyperparameterSet; + constexpr std::uint8_t vec_sz = IsFiniteHS::vec_sz; + constexpr std::uint8_t n_vecs = IsFiniteHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, IsFiniteOutputType, IsFiniteContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp index 9db84c2f40..ec78746143 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp @@ -120,7 +120,7 @@ template struct IsInfOutputType 
using value_type = bool; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -137,7 +137,7 @@ template struct IsInfContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // namespace hyperparam_detail template class isinf_contig_kernel; @@ -149,8 +149,9 @@ sycl::event isinf_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = IsInfContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = IsInfContigHyperparameterSet::n_vecs; + using IsInfHS = hyperparam_detail::IsInfContigHyperparameterSet; + constexpr std::uint8_t vec_sz = IsInfHS::vec_sz; + constexpr std::uint8_t n_vecs = IsInfHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, IsInfOutputType, IsInfContigFunctor, isinf_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp index 8dc3508bce..fbf6ef9383 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp @@ -118,7 +118,7 @@ template struct IsNanOutputType using value_type = bool; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -135,7 +135,7 @@ template struct IsNanContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class isnan_contig_kernel; @@ -147,8 +147,9 @@ sycl::event isnan_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = IsNanContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = IsNanContigHyperparameterSet::n_vecs; + using IsNanHS = hyperparam_detail::IsNanContigHyperparameterSet; + 
constexpr std::uint8_t vec_sz = IsNanHS::vec_sz; + constexpr std::uint8_t n_vecs = IsNanHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, IsNanOutputType, IsNanContigFunctor, isnan_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp index 6a56b292fd..523410a161 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp @@ -191,7 +191,7 @@ template struct LessOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -208,7 +208,7 @@ template struct LessContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template &depends = {}) { - constexpr std::uint8_t vec_sz = - LessContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - LessContigHyperparameterSet::n_vecs; + using LessHS = + hyperparam_detail::LessContigHyperparameterSet; + constexpr std::uint8_t vec_sz = LessHS::vec_sz; + constexpr std::uint8_t n_vecs = LessHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, LessOutputType, LessContigFunctor, less_contig_kernel, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp index 0125440fed..5827d350a3 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp @@ -192,7 +192,7 @@ template struct LessEqualOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ 
-210,7 +210,7 @@ struct LessEqualContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template &depends = {}) { - constexpr std::uint8_t vec_sz = - LessEqualContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - LessEqualContigHyperparameterSet::n_vecs; + using LessEqHS = + hyperparam_detail::LessEqualContigHyperparameterSet; + constexpr std::uint8_t vec_sz = LessEqHS::vec_sz; + constexpr std::uint8_t n_vecs = LessEqHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, LessEqualOutputType, LessEqualContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp index 92be7b3b6c..84471a5ef4 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp @@ -110,7 +110,7 @@ template struct LogOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -127,7 +127,7 @@ template struct LogContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class log_contig_kernel; @@ -139,8 +139,9 @@ sycl::event log_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = LogContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = LogContigHyperparameterSet::n_vecs; + using LogHS = hyperparam_detail::LogContigHyperparameterSet; + constexpr std::uint8_t vec_sz = LogHS::vec_sz; + constexpr std::uint8_t n_vecs = LogHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, LogOutputType, LogContigFunctor, log_contig_kernel, vec_sz, diff --git 
a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp index 61e9c2a48f..d308c85ac9 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp @@ -129,7 +129,7 @@ template struct Log10OutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -146,7 +146,7 @@ template struct Log10ContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class log10_contig_kernel; @@ -158,8 +158,9 @@ sycl::event log10_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = Log10ContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = Log10ContigHyperparameterSet::n_vecs; + using Log10HS = hyperparam_detail::Log10ContigHyperparameterSet; + constexpr std::uint8_t vec_sz = Log10HS::vec_sz; + constexpr std::uint8_t n_vecs = Log10HS::n_vecs; return elementwise_common::unary_contig_impl< argTy, Log10OutputType, Log10ContigFunctor, log10_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp index 70afe26c82..b8d993dd94 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp @@ -134,7 +134,7 @@ template struct Log1pOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -151,7 +151,7 @@ template struct Log1pContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; 
-} // end of anonymous namespace +} // end of namespace hyperparam_detail template class log1p_contig_kernel; @@ -163,8 +163,9 @@ sycl::event log1p_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = Log1pContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = Log1pContigHyperparameterSet::n_vecs; + using Log1pHS = hyperparam_detail::Log1pContigHyperparameterSet; + constexpr std::uint8_t vec_sz = Log1pHS::vec_sz; + constexpr std::uint8_t n_vecs = Log1pHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, Log1pOutputType, Log1pContigFunctor, log1p_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp index 0c399b54c1..42c837cfa3 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp @@ -130,7 +130,7 @@ template struct Log2OutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -147,7 +147,7 @@ template struct Log2ContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class log2_contig_kernel; @@ -159,8 +159,9 @@ sycl::event log2_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = Log2ContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = Log2ContigHyperparameterSet::n_vecs; + using Log2HS = hyperparam_detail::Log2ContigHyperparameterSet; + constexpr std::uint8_t vec_sz = Log2HS::vec_sz; + constexpr std::uint8_t n_vecs = Log2HS::n_vecs; return elementwise_common::unary_contig_impl< argTy, Log2OutputType, Log2ContigFunctor, log2_contig_kernel, vec_sz, diff --git 
a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp index 56de4b62e1..0076a0bbe3 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp @@ -137,7 +137,7 @@ template struct LogAddExpOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -155,7 +155,7 @@ struct LogAddExpContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template &depends = {}) { - constexpr std::uint8_t vec_sz = - LogAddExpContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - LogAddExpContigHyperparameterSet::n_vecs; + using LogAddExpHS = + hyperparam_detail::LogAddExpContigHyperparameterSet; + constexpr std::uint8_t vec_sz = LogAddExpHS::vec_sz; + constexpr std::uint8_t n_vecs = LogAddExpHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, LogAddExpOutputType, LogAddExpContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp index 257f1afd17..0aa1f61b90 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp @@ -162,7 +162,7 @@ template struct LogicalAndOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -180,7 +180,7 @@ struct LogicalAndContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace 
hyperparam_detail template &depends = {}) { - constexpr std::uint8_t vec_sz = - LogicalAndContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - LogicalAndContigHyperparameterSet::n_vecs; + using LogicalAndHS = + hyperparam_detail::LogicalAndContigHyperparameterSet; + constexpr std::uint8_t vec_sz = LogicalAndHS::vec_sz; + constexpr std::uint8_t n_vecs = LogicalAndHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, LogicalAndOutputType, LogicalAndContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp index ab8d1256d0..e6d3e6d5ad 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp @@ -93,7 +93,7 @@ template struct LogicalNotOutputType using value_type = bool; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -110,7 +110,7 @@ template struct LogicalNotContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class logical_not_contig_kernel; @@ -123,10 +123,10 @@ logical_not_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = - LogicalNotContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - LogicalNotContigHyperparameterSet::n_vecs; + using LogicalNotHS = + hyperparam_detail::LogicalNotContigHyperparameterSet; + constexpr std::uint8_t vec_sz = LogicalNotHS::vec_sz; + constexpr std::uint8_t n_vecs = LogicalNotHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, LogicalNotOutputType, LogicalNotContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp 
b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp index 1cd499f7f4..1fdcd84f60 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp @@ -161,7 +161,7 @@ template struct LogicalOrOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -179,7 +179,7 @@ struct LogicalOrContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template &depends = {}) { - constexpr std::uint8_t vec_sz = - LogicalOrContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - LogicalOrContigHyperparameterSet::n_vecs; + using LogicalOrHS = + hyperparam_detail::LogicalOrContigHyperparameterSet; + constexpr std::uint8_t vec_sz = LogicalOrHS::vec_sz; + constexpr std::uint8_t n_vecs = LogicalOrHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, LogicalOrOutputType, LogicalOrContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp index f7962181d3..0ef3b17dff 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp @@ -163,7 +163,7 @@ template struct LogicalXorOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -181,7 +181,7 @@ struct LogicalXorContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template &depends = {}) { - constexpr std::uint8_t vec_sz = - 
LogicalXorContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - LogicalXorContigHyperparameterSet::n_vecs; + using LogicalXorHS = + hyperparam_detail::LogicalXorContigHyperparameterSet; + constexpr std::uint8_t vec_sz = LogicalXorHS::vec_sz; + constexpr std::uint8_t n_vecs = LogicalXorHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, LogicalXorOutputType, LogicalXorContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp index 192dfb0d4d..799cbb1d8c 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp @@ -195,7 +195,7 @@ template struct MaximumOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -213,7 +213,7 @@ struct MaximumContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template &depends = {}) { - constexpr std::uint8_t vec_sz = - MaximumContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - MaximumContigHyperparameterSet::n_vecs; + using MaxHS = + hyperparam_detail::MaximumContigHyperparameterSet; + constexpr std::uint8_t vec_sz = MaxHS::vec_sz; + constexpr std::uint8_t n_vecs = MaxHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, MaximumOutputType, MaximumContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp index cc40f92800..9a672e539f 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp @@ -195,7 +195,7 @@ 
template struct MinimumOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -213,7 +213,7 @@ struct MinimumContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template &depends = {}) { - constexpr std::uint8_t vec_sz = - MinimumContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - MinimumContigHyperparameterSet::n_vecs; + using MinHS = + hyperparam_detail::MinimumContigHyperparameterSet; + constexpr std::uint8_t vec_sz = MinHS::vec_sz; + constexpr std::uint8_t n_vecs = MinHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, MinimumOutputType, MinimumContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp index 3b31800453..ca24383b44 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp @@ -187,7 +187,7 @@ template struct MultiplyOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -205,7 +205,7 @@ struct MultiplyContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template &depends = {}) { - constexpr std::uint8_t vec_sz = - MultiplyContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - MultiplyContigHyperparameterSet::n_vecs; + using MulHS = + hyperparam_detail::MultiplyContigHyperparameterSet; + constexpr std::uint8_t vec_sz = MulHS::vec_sz; + constexpr std::uint8_t n_vecs = MulHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, 
MultiplyOutputType, MultiplyContigFunctor, @@ -511,10 +511,10 @@ multiply_inplace_contig_impl(sycl::queue &exec_q, ssize_t res_offset, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = - MultiplyContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - MultiplyContigHyperparameterSet::n_vecs; + using MulHS = + hyperparam_detail::MultiplyContigHyperparameterSet; + constexpr std::uint8_t vec_sz = MulHS::vec_sz; + constexpr std::uint8_t n_vecs = MulHS::n_vecs; return elementwise_common::binary_inplace_contig_impl< argTy, resTy, MultiplyInplaceContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp index 28b9cca8f6..47707a5f04 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp @@ -100,7 +100,7 @@ template struct NegativeOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -117,7 +117,7 @@ template struct NegativeContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class negative_contig_kernel; @@ -129,10 +129,9 @@ sycl::event negative_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = - NegativeContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - NegativeContigHyperparameterSet::n_vecs; + using NegHS = hyperparam_detail::NegativeContigHyperparameterSet; + constexpr std::uint8_t vec_sz = NegHS::vec_sz; + constexpr std::uint8_t n_vecs = NegHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, NegativeOutputType, NegativeContigFunctor, diff --git 
a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp index 57ca03dca7..4a54a115f3 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp @@ -120,7 +120,7 @@ template struct NextafterOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -138,7 +138,7 @@ struct NextafterContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template &depends = {}) { - constexpr std::uint8_t vec_sz = - NextafterContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - NextafterContigHyperparameterSet::n_vecs; + using NextafterHS = + hyperparam_detail::NextafterContigHyperparameterSet; + constexpr std::uint8_t vec_sz = NextafterHS::vec_sz; + constexpr std::uint8_t n_vecs = NextafterHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, NextafterOutputType, NextafterContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp index 1e597a186e..f21bc678fd 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp @@ -176,7 +176,7 @@ template struct NotEqualOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -194,7 +194,7 @@ struct NotEqualContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template 
&depends = {}) { - constexpr std::uint8_t vec_sz = - NotEqualContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - NotEqualContigHyperparameterSet::n_vecs; + using NotEqHS = + hyperparam_detail::NotEqualContigHyperparameterSet; + constexpr std::uint8_t vec_sz = NotEqHS::vec_sz; + constexpr std::uint8_t n_vecs = NotEqHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, NotEqualOutputType, NotEqualContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp index c30beaf2e2..df6c04021f 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp @@ -115,7 +115,7 @@ template struct PositiveOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -132,7 +132,7 @@ template struct PositiveContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class positive_contig_kernel; @@ -144,10 +144,9 @@ sycl::event positive_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = - PositiveContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - PositiveContigHyperparameterSet::n_vecs; + using PosHS = hyperparam_detail::PositiveContigHyperparameterSet; + constexpr std::uint8_t vec_sz = PosHS::vec_sz; + constexpr std::uint8_t n_vecs = PosHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, PositiveOutputType, PositiveContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp index 82ffdb520b..d7b0ed909e 100644 --- 
a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp @@ -239,7 +239,7 @@ template struct PowOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -256,7 +256,7 @@ template struct PowContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template &depends = {}) { - constexpr std::uint8_t vec_sz = - PowContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - PowContigHyperparameterSet::n_vecs; + using PowHS = hyperparam_detail::PowContigHyperparameterSet; + constexpr std::uint8_t vec_sz = PowHS::vec_sz; + constexpr std::uint8_t n_vecs = PowHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, PowOutputType, PowContigFunctor, pow_contig_kernel, @@ -522,10 +521,9 @@ pow_inplace_contig_impl(sycl::queue &exec_q, ssize_t res_offset, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = - PowContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - PowContigHyperparameterSet::n_vecs; + using PowHS = hyperparam_detail::PowContigHyperparameterSet; + constexpr std::uint8_t vec_sz = PowHS::vec_sz; + constexpr std::uint8_t n_vecs = PowHS::n_vecs; return elementwise_common::binary_inplace_contig_impl< argTy, resTy, PowInplaceContigFunctor, pow_inplace_contig_kernel, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp index 4bca388ac2..df5edface1 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp @@ -119,7 +119,7 @@ template struct ProjOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace 
+namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -136,7 +136,7 @@ template struct ProjContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class proj_contig_kernel; @@ -148,8 +148,9 @@ sycl::event proj_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = ProjContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = ProjContigHyperparameterSet::n_vecs; + using ProjHS = hyperparam_detail::ProjContigHyperparameterSet; + constexpr std::uint8_t vec_sz = ProjHS::vec_sz; + constexpr std::uint8_t n_vecs = ProjHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, ProjOutputType, ProjContigFunctor, proj_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp index e29ba2933d..bb22352907 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp @@ -118,7 +118,7 @@ template struct RealOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -135,7 +135,7 @@ template struct RealContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class real_contig_kernel; @@ -147,8 +147,9 @@ sycl::event real_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = RealContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = RealContigHyperparameterSet::n_vecs; + using RealHS = hyperparam_detail::RealContigHyperparameterSet; + constexpr std::uint8_t vec_sz = 
RealHS::vec_sz; + constexpr std::uint8_t n_vecs = RealHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, RealOutputType, RealContigFunctor, real_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp index 69adfb1390..0e46acba39 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp @@ -115,7 +115,7 @@ template struct ReciprocalOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -132,7 +132,7 @@ template struct ReciprocalContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class reciprocal_contig_kernel; @@ -144,10 +144,9 @@ sycl::event reciprocal_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = - ReciprocalContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - ReciprocalContigHyperparameterSet::n_vecs; + using RecipHS = hyperparam_detail::ReciprocalContigHyperparameterSet; + constexpr std::uint8_t vec_sz = RecipHS::vec_sz; + constexpr std::uint8_t n_vecs = RecipHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, ReciprocalOutputType, ReciprocalContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp index 3332803402..6d3baf0d1f 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp @@ -222,7 +222,7 @@ template struct RemainderOutputType static constexpr bool 
is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -240,7 +240,7 @@ struct RemainderContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template &depends = {}) { - constexpr std::uint8_t vec_sz = - RemainderContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - RemainderContigHyperparameterSet::n_vecs; + using RemHS = + hyperparam_detail::RemainderContigHyperparameterSet; + constexpr std::uint8_t vec_sz = RemHS::vec_sz; + constexpr std::uint8_t n_vecs = RemHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, RemainderOutputType, RemainderContigFunctor, @@ -493,10 +493,10 @@ remainder_inplace_contig_impl(sycl::queue &exec_q, ssize_t res_offset, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = - RemainderContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - RemainderContigHyperparameterSet::n_vecs; + using RemHS = + hyperparam_detail::RemainderContigHyperparameterSet; + constexpr std::uint8_t vec_sz = RemHS::vec_sz; + constexpr std::uint8_t n_vecs = RemHS::n_vecs; return elementwise_common::binary_inplace_contig_impl< argTy, resTy, RemainderInplaceContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp index 2d7fb20a99..7fbb20ae32 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp @@ -126,7 +126,7 @@ template struct RoundOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -143,7 +143,7 @@ template struct RoundContigHyperparameterSet constexpr static auto n_vecs = 
value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class round_contig_kernel; @@ -155,8 +155,9 @@ sycl::event round_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = RoundContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = RoundContigHyperparameterSet::n_vecs; + using RoundHS = hyperparam_detail::RoundContigHyperparameterSet; + constexpr std::uint8_t vec_sz = RoundHS::vec_sz; + constexpr std::uint8_t n_vecs = RoundHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, RoundOutputType, RoundContigFunctor, round_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp index a64528e958..0450246807 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp @@ -97,7 +97,7 @@ template struct RsqrtOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -114,7 +114,7 @@ template struct RsqrtContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // namespace hyperparam_detail template class rsqrt_contig_kernel; @@ -126,8 +126,9 @@ sycl::event rsqrt_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = RsqrtContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = RsqrtContigHyperparameterSet::n_vecs; + using RsqrtHS = hyperparam_detail::RsqrtContigHyperparameterSet; + constexpr std::uint8_t vec_sz = RsqrtHS::vec_sz; + constexpr std::uint8_t n_vecs = RsqrtHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, RsqrtOutputType, RsqrtContigFunctor, 
rsqrt_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp index 2b1791f24c..baa224942f 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp @@ -138,7 +138,7 @@ template struct SignOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -155,7 +155,7 @@ template struct SignContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class sign_contig_kernel; @@ -167,8 +167,9 @@ sycl::event sign_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = SignContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = SignContigHyperparameterSet::n_vecs; + using SignHS = hyperparam_detail::SignContigHyperparameterSet; + constexpr std::uint8_t vec_sz = SignHS::vec_sz; + constexpr std::uint8_t n_vecs = SignHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, SignOutputType, SignContigFunctor, sign_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp index 3ea24ba761..9020bceb7e 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp @@ -104,7 +104,7 @@ template struct SignbitOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -121,7 +121,7 @@ template struct SignbitContigHyperparameterSet constexpr static 
auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class signbit_contig_kernel; @@ -133,10 +133,9 @@ sycl::event signbit_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = - SignbitContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - SignbitContigHyperparameterSet::n_vecs; + using SignbitHS = hyperparam_detail::SignbitContigHyperparameterSet; + constexpr std::uint8_t vec_sz = SignbitHS::vec_sz; + constexpr std::uint8_t n_vecs = SignbitHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, SignbitOutputType, SignbitContigFunctor, signbit_contig_kernel, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp index c869eefa9d..e075a90a88 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp @@ -217,7 +217,7 @@ template struct SinOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -234,7 +234,7 @@ template struct SinContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class sin_contig_kernel; @@ -246,8 +246,9 @@ sycl::event sin_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = SinContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = SinContigHyperparameterSet::n_vecs; + using SinHS = hyperparam_detail::SinContigHyperparameterSet; + constexpr std::uint8_t vec_sz = SinHS::vec_sz; + constexpr std::uint8_t n_vecs = SinHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, SinOutputType, SinContigFunctor, 
sin_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp index 929f7b6556..23b3588a3b 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp @@ -186,7 +186,7 @@ template struct SinhOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -203,7 +203,7 @@ template struct SinhContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class sinh_contig_kernel; @@ -215,8 +215,9 @@ sycl::event sinh_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = SinhContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = SinhContigHyperparameterSet::n_vecs; + using SinhHS = hyperparam_detail::SinhContigHyperparameterSet; + constexpr std::uint8_t vec_sz = SinhHS::vec_sz; + constexpr std::uint8_t n_vecs = SinhHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, SinhOutputType, SinhContigFunctor, sinh_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp index 1754a02b7f..b83ff72495 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp @@ -112,7 +112,7 @@ template struct SqrtOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -129,7 +129,7 @@ template struct SqrtContigHyperparameterSet constexpr static auto n_vecs = 
value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class sqrt_contig_kernel; @@ -141,8 +141,9 @@ sycl::event sqrt_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = SqrtContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = SqrtContigHyperparameterSet::n_vecs; + using SqrtHS = hyperparam_detail::SqrtContigHyperparameterSet; + constexpr std::uint8_t vec_sz = SqrtHS::vec_sz; + constexpr std::uint8_t n_vecs = SqrtHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, SqrtOutputType, SqrtContigFunctor, sqrt_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp index d5d3623707..f9d9d848c0 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp @@ -137,7 +137,7 @@ template struct SquareOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -154,7 +154,7 @@ template struct SquareContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class square_contig_kernel; @@ -166,10 +166,9 @@ sycl::event square_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = - SquareContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - SquareContigHyperparameterSet::n_vecs; + using SquareHS = hyperparam_detail::SquareContigHyperparameterSet; + constexpr std::uint8_t vec_sz = SquareHS::vec_sz; + constexpr std::uint8_t n_vecs = SquareHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, SquareOutputType, SquareContigFunctor, 
square_contig_kernel, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp index 3b9e105a29..51a3955142 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp @@ -173,7 +173,7 @@ template struct SubtractOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -191,7 +191,7 @@ struct SubtractContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template &depends = {}) { - constexpr std::uint8_t vec_sz = - SubtractContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - SubtractContigHyperparameterSet::n_vecs; + using SubHS = + hyperparam_detail::SubtractContigHyperparameterSet; + constexpr std::uint8_t vec_sz = SubHS::vec_sz; + constexpr std::uint8_t n_vecs = SubHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, SubtractOutputType, SubtractContigFunctor, @@ -509,10 +509,10 @@ subtract_inplace_contig_impl(sycl::queue &exec_q, ssize_t res_offset, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = - SubtractContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - SubtractContigHyperparameterSet::n_vecs; + using SubHS = + hyperparam_detail::SubtractContigHyperparameterSet; + constexpr std::uint8_t vec_sz = SubHS::vec_sz; + constexpr std::uint8_t n_vecs = SubHS::n_vecs; return elementwise_common::binary_inplace_contig_impl< argTy, resTy, SubtractInplaceContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp index 092c12e4f4..770518f918 100644 --- 
a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp @@ -161,7 +161,7 @@ template struct TanOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -178,7 +178,7 @@ template struct TanContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class tan_contig_kernel; @@ -190,8 +190,9 @@ sycl::event tan_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = TanContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = TanContigHyperparameterSet::n_vecs; + using TanHS = hyperparam_detail::TanContigHyperparameterSet; + constexpr std::uint8_t vec_sz = TanHS::vec_sz; + constexpr std::uint8_t n_vecs = TanHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, TanOutputType, TanContigFunctor, tan_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp index 884dc3bece..1d06fd3c4f 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp @@ -155,7 +155,7 @@ template struct TanhOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -172,7 +172,7 @@ template struct TanhContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class tanh_contig_kernel; @@ -184,8 +184,9 @@ sycl::event tanh_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = 
{}) { - constexpr std::uint8_t vec_sz = TanhContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = TanhContigHyperparameterSet::n_vecs; + using TanhHS = hyperparam_detail::TanhContigHyperparameterSet; + constexpr std::uint8_t vec_sz = TanhHS::vec_sz; + constexpr std::uint8_t n_vecs = TanhHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, TanhOutputType, TanhContigFunctor, tanh_contig_kernel, vec_sz, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp index 55065ef9ad..de6c9a8723 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp @@ -180,7 +180,7 @@ template struct TrueDivideOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -198,7 +198,7 @@ struct TrueDivideContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template &depends = {}) { - constexpr std::uint8_t vec_sz = - TrueDivideContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - TrueDivideContigHyperparameterSet::n_vecs; + using DivHS = + hyperparam_detail::TrueDivideContigHyperparameterSet; + constexpr std::uint8_t vec_sz = DivHS::vec_sz; + constexpr std::uint8_t n_vecs = DivHS::n_vecs; return elementwise_common::binary_contig_impl< argTy1, argTy2, TrueDivideOutputType, TrueDivideContigFunctor, @@ -538,10 +538,10 @@ true_divide_inplace_contig_impl(sycl::queue &exec_q, ssize_t res_offset, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = - TrueDivideContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = - TrueDivideContigHyperparameterSet::vec_sz; + using DivHS = + 
hyperparam_detail::TrueDivideContigHyperparameterSet; + constexpr std::uint8_t vec_sz = DivHS::vec_sz; + constexpr std::uint8_t n_vecs = DivHS::n_vecs; return elementwise_common::binary_inplace_contig_impl< argTy, resTy, TrueDivideInplaceContigFunctor, diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp index f7d2bbc596..008c5f59b1 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp @@ -112,7 +112,7 @@ template struct TruncOutputType static constexpr bool is_defined = !std::is_same_v; }; -namespace +namespace hyperparam_detail { namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; @@ -129,7 +129,7 @@ template struct TruncContigHyperparameterSet constexpr static auto n_vecs = value_type::n_vecs; }; -} // end of anonymous namespace +} // end of namespace hyperparam_detail template class trunc_contig_kernel; @@ -141,8 +141,9 @@ sycl::event trunc_contig_impl(sycl::queue &exec_q, char *res_p, const std::vector &depends = {}) { - constexpr std::uint8_t vec_sz = TruncContigHyperparameterSet::vec_sz; - constexpr std::uint8_t n_vecs = TruncContigHyperparameterSet::n_vecs; + using TruncHS = hyperparam_detail::TruncContigHyperparameterSet; + constexpr std::uint8_t vec_sz = TruncHS::vec_sz; + constexpr std::uint8_t n_vecs = TruncHS::n_vecs; return elementwise_common::unary_contig_impl< argTy, TruncOutputType, TruncContigFunctor, trunc_contig_kernel, vec_sz, From 87052e2a74a7265522e7216d14c1d0e0d0b67ac2 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 8 Jan 2025 10:00:57 -0600 Subject: [PATCH 02/11] Replace use of anonymous namespace in headers Using anonymous namespace in headers is against best C++ practices due to internal linkage of entities in that namespace. 
--- .../tensor/libtensor/include/kernels/accumulators.hpp | 8 ++++---- .../include/kernels/boolean_advanced_indexing.hpp | 10 +++++----- .../libtensor/include/utils/sycl_alloc_utils.hpp | 6 +++--- dpctl/tensor/libtensor/include/utils/sycl_utils.hpp | 9 +++++---- 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp index c5231136bf..e46e3ceddb 100644 --- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -90,7 +90,7 @@ template struct can_use_inclusive_scan_over_group static constexpr bool value = sycl::has_known_identity::value; }; -namespace +namespace detail { template class stack_t { @@ -141,7 +141,7 @@ template class stack_strided_t std::size_t get_local_stride() const { return local_stride_; } }; -} // end of anonymous namespace +} // end of namespace detail // Iterative cumulative summation @@ -627,7 +627,7 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q, throw std::bad_alloc(); } - std::vector> stack{}; + std::vector> stack{}; // inclusive scans over blocks n_groups_ = n_groups; @@ -867,7 +867,7 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q, throw std::bad_alloc(); } - std::vector> stack{}; + std::vector> stack{}; // inclusive scans over blocks acc_groups_ = acc_groups; diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp index 74c691d416..4384a36e4c 100644 --- a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp @@ -212,7 +212,7 @@ struct MaskedPlaceStridedFunctor // ======= Masked extraction ================================ -namespace +namespace detail { template @@ -234,7 +234,7 @@ std::size_t get_lws(std::size_t n) return _get_lws_impl(n); } -} // end 
of anonymous namespace +} // end of namespace detail template class masked_extract_all_slices_contig_impl_krn; @@ -278,7 +278,7 @@ sycl::event masked_extract_all_slices_contig_impl( const std::size_t masked_extent = iteration_size; - const std::size_t lws = get_lws(masked_extent); + const std::size_t lws = detail::get_lws(masked_extent); const std::size_t n_groups = (iteration_size + lws - 1) / lws; @@ -357,7 +357,7 @@ sycl::event masked_extract_all_slices_strided_impl( const std::size_t masked_nelems = iteration_size; - const std::size_t lws = get_lws(masked_nelems); + const std::size_t lws = detail::get_lws(masked_nelems); const std::size_t n_groups = (masked_nelems + lws - 1) / lws; @@ -452,7 +452,7 @@ sycl::event masked_extract_some_slices_strided_impl( const std::size_t masked_extent = masked_nelems; - const std::size_t lws = get_lws(masked_extent); + const std::size_t lws = detail::get_lws(masked_extent); const std::size_t n_groups = ((masked_extent + lws - 1) / lws); const std::size_t orthog_extent = static_cast(orthog_nelems); diff --git a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp index fa9f23015c..1dcbac2b1d 100644 --- a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp @@ -142,7 +142,7 @@ smart_malloc_host(std::size_t count, return smart_malloc(count, q, sycl::usm::alloc::host, propList); } -namespace +namespace detail { template struct valid_smart_ptr : public std::false_type { @@ -172,7 +172,7 @@ struct all_valid_smart_ptrs static constexpr bool value = valid_smart_ptr::value && (all_valid_smart_ptrs::value); }; -} // namespace +} // end of namespace detail template sycl::event async_smart_free(sycl::queue &exec_q, @@ -184,7 +184,7 @@ sycl::event async_smart_free(sycl::queue &exec_q, n > 0, "async_smart_free requires at least one smart pointer argument"); static_assert( - all_valid_smart_ptrs::value, + 
detail::all_valid_smart_ptrs::value, "async_smart_free requires unique_ptr created with smart_malloc"); std::vector ptrs; diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp index cc229c9a88..ece8852643 100644 --- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp @@ -132,7 +132,7 @@ std::size_t choose_workgroup_size(const std::size_t nelems, return wg; } -namespace +namespace detail { template @@ -158,7 +158,7 @@ void _fold(LocAccT &local_mem_acc, } } -} // namespace +} // end of namespace detail template T custom_reduce_over_group(const GroupT &wg, @@ -184,7 +184,8 @@ T custom_reduce_over_group(const GroupT &wg, for (std::uint32_t sz = high_sz; sz >= low_sz; sz >>= 1) { if (n_witems >= sz) { const std::uint32_t n_witems_ = (n_witems + 1) >> 1; - _fold(local_mem_acc, lid, n_witems - n_witems_, n_witems_, op); + detail::_fold(local_mem_acc, lid, n_witems - n_witems_, + n_witems_, op); sycl::group_barrier(wg, sycl::memory_scope::work_group); n_witems = n_witems_; } @@ -196,7 +197,7 @@ T custom_reduce_over_group(const GroupT &wg, for (std::uint32_t sz = high_sz; sz >= low_sz; sz >>= 1) { if (n_witems >= sz) { n_witems >>= 1; - _fold(local_mem_acc, lid, n_witems, op); + detail::_fold(local_mem_acc, lid, n_witems, op); sycl::group_barrier(wg, sycl::memory_scope::work_group); } } From 869faef6964bec1a5414e23e55616923fffa476f Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 8 Jan 2025 11:06:31 -0600 Subject: [PATCH 03/11] Reduced number of created iota and map_back kernels Avoid using comparator type to form kernel name types for iota and map_back kernels (as they do not depend on the comparator). This reduces the number of kernels generated during instantiation of template implementation functions. 
--- .../include/kernels/sorting/merge_sort.hpp | 11 ++-- .../include/kernels/sorting/radix_sort.hpp | 4 +- .../include/kernels/sorting/topk.hpp | 50 ++++++------------- .../tensor/libtensor/source/sorting/topk.cpp | 34 +++---------- 4 files changed, 29 insertions(+), 70 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp index 577b319f11..cf4e8ebb9a 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp @@ -211,8 +211,6 @@ void merge_impl(const std::size_t offset, } } -namespace -{ template void insertion_sort_impl(Iter first, const std::size_t begin, @@ -259,7 +257,6 @@ void leaf_sort_impl(Iter first, return insertion_sort_impl( std::move(first), std::move(begin), std::move(end), std::move(comp)); } -} // namespace template struct GetValueType { @@ -768,9 +765,9 @@ sycl::event stable_sort_axis1_contig_impl( } } -template class populate_index_data_krn; +template class populate_index_data_krn; -template class index_map_to_rows_krn; +template class index_map_to_rows_krn; template struct IndexComp { @@ -820,7 +817,7 @@ sycl::event stable_argsort_axis1_contig_impl( using dpctl::tensor::kernels::sort_utils_detail::iota_impl; - using IotaKernelName = populate_index_data_krn; + using IotaKernelName = populate_index_data_krn; sycl::event populate_indexed_data_ev = iota_impl( exec_q, res_tp, total_nelems, depends); @@ -838,7 +835,7 @@ sycl::event stable_argsort_axis1_contig_impl( exec_q, iter_nelems, sort_nelems, res_tp, index_comp, sorted_block_size, {base_sort_ev}); - using MapBackKernelName = index_map_to_rows_krn; + using MapBackKernelName = index_map_to_rows_krn; using dpctl::tensor::kernels::sort_utils_detail::map_back_impl; sycl::event write_out_ev = map_back_impl( diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp 
b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp index 8e54e0a0c8..a21fa9792b 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp @@ -1759,6 +1759,8 @@ template struct ValueProj template struct IndexedProj { + IndexedProj(const ValueT *arg_ptr) : ptr(arg_ptr), value_projector{} {} + IndexedProj(const ValueT *arg_ptr, const ProjT &proj_op) : ptr(arg_ptr), value_projector(proj_op) { @@ -1848,7 +1850,7 @@ radix_argsort_axis1_contig_impl(sycl::queue &exec_q, using IdentityProjT = radix_sort_details::IdentityProj; using IndexedProjT = radix_sort_details::IndexedProj; - const IndexedProjT proj_op{arg_tp, IdentityProjT{}}; + const IndexedProjT proj_op{arg_tp}; using IotaKernelName = radix_argsort_iota_krn; diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp index dd98e6e20d..fc4e08064d 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -32,6 +32,7 @@ #include #include +#include #include #include "kernels/dpctl_tensor_types.hpp" @@ -40,7 +41,6 @@ #include "kernels/sorting/search_sorted_detail.hpp" #include "kernels/sorting/sort_utils.hpp" #include "utils/sycl_alloc_utils.hpp" -#include namespace dpctl { @@ -134,11 +134,9 @@ sycl::event write_out_impl(sycl::queue &exec_q, } // namespace topk_detail -template -class topk_populate_index_data_krn; +template class topk_populate_index_data_krn; -template -class topk_full_merge_map_back_krn; +template class topk_full_merge_map_back_krn; template sycl::event @@ -158,7 +156,7 @@ topk_full_merge_sort_impl(sycl::queue &exec_q, // extract USM pointer IndexTy *index_data = index_data_owner.get(); - using IotaKernelName = topk_populate_index_data_krn; + using IotaKernelName = topk_populate_index_data_krn; using dpctl::tensor::kernels::sort_utils_detail::iota_impl; @@ -179,8 
+177,7 @@ topk_full_merge_sort_impl(sycl::queue &exec_q, exec_q, iter_nelems, axis_nelems, index_data, comp, sorted_block_size, {base_sort_ev}); - using WriteOutKernelName = - topk_full_merge_map_back_krn; + using WriteOutKernelName = topk_full_merge_map_back_krn; sycl::event write_out_ev = topk_detail::write_out_impl( @@ -194,8 +191,7 @@ topk_full_merge_sort_impl(sycl::queue &exec_q, return cleanup_host_task_event; }; -template -class topk_partial_merge_map_back_krn; +template class topk_partial_merge_map_back_krn; template class topk_over_work_group_krn; @@ -213,24 +209,15 @@ sycl::event topk_merge_impl( const char *arg_cp, char *vals_cp, char *inds_cp, - dpctl::tensor::ssize_t iter_arg_offset, - dpctl::tensor::ssize_t iter_vals_offset, - dpctl::tensor::ssize_t iter_inds_offset, - dpctl::tensor::ssize_t axis_arg_offset, - dpctl::tensor::ssize_t axis_vals_offset, - dpctl::tensor::ssize_t axis_inds_offset, const std::vector &depends) { if (axis_nelems < k) { throw std::runtime_error("Invalid sort axis size for value of k"); } - const argTy *arg_tp = reinterpret_cast(arg_cp) + - iter_arg_offset + axis_arg_offset; - argTy *vals_tp = reinterpret_cast(vals_cp) + iter_vals_offset + - axis_vals_offset; - IndexTy *inds_tp = reinterpret_cast(inds_cp) + iter_inds_offset + - axis_inds_offset; + const argTy *arg_tp = reinterpret_cast(arg_cp); + argTy *vals_tp = reinterpret_cast(vals_cp); + IndexTy *inds_tp = reinterpret_cast(inds_cp); using dpctl::tensor::kernels::IndexComp; const IndexComp index_comp{arg_tp, ValueComp{}}; @@ -434,7 +421,7 @@ sycl::event topk_merge_impl( // Write out top k of the merge-sorted memory using WriteOutKernelName = - topk_partial_merge_map_back_krn; + topk_partial_merge_map_back_krn; sycl::event write_topk_ev = topk_detail::write_out_impl( @@ -462,24 +449,15 @@ sycl::event topk_radix_impl(sycl::queue &exec_q, const char *arg_cp, char *vals_cp, char *inds_cp, - dpctl::tensor::ssize_t iter_arg_offset, - dpctl::tensor::ssize_t iter_vals_offset, - 
dpctl::tensor::ssize_t iter_inds_offset, - dpctl::tensor::ssize_t axis_arg_offset, - dpctl::tensor::ssize_t axis_vals_offset, - dpctl::tensor::ssize_t axis_inds_offset, const std::vector &depends) { if (axis_nelems < k) { throw std::runtime_error("Invalid sort axis size for value of k"); } - const argTy *arg_tp = reinterpret_cast(arg_cp) + - iter_arg_offset + axis_arg_offset; - argTy *vals_tp = reinterpret_cast(vals_cp) + iter_vals_offset + - axis_vals_offset; - IndexTy *inds_tp = reinterpret_cast(inds_cp) + iter_inds_offset + - axis_inds_offset; + const argTy *arg_tp = reinterpret_cast(arg_cp); + argTy *vals_tp = reinterpret_cast(vals_cp); + IndexTy *inds_tp = reinterpret_cast(inds_cp); const std::size_t total_nelems = iter_nelems * axis_nelems; const std::size_t padded_total_nelems = ((total_nelems + 63) / 64) * 64; @@ -494,7 +472,7 @@ sycl::event topk_radix_impl(sycl::queue &exec_q, using IdentityProjT = radix_sort_details::IdentityProj; using IndexedProjT = radix_sort_details::IndexedProj; - const IndexedProjT proj_op{arg_tp, IdentityProjT{}}; + const IndexedProjT proj_op{arg_tp}; using IotaKernelName = topk_iota_krn; diff --git a/dpctl/tensor/libtensor/source/sorting/topk.cpp b/dpctl/tensor/libtensor/source/sorting/topk.cpp index cd7212c05f..84108fd6cd 100644 --- a/dpctl/tensor/libtensor/source/sorting/topk.cpp +++ b/dpctl/tensor/libtensor/source/sorting/topk.cpp @@ -46,8 +46,6 @@ #include "rich_comparisons.hpp" #include "topk.hpp" -namespace td_ns = dpctl::tensor::type_dispatch; - namespace dpctl { namespace tensor @@ -55,6 +53,8 @@ namespace tensor namespace py_internal { +namespace td_ns = dpctl::tensor::type_dispatch; + typedef sycl::event (*topk_impl_fn_ptr_t)(sycl::queue &, std::size_t, std::size_t, @@ -63,12 +63,6 @@ typedef sycl::event (*topk_impl_fn_ptr_t)(sycl::queue &, const char *, char *, char *, - py::ssize_t, - py::ssize_t, - py::ssize_t, - py::ssize_t, - py::ssize_t, - py::ssize_t, const std::vector &); static topk_impl_fn_ptr_t 
topk_dispatch_vector[td_ns::num_types]; @@ -102,21 +96,14 @@ sycl::event topk_caller(sycl::queue &exec_q, const char *arg_cp, char *vals_cp, char *inds_cp, - py::ssize_t iter_arg_offset, - py::ssize_t iter_vals_offset, - py::ssize_t iter_inds_offset, - py::ssize_t axis_arg_offset, - py::ssize_t axis_vals_offset, - py::ssize_t axis_inds_offset, const std::vector &depends) { if constexpr (use_radix_sort::value) { using dpctl::tensor::kernels::topk_radix_impl; auto ascending = !largest; - return topk_radix_impl( - exec_q, iter_nelems, axis_nelems, k, ascending, arg_cp, vals_cp, - inds_cp, iter_arg_offset, iter_vals_offset, iter_inds_offset, - axis_arg_offset, axis_vals_offset, axis_inds_offset, depends); + return topk_radix_impl(exec_q, iter_nelems, axis_nelems, + k, ascending, arg_cp, vals_cp, + inds_cp, depends); } else { using dpctl::tensor::kernels::topk_merge_impl; @@ -126,16 +113,14 @@ sycl::event topk_caller(sycl::queue &exec_q, argTy>::type; return topk_merge_impl( exec_q, iter_nelems, axis_nelems, k, arg_cp, vals_cp, inds_cp, - iter_arg_offset, iter_vals_offset, iter_inds_offset, - axis_arg_offset, axis_vals_offset, axis_inds_offset, depends); + depends); } else { using CompTy = typename dpctl::tensor::py_internal::AscendingSorter< argTy>::type; return topk_merge_impl( exec_q, iter_nelems, axis_nelems, k, arg_cp, vals_cp, inds_cp, - iter_arg_offset, iter_vals_offset, iter_inds_offset, - axis_arg_offset, axis_vals_offset, axis_inds_offset, depends); + depends); } } } @@ -268,14 +253,11 @@ py_topk(const dpctl::tensor::usm_ndarray &src, bool is_inds_c_contig = inds.is_c_contiguous(); if (is_src_c_contig && is_vals_c_contig && is_inds_c_contig) { - static constexpr py::ssize_t zero_offset = py::ssize_t(0); - auto fn = topk_dispatch_vector[src_typeid]; sycl::event comp_ev = fn(exec_q, iter_nelems, axis_nelems, k, largest, src.get_data(), - vals.get_data(), inds.get_data(), zero_offset, zero_offset, - zero_offset, zero_offset, zero_offset, zero_offset, depends); + 
vals.get_data(), inds.get_data(), depends); sycl::event keep_args_alive_ev = dpctl::utils::keep_args_alive(exec_q, {src, vals, inds}, {comp_ev}); From 80f288ccafe8de960e07effe19cdef64df5784b9 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 8 Jan 2025 16:22:02 -0600 Subject: [PATCH 04/11] Move vectors ptrs and dels into host_task lambda Since vectors `ptrs` and `dels` are no longer needed after host_task submission, we might as well avoid the copying and use std::move in lambda capture initialization. Also renamed `Args` template pack to `UniquePtrTs`, and `args` function parameter pack to `unique_pointers`. Added comments next to each include to note the entity which requires it. --- .../include/utils/sycl_alloc_utils.hpp | 30 ++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp index 1dcbac2b1d..1e6706e743 100644 --- a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp @@ -26,12 +26,13 @@ #pragma once -#include -#include -#include -#include -#include -#include +#include // for std::size_t +#include // for std::exception +#include // for std::cerr +#include // for std::unique_ptr +#include // for std::runtime_error +#include // for std::true_type, std::false_type +#include // for std::move #include #include "sycl/sycl.hpp" @@ -174,37 +175,38 @@ struct all_valid_smart_ptrs }; } // end of namespace detail -template +/*! 
@brief Submit host_task and transfer ownership from smart pointers to it */ +template sycl::event async_smart_free(sycl::queue &exec_q, const std::vector &depends, - Args &&...args) + UniquePtrTs &&...unique_pointers) { - constexpr std::size_t n = sizeof...(Args); + constexpr std::size_t n = sizeof...(UniquePtrTs); static_assert( n > 0, "async_smart_free requires at least one smart pointer argument"); static_assert( - detail::all_valid_smart_ptrs::value, + detail::all_valid_smart_ptrs::value, "async_smart_free requires unique_ptr created with smart_malloc"); std::vector ptrs; ptrs.reserve(n); - (ptrs.push_back(reinterpret_cast(args.get())), ...); + (ptrs.push_back(reinterpret_cast(unique_pointers.get())), ...); std::vector dels; dels.reserve(n); - (dels.push_back(args.get_deleter()), ...); + (dels.emplace_back(unique_pointers.get_deleter()), ...); sycl::event ht_e = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); - cgh.host_task([ptrs, dels]() { + cgh.host_task([ptrs = std::move(ptrs), dels = std::move(dels)]() { for (std::size_t i = 0; i < ptrs.size(); ++i) { dels[i](ptrs[i]); } }); }); - (args.release(), ...); + (unique_pointers.release(), ...); return ht_e; } From 08cfede8096c66c271f0ddf3b5e963f870f50d1d Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 8 Jan 2025 16:25:32 -0600 Subject: [PATCH 05/11] Moved local update kernels to separate functions which take fewer template params Removed unnecessary template parameters from kernel names submitted by these functions. As a consequence, the size of `_tensor_accumulation_impl` shared object reduced from 49'360'152 bytes to 36'422'888, that is, by almost 13MB. 
--- .../include/kernels/accumulators.hpp | 467 ++++++++---------- 1 file changed, 216 insertions(+), 251 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp index e46e3ceddb..18e8a9c74c 100644 --- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include @@ -171,15 +170,6 @@ template class inclusive_scan_iter_local_scan_striped_krn; -template -class inclusive_scan_iter_chunk_update_krn; - template +class inclusive_scan_1d_iter_chunk_update_krn; + +template -class inclusive_scan_1d_iter_chunk_update_krn; + typename ScanOpT> +sycl::event update_local_chunks_1d(sycl::queue &exec_q, + outputT *src, + std::size_t src_size, + const outputT *local_scans, + std::size_t chunk_size, + const sycl::event &dependent_event) +{ + const auto &ctx = exec_q.get_context(); + const auto &dev = exec_q.get_device(); + + const auto &kernel_id = sycl::get_kernel_id(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + // output[ chunk_size * (i + 1) + j] += temp[i] + sycl::event update_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_event); + cgh.use_kernel_bundle(kb); + + constexpr nwiT updates_per_wi = n_wi; + const std::size_t n_items = + ceiling_quotient(src_size, sg_size * n_wi) * sg_size; + + sycl::range<1> gRange{n_items}; + sycl::range<1> lRange{sg_size}; + sycl::nd_range<1> ndRange{gRange, lRange}; + + cgh.parallel_for( + ndRange, + [chunk_size, src, src_size, local_scans](sycl::nd_item<1> ndit) { + constexpr ScanOpT scan_op{}; + constexpr outputT identity = + su_ns::Identity::value; + + const std::uint32_t lws = ndit.get_local_range(0); + const 
std::size_t block_offset = ndit.get_group(0) * n_wi * lws; +#pragma unroll + for (std::size_t i = 0; i < updates_per_wi; ++i) { + const std::size_t src_id = + block_offset + ndit.get_local_id(0) + i * lws; + if (src_id < src_size) { + const std::size_t scan_id = (src_id / chunk_size); + const outputT modifier = + (scan_id > 0) ? local_scans[scan_id - 1] : identity; + src[src_id] = scan_op(src[src_id], modifier); + } + } + }); + }); + + return update_event; +} /* * output[j] = sum( input[s0 + i * s1], 0 <= i <= j) @@ -621,11 +665,10 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q, } // allocate - outputT *temp = sycl::malloc_device(temp_size, exec_q); - - if (!temp) { - throw std::bad_alloc(); - } + auto temp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(temp_size, + exec_q); + outputT *temp = temp_owner.get(); std::vector> stack{}; @@ -663,72 +706,21 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q, const auto &stack_elem = stack[stack_id]; outputT *src = stack_elem.get_src_ptr(); const std::size_t src_size = stack_elem.get_size(); - outputT *local_scans = stack_elem.get_local_scans_ptr(); + const outputT *local_scans = stack_elem.get_local_scans_ptr(); using UpdateKernelName = - class inclusive_scan_1d_iter_chunk_update_krn< - inputT, outputT, n_wi, IndexerT, TransformerT, - NoOpTransformerT, ScanOpT, include_initial>; - - const auto &kernel_id = sycl::get_kernel_id(); - - auto const &ctx = exec_q.get_context(); - auto const &dev = exec_q.get_device(); - auto kb = sycl::get_kernel_bundle( - ctx, {dev}, {kernel_id}); - - auto krn = kb.get_kernel(kernel_id); - - const std::uint32_t sg_size = krn.template get_info< - sycl::info::kernel_device_specific::max_sub_group_size>(dev); - - // output[ chunk_size * (i + 1) + j] += temp[i] - dependent_event = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(dependent_event); - cgh.use_kernel_bundle(kb); - - constexpr nwiT updates_per_wi = n_wi; - const std::size_t n_items = - 
ceiling_quotient(src_size, sg_size * n_wi) * - sg_size; - - sycl::range<1> gRange{n_items}; - sycl::range<1> lRange{sg_size}; - sycl::nd_range<1> ndRange{gRange, lRange}; - - cgh.parallel_for( - ndRange, [chunk_size, src, src_size, - local_scans](sycl::nd_item<1> ndit) { - constexpr ScanOpT scan_op{}; - constexpr outputT identity = - su_ns::Identity::value; - - const std::uint32_t lws = ndit.get_local_range(0); - const std::size_t block_offset = - ndit.get_group(0) * n_wi * lws; -#pragma unroll - for (std::size_t i = 0; i < updates_per_wi; ++i) { - const std::size_t src_id = - block_offset + ndit.get_local_id(0) + i * lws; - if (src_id < src_size) { - const std::size_t scan_id = - (src_id / chunk_size); - const outputT modifier = - (scan_id > 0) ? local_scans[scan_id - 1] - : identity; - src[src_id] = scan_op(src[src_id], modifier); - } - } - }); - }); + class inclusive_scan_1d_iter_chunk_update_krn; + + dependent_event = update_local_chunks_1d( + exec_q, src, src_size, local_scans, chunk_size, + dependent_event); } - sycl::event free_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(dependent_event); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, temp]() { sycl_free_noexcept(temp, ctx); }); - }); + sycl::event free_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {dependent_event}, temp_owner); + host_tasks.push_back(free_ev); } @@ -792,17 +784,120 @@ accumulate_1d_contig_impl(sycl::queue &q, return comp_ev; } -template + typename ScanOpT> class inclusive_scan_final_chunk_update_krn; +template +sycl::event final_update_local_chunks(sycl::queue &exec_q, + std::size_t iter_nelems, + outputT *src, + std::size_t src_size, + const outputT *local_scans, + std::size_t chunk_size, + std::size_t local_stride, + const OutIterIndexerT &out_iter_indexer, + const OutIndexerT &out_indexer, + sycl::event dependent_event) +{ + const auto &kernel_id = sycl::get_kernel_id(); + + auto 
const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + constexpr nwiT updates_per_wi = n_wi; + const std::size_t updates_per_sg = sg_size * updates_per_wi; + const std::size_t update_nelems = + ceiling_quotient(src_size, updates_per_sg) * sg_size; + + sycl::range<2> gRange{iter_nelems, update_nelems}; + sycl::range<2> lRange{1, sg_size}; + + sycl::nd_range<2> ndRange{gRange, lRange}; + + sycl::event update_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_event); + + cgh.parallel_for( + ndRange, [chunk_size, src_size, local_stride, src, local_scans, + out_iter_indexer, out_indexer](sycl::nd_item<2> ndit) { + constexpr ScanOpT scan_op{}; + constexpr outputT identity = + su_ns::Identity::value; + + const std::uint32_t lws = ndit.get_local_range(1); + + const std::size_t iter_gid = ndit.get_group(0); + + const std::size_t src_axis_id0 = + ndit.get_group(1) * updates_per_wi * lws + + ndit.get_local_id(1); + const std::size_t src_iter_id = out_iter_indexer(iter_gid); +#pragma unroll + for (nwiT i = 0; i < updates_per_wi; ++i) { + const std::size_t src_axis_id = src_axis_id0 + i * lws; + const std::size_t src_id = + out_indexer(src_axis_id) + src_iter_id; + + if (src_axis_id < src_size) { + const std::size_t scan_axis_id = + src_axis_id / chunk_size; + const std::size_t scan_id = + scan_axis_id + iter_gid * local_stride; + + const outputT modifier = (scan_axis_id > 0) + ? 
local_scans[scan_id - 1] + : identity; + + src[src_id] = scan_op(src[src_id], modifier); + } + } + }); + }); + + return update_event; +} + +template +class inclusive_scan_iter_chunk_update_krn; + +template +sycl::event update_local_chunks(sycl::queue &exec_q, + std::size_t iter_nelems, + outputT *src, + std::size_t src_size, + const outputT *local_scans, + std::size_t chunk_size, + std::size_t local_stride, + sycl::event dependent_event) +{ + constexpr NoOpIndexer out_indexer{}; + constexpr NoOpIndexer iter_out_indexer{}; + + return final_update_local_chunks( + exec_q, iter_nelems, src, src_size, local_scans, chunk_size, + local_stride, iter_out_indexer, out_indexer, dependent_event); +} + template (iter_nelems * temp_size, exec_q); - - if (!temp) { - throw std::bad_alloc(); - } + auto temp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * temp_size, exec_q); + outputT *temp = temp_owner.get(); std::vector> stack{}; @@ -951,76 +1044,14 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q, outputT *local_scans = stack_elem.get_local_scans_ptr(); std::size_t local_stride = stack_elem.get_local_stride(); - using UpdateKernelName = class inclusive_scan_iter_chunk_update_krn< - inputT, outputT, n_wi, TransformerT, NoOpTransformerT, ScanOpT, - include_initial>; - - const auto &kernel_id = sycl::get_kernel_id(); - - auto const &ctx = exec_q.get_context(); - auto const &dev = exec_q.get_device(); - auto kb = sycl::get_kernel_bundle( - ctx, {dev}, {kernel_id}); - - auto krn = kb.get_kernel(kernel_id); - - const std::uint32_t sg_size = krn.template get_info< - sycl::info::kernel_device_specific::max_sub_group_size>(dev); - - constexpr nwiT updates_per_wi = n_wi; - const std::size_t update_nelems = - ceiling_quotient(src_size, - sg_size * updates_per_wi) * - sg_size; - - dependent_event = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(dependent_event); - cgh.use_kernel_bundle(kb); - - sycl::range<2> gRange{iter_nelems, update_nelems}; - 
sycl::range<2> lRange{1, sg_size}; - - sycl::nd_range<2> ndRange{gRange, lRange}; - - cgh.parallel_for( - ndRange, [chunk_size, src_size, local_stride, src, - local_scans](sycl::nd_item<2> ndit) { - constexpr ScanOpT scan_op{}; - constexpr outputT identity = - su_ns::Identity::value; - - const std::size_t iter_gid = ndit.get_group(0); - const std::size_t axis_gr_id = ndit.get_group(1); - - const std::uint32_t lws = ndit.get_local_range(0); + using UpdateKernelName = + class inclusive_scan_iter_chunk_update_krn; - const std::size_t src_axis_id0 = - axis_gr_id * updates_per_wi * lws; - const std::size_t src_iter_id = iter_gid * src_size; - const std::size_t scan_id0 = iter_gid * local_stride; -#pragma unroll - for (nwiT i = 0; i < updates_per_wi; ++i) { - const std::size_t src_axis_id = - src_axis_id0 + ndit.get_local_id(0) + i * lws; - - if (src_axis_id < src_size) { - const std::size_t scan_axis_id = - src_axis_id / chunk_size; - const std::size_t scan_id = - scan_axis_id + scan_id0; - - const outputT modifier = - (scan_axis_id > 0) - ? 
local_scans[scan_id - 1] - : identity; - - const std::size_t src_id = - src_axis_id + src_iter_id; - src[src_id] = scan_op(src[src_id], modifier); - } - } - }); - }); + dependent_event = + update_local_chunks( + exec_q, iter_nelems, src, src_size, local_scans, chunk_size, + local_stride, dependent_event); } // last stack element is always directly to output @@ -1033,83 +1064,19 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q, using UpdateKernelName = class inclusive_scan_final_chunk_update_krn< - inputT, outputT, n_wi, OutIterIndexerT, OutIndexerT, - TransformerT, NoOpTransformerT, ScanOpT, include_initial>; - - const auto &kernel_id = sycl::get_kernel_id(); - - auto const &ctx = exec_q.get_context(); - auto const &dev = exec_q.get_device(); - auto kb = sycl::get_kernel_bundle( - ctx, {dev}, {kernel_id}); - - auto krn = kb.get_kernel(kernel_id); - - const std::uint32_t sg_size = krn.template get_info< - sycl::info::kernel_device_specific::max_sub_group_size>(dev); - - constexpr nwiT updates_per_wi = n_wi; - const std::size_t update_nelems = - ceiling_quotient(src_size, - sg_size * updates_per_wi) * - sg_size; + outputT, n_wi, OutIterIndexerT, OutIndexerT, ScanOpT>; - sycl::range<2> gRange{iter_nelems, update_nelems}; - sycl::range<2> lRange{1, sg_size}; - - sycl::nd_range<2> ndRange{gRange, lRange}; - - dependent_event = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(dependent_event); - - cgh.parallel_for( - ndRange, - [chunk_size, src_size, local_stride, src, local_scans, - out_iter_indexer, out_indexer](sycl::nd_item<2> ndit) { - constexpr ScanOpT scan_op{}; - constexpr outputT identity = - su_ns::Identity::value; - - const std::uint32_t lws = ndit.get_local_range(1); - - const std::size_t iter_gid = ndit.get_group(0); - - const std::size_t src_axis_id0 = - ndit.get_group(1) * updates_per_wi * lws + - ndit.get_local_id(1); - const std::size_t src_iter_id = - out_iter_indexer(iter_gid); -#pragma unroll - for (nwiT i = 0; i < updates_per_wi; ++i) { 
- const std::size_t src_axis_id = - src_axis_id0 + i * lws; - const std::size_t src_id = - out_indexer(src_axis_id) + src_iter_id; - - if (src_axis_id < src_size) { - const std::size_t scan_axis_id = - src_axis_id / chunk_size; - const std::size_t scan_id = - scan_axis_id + iter_gid * local_stride; - - const outputT modifier = - (scan_axis_id > 0) - ? local_scans[scan_id - 1] - : identity; - - src[src_id] = scan_op(src[src_id], modifier); - } - } - }); - }); + dependent_event = + final_update_local_chunks( + exec_q, iter_nelems, src, src_size, local_scans, chunk_size, + local_stride, out_iter_indexer, out_indexer, + dependent_event); } - sycl::event free_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(dependent_event); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, temp]() { sycl_free_noexcept(temp, ctx); }); - }); + sycl::event free_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {dependent_event}, temp_owner); host_tasks.push_back(free_ev); } @@ -1255,11 +1222,10 @@ std::size_t cumsum_val_contig_impl(sycl::queue &q, } cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); - cumsumT *last_elem_host_usm = sycl::malloc_host(1, q); + auto host_usm_owner = + dpctl::tensor::alloc_utils::smart_malloc_host(1, q); + cumsumT *last_elem_host_usm = host_usm_owner.get(); - if (last_elem_host_usm == nullptr) { - throw std::bad_alloc(); - } sycl::event copy_e = q.submit([&](sycl::handler &cgh) { cgh.depends_on(comp_ev); cgh.copy(last_elem, last_elem_host_usm, 1); @@ -1267,8 +1233,8 @@ std::size_t cumsum_val_contig_impl(sycl::queue &q, copy_e.wait(); std::size_t return_val = static_cast(*last_elem_host_usm); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - sycl_free_noexcept(last_elem_host_usm, q); + // free USM host allocation + host_usm_owner.reset(nullptr); return return_val; } @@ -1370,11 +1336,10 @@ cumsum_val_strided_impl(sycl::queue &q, cumsumT *last_elem = 
cumsum_data_ptr + (n_elems - 1); - cumsumT *last_elem_host_usm = sycl::malloc_host(1, q); + auto host_usm_owner = + dpctl::tensor::alloc_utils::smart_malloc_host(1, q); + cumsumT *last_elem_host_usm = host_usm_owner.get(); - if (last_elem_host_usm == nullptr) { - throw std::bad_alloc(); - } sycl::event copy_e = q.submit([&](sycl::handler &cgh) { cgh.depends_on(comp_ev); cgh.copy(last_elem, last_elem_host_usm, 1); @@ -1382,8 +1347,8 @@ cumsum_val_strided_impl(sycl::queue &q, copy_e.wait(); std::size_t return_val = static_cast(*last_elem_host_usm); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - sycl_free_noexcept(last_elem_host_usm, q); + // free USM-host temporary + host_usm_owner.reset(nullptr); return return_val; } From 8c167bfb8290404898e9394428d54a2506d6b120 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 9 Jan 2025 14:46:43 -0600 Subject: [PATCH 06/11] Add comment before call to unique_ptr::release method --- dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp index 1e6706e743..73a09011a0 100644 --- a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp @@ -206,6 +206,10 @@ sycl::event async_smart_free(sycl::queue &exec_q, } }); }); + + // Upon successful submission of host_task, USM allocations are owned + // by the host_task. 
Release smart pointer ownership to avoid double + deallocation (unique_pointers.release(), ...); return ht_e; From ce02c6ceca7f4f5e873eee8f702c9356e9715f49 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 9 Jan 2025 14:48:11 -0600 Subject: [PATCH 07/11] Add comments explaining intention of unique_ptr::reset call --- dpctl/tensor/libtensor/include/kernels/accumulators.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp index 18e8a9c74c..731807c7d0 100644 --- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -1233,7 +1233,8 @@ std::size_t cumsum_val_contig_impl(sycl::queue &q, copy_e.wait(); std::size_t return_val = static_cast(*last_elem_host_usm); - // free USM host allocation + // explicitly free USM host allocation, by invoking deleter of + // the unique_ptr host_usm_owner.reset(nullptr); return return_val; @@ -1347,7 +1348,8 @@ cumsum_val_strided_impl(sycl::queue &q, copy_e.wait(); std::size_t return_val = static_cast(*last_elem_host_usm); - // free USM-host temporary + // explicitly free USM-host temporary, by invoking deleter of + // the unique_ptr host_usm_owner.reset(nullptr); return return_val; From fea54b60c94c7cd59ac628405f578e7149a5ef89 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 9 Jan 2025 14:50:02 -0600 Subject: [PATCH 08/11] Replace sycl::malloc_device with smart_malloc_device Direct calls to host_task to asynchronously deallocate USM temporary are replaced with call to async_smart_free which submits the host_task for us and transfers allocation ownership from smart pointer to the host task.
--- .../kernels/elementwise_functions/common.hpp | 37 +- .../elementwise_functions/common_inplace.hpp | 17 +- .../kernels/linalg_functions/dot_product.hpp | 63 +-- .../include/kernels/linalg_functions/gemm.hpp | 397 ++++++------------ .../libtensor/include/kernels/reductions.hpp | 239 ++++------- 5 files changed, 256 insertions(+), 497 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp index 6e1119c294..8aa06f0070 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp @@ -912,11 +912,11 @@ sycl::event binary_contig_matrix_contig_row_broadcast_impl( *(std::max_element(std::begin(sg_sizes), std::end(sg_sizes))); std::size_t n1_padded = n1 + max_sgSize; - argT2 *padded_vec = sycl::malloc_device(n1_padded, exec_q); + auto padded_vec_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(n1_padded, + exec_q); + argT2 *padded_vec = padded_vec_owner.get(); - if (padded_vec == nullptr) { - throw std::runtime_error("Could not allocate memory on the device"); - } sycl::event make_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); // ensure vec contains actual data cgh.parallel_for({n1_padded}, [=](sycl::id<1> id) { @@ -948,13 +948,9 @@ sycl::event binary_contig_matrix_contig_row_broadcast_impl( mat, padded_vec, res, n_elems, n1)); }); - sycl::event tmp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(comp_ev); - const sycl::context &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task( - [ctx, padded_vec]() { sycl_free_noexcept(padded_vec, ctx); }); - }); + sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {comp_ev}, padded_vec_owner); + host_tasks.push_back(tmp_cleanup_ev); return comp_ev; @@ -992,11 +988,10 @@ 
sycl::event binary_contig_row_contig_matrix_broadcast_impl( *(std::max_element(std::begin(sg_sizes), std::end(sg_sizes))); std::size_t n1_padded = n1 + max_sgSize; - argT2 *padded_vec = sycl::malloc_device(n1_padded, exec_q); - - if (padded_vec == nullptr) { - throw std::runtime_error("Could not allocate memory on the device"); - } + auto padded_vec_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(n1_padded, + exec_q); + argT2 *padded_vec = padded_vec_owner.get(); sycl::event make_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); // ensure vec contains actual data @@ -1029,13 +1024,9 @@ sycl::event binary_contig_row_contig_matrix_broadcast_impl( padded_vec, mat, res, n_elems, n1)); }); - sycl::event tmp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(comp_ev); - const sycl::context &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task( - [ctx, padded_vec]() { sycl_free_noexcept(padded_vec, ctx); }); - }); + sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {comp_ev}, padded_vec_owner); + host_tasks.push_back(tmp_cleanup_ev); return comp_ev; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp index c7941094d7..8d93da3f8c 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp @@ -423,11 +423,11 @@ sycl::event binary_inplace_row_matrix_broadcast_impl( *(std::max_element(std::begin(sg_sizes), std::end(sg_sizes))); std::size_t n1_padded = n1 + max_sgSize; - argT *padded_vec = sycl::malloc_device(n1_padded, exec_q); + auto padded_vec_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(n1_padded, + exec_q); + argT *padded_vec = padded_vec_owner.get(); - if (padded_vec == nullptr) { 
- throw std::runtime_error("Could not allocate memory on the device"); - } sycl::event make_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); // ensure vec contains actual data cgh.parallel_for({n1_padded}, [=](sycl::id<1> id) { @@ -459,13 +459,8 @@ sycl::event binary_inplace_row_matrix_broadcast_impl( n_elems, n1)); }); - sycl::event tmp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(comp_ev); - const sycl::context &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task( - [ctx, padded_vec]() { sycl_free_noexcept(padded_vec, ctx); }); - }); + sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {comp_ev}, padded_vec_owner); host_tasks.push_back(tmp_cleanup_ev); return comp_ev; diff --git a/dpctl/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp b/dpctl/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp index 4f2e693779..97a8c3f165 100644 --- a/dpctl/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp +++ b/dpctl/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp @@ -1026,18 +1026,15 @@ sycl::event dot_product_tree_impl(sycl::queue &exec_q, (reduction_groups + preferred_reductions_per_wi * wg - 1) / (preferred_reductions_per_wi * wg); - resTy *partially_reduced_tmp = sycl::malloc_device( - batches * (reduction_groups + second_iter_reduction_groups_), - exec_q); - resTy *partially_reduced_tmp2 = nullptr; + // returns unique_ptr + auto partially_reduced_tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + batches * (reduction_groups + second_iter_reduction_groups_), + exec_q); - if (partially_reduced_tmp == nullptr) { - throw std::runtime_error("Unable to allocate device_memory"); - } - else { - partially_reduced_tmp2 = - partially_reduced_tmp + reduction_groups * batches; - } + resTy *partially_reduced_tmp = partially_reduced_tmp_owner.get(); + resTy 
*partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * batches; sycl::event first_reduction_ev; { @@ -1152,16 +1149,10 @@ sycl::event dot_product_tree_impl(sycl::queue &exec_q, remaining_reduction_nelems, reductions_per_wi, reduction_groups, in_out_iter_indexer, reduction_indexer, {dependent_ev}); + // transfer ownership of USM allocation to host_task sycl::event cleanup_host_task_event = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(final_reduction_ev); - const sycl::context &ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, partially_reduced_tmp] { - sycl_free_noexcept(partially_reduced_tmp, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {final_reduction_ev}, partially_reduced_tmp_owner); return cleanup_host_task_event; } @@ -1282,18 +1273,15 @@ dot_product_contig_tree_impl(sycl::queue &exec_q, (reduction_groups + preferred_reductions_per_wi * wg - 1) / (preferred_reductions_per_wi * wg); - resTy *partially_reduced_tmp = sycl::malloc_device( - batches * (reduction_groups + second_iter_reduction_groups_), - exec_q); - resTy *partially_reduced_tmp2 = nullptr; - - if (partially_reduced_tmp == nullptr) { - throw std::runtime_error("Unable to allocate device_memory"); - } - else { - partially_reduced_tmp2 = - partially_reduced_tmp + reduction_groups * batches; - } + // unique_ptr that owns temporary allocation for partial reductions + auto partially_reduced_tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + batches * (reduction_groups + second_iter_reduction_groups_), + exec_q); + // get raw pointers + resTy *partially_reduced_tmp = partially_reduced_tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * batches; sycl::event first_reduction_ev; { @@ -1401,15 +1389,8 @@ dot_product_contig_tree_impl(sycl::queue &exec_q, in_out_iter_indexer, reduction_indexer, {dependent_ev}); sycl::event 
cleanup_host_task_event = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(final_reduction_ev); - const sycl::context &ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, partially_reduced_tmp] { - sycl_free_noexcept(partially_reduced_tmp, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {final_reduction_ev}, partially_reduced_tmp_owner); return cleanup_host_task_event; } diff --git a/dpctl/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp b/dpctl/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp index 7ae6f7d791..c4b47ca7f3 100644 --- a/dpctl/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp +++ b/dpctl/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp @@ -2343,11 +2343,10 @@ gemm_batch_tree_k_impl(sycl::queue &exec_q, dev.get_info() / 2); if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { - resTy *tmp = sycl::malloc_device( - iter_nelems * reduction_nelems, exec_q); - if (!tmp) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * reduction_nelems, exec_q); + resTy *tmp = tmp_owner.get(); using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::StridedIndexer; @@ -2390,31 +2389,28 @@ gemm_batch_tree_k_impl(sycl::queue &exec_q, {gemm_ev}); sycl::event cleanup_host_task_event = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(red_ev); - const sycl::context &ctx = exec_q.get_context(); + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, tmp] { sycl_free_noexcept(tmp, ctx); }); - }); return cleanup_host_task_event; } else { assert(reduction_groups > 1); - resTy *partially_reduced_tmp = sycl::malloc_device( - iter_nelems * (/* temp */ reduction_nelems + - /* first reduction temp */ reduction_groups), - exec_q); - 
resTy *partially_reduced_tmp2 = nullptr; - - if (partially_reduced_tmp == nullptr) { - throw std::runtime_error("Unable to allocate device_memory"); - } - else { - partially_reduced_tmp2 = - partially_reduced_tmp + reduction_nelems * iter_nelems; - } + const std::size_t tmp_alloc_size = + iter_nelems * ( + /* temp */ reduction_nelems + + /* first reduction temp */ reduction_groups); + + // get unique_ptr owning the temporary allocation + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + // get raw USM pointer + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_nelems * iter_nelems; + ; using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::StridedIndexer; @@ -2455,15 +2451,8 @@ gemm_batch_tree_k_impl(sycl::queue &exec_q, {gemm_ev}); sycl::event cleanup_host_task_event = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(red_ev); - const sycl::context &ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, partially_reduced_tmp] { - sycl_free_noexcept(partially_reduced_tmp, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); return cleanup_host_task_event; } @@ -2644,12 +2633,10 @@ gemm_batch_tree_nm_impl(sycl::queue &exec_q, std::size_t max_wg = reduction_detail::get_work_group_size(dev); if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { - resTy *tmp = sycl::malloc_device( - iter_nelems * reduction_nelems, exec_q); - - if (!tmp) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * reduction_nelems, exec_q); + resTy *tmp = tmp_owner.get(); using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::StridedIndexer; @@ -2692,31 +2679,26 @@ gemm_batch_tree_nm_impl(sycl::queue &exec_q, {gemm_ev}); sycl::event cleanup_host_task_event 
= - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(red_ev); - const sycl::context &ctx = exec_q.get_context(); + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, tmp] { sycl_free_noexcept(tmp, ctx); }); - }); return cleanup_host_task_event; } else { assert(reduction_groups > 1); - resTy *partially_reduced_tmp = sycl::malloc_device( + const std::size_t tmp_alloc_size = iter_nelems * (/* temp */ reduction_nelems + - /* first reduction temp */ reduction_groups), - exec_q); - resTy *partially_reduced_tmp2 = nullptr; + /* first reduction temp */ reduction_groups); - if (partially_reduced_tmp == nullptr) { - throw std::runtime_error("Unable to allocate device_memory"); - } - else { - partially_reduced_tmp2 = - partially_reduced_tmp + reduction_nelems * iter_nelems; - } + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_nelems * iter_nelems; + ; using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::StridedIndexer; @@ -2761,15 +2743,8 @@ gemm_batch_tree_nm_impl(sycl::queue &exec_q, {gemm_ev}); sycl::event cleanup_host_task_event = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(red_ev); - const sycl::context &ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, partially_reduced_tmp] { - sycl_free_noexcept(partially_reduced_tmp, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); return cleanup_host_task_event; } @@ -3033,12 +3008,10 @@ gemm_batch_contig_tree_k_impl(sycl::queue &exec_q, std::size_t max_wg = reduction_detail::get_work_group_size(dev); if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { - resTy *tmp = sycl::malloc_device( - iter_nelems * 
reduction_nelems, exec_q); - - if (!tmp) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * reduction_nelems, exec_q); + resTy *tmp = tmp_owner.get(); using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; @@ -3073,31 +3046,25 @@ gemm_batch_contig_tree_k_impl(sycl::queue &exec_q, preferred_reductions_per_wi, reductions_per_wi, {gemm_ev}); sycl::event cleanup_host_task_event = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(red_ev); - const sycl::context &ctx = exec_q.get_context(); + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, tmp] { sycl_free_noexcept(tmp, ctx); }); - }); return cleanup_host_task_event; } else { assert(reduction_groups > 1); - resTy *partially_reduced_tmp = sycl::malloc_device( + const std::size_t tmp_alloc_size = iter_nelems * (/* temp */ reduction_nelems + - /* first reduction temp */ reduction_groups), - exec_q); - resTy *partially_reduced_tmp2 = nullptr; + /* first reduction temp */ reduction_groups); - if (partially_reduced_tmp == nullptr) { - throw std::runtime_error("Unable to allocate device_memory"); - } - else { - partially_reduced_tmp2 = - partially_reduced_tmp + reduction_nelems * iter_nelems; - } + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_nelems * iter_nelems; using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; @@ -3133,15 +3100,8 @@ gemm_batch_contig_tree_k_impl(sycl::queue &exec_q, reductions_per_wi, {gemm_ev}); sycl::event cleanup_host_task_event = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(red_ev); - const sycl::context &ctx = exec_q.get_context(); - - using 
dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, partially_reduced_tmp] { - sycl_free_noexcept(partially_reduced_tmp, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); return cleanup_host_task_event; } @@ -3234,12 +3194,11 @@ gemm_batch_contig_tree_nm_impl(sycl::queue &exec_q, std::size_t max_wg = reduction_detail::get_work_group_size(dev); if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { - resTy *tmp = sycl::malloc_device( - iter_nelems * reduction_nelems, exec_q); + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * reduction_nelems, exec_q); - if (!tmp) { - throw std::runtime_error("Unable to allocate device memory"); - } + resTy *tmp = tmp_owner.get(); using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; @@ -3275,31 +3234,25 @@ gemm_batch_contig_tree_nm_impl(sycl::queue &exec_q, preferred_reductions_per_wi, reductions_per_wi, {gemm_ev}); sycl::event cleanup_host_task_event = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(red_ev); - const sycl::context &ctx = exec_q.get_context(); + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, tmp] { sycl_free_noexcept(tmp, ctx); }); - }); return cleanup_host_task_event; } else { assert(reduction_groups > 1); - resTy *partially_reduced_tmp = sycl::malloc_device( + const std::size_t tmp_alloc_size = iter_nelems * (/* temp */ reduction_nelems + - /* first reduction temp */ reduction_groups), - exec_q); - resTy *partially_reduced_tmp2 = nullptr; + /* first reduction temp */ reduction_groups); - if (partially_reduced_tmp == nullptr) { - throw std::runtime_error("Unable to allocate device_memory"); - } - else { - partially_reduced_tmp2 = - partially_reduced_tmp + reduction_nelems * iter_nelems; - } + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 
tmp_alloc_size, exec_q); + + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_nelems * iter_nelems; using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; @@ -3337,15 +3290,8 @@ gemm_batch_contig_tree_nm_impl(sycl::queue &exec_q, reductions_per_wi, {gemm_ev}); sycl::event cleanup_host_task_event = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(red_ev); - const sycl::context &ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, partially_reduced_tmp] { - sycl_free_noexcept(partially_reduced_tmp, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); return cleanup_host_task_event; } @@ -3621,12 +3567,11 @@ sycl::event gemm_tree_k_impl(sycl::queue &exec_q, std::size_t max_wg = reduction_detail::get_work_group_size(dev); if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { - resTy *tmp = sycl::malloc_device( - iter_nelems * reduction_nelems, exec_q); - if (!tmp) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * reduction_nelems, exec_q); + resTy *tmp = tmp_owner.get(); using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; constexpr ResIndexerT res_indexer{}; @@ -3645,31 +3590,24 @@ sycl::event gemm_tree_k_impl(sycl::queue &exec_q, res_shapes_strides, {gemm_ev}); sycl::event cleanup_host_task_event = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(red_ev); - const sycl::context &ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, tmp] { sycl_free_noexcept(tmp, ctx); }); - }); + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); return cleanup_host_task_event; } else { assert(reduction_groups > 1); - resTy *partially_reduced_tmp = sycl::malloc_device( + const 
std::size_t tmp_alloc_size = iter_nelems * (/* temp */ reduction_nelems + - /* first reduction temp */ reduction_groups), - exec_q); - resTy *partially_reduced_tmp2 = nullptr; + /* first reduction temp */ reduction_groups); - if (partially_reduced_tmp == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } - else { - partially_reduced_tmp2 = - partially_reduced_tmp + reduction_nelems * iter_nelems; - } + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_nelems * iter_nelems; using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; constexpr ResIndexerT res_indexer{}; @@ -3689,15 +3627,8 @@ sycl::event gemm_tree_k_impl(sycl::queue &exec_q, res_nd, 0, res_shapes_strides, {gemm_ev}); sycl::event cleanup_host_task_event = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(red_ev); - const sycl::context &ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, partially_reduced_tmp] { - sycl_free_noexcept(partially_reduced_tmp, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); return cleanup_host_task_event; } @@ -3789,12 +3720,10 @@ sycl::event gemm_tree_nm_impl(sycl::queue &exec_q, std::size_t max_wg = reduction_detail::get_work_group_size(dev); if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { - resTy *tmp = sycl::malloc_device( - iter_nelems * reduction_nelems, exec_q); - - if (!tmp) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * reduction_nelems, exec_q); + resTy *tmp = tmp_owner.get(); using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; constexpr ResIndexerT res_indexer{}; @@ -3813,31 +3742,24 @@ sycl::event 
gemm_tree_nm_impl(sycl::queue &exec_q, res_shapes_strides, {gemm_ev}); sycl::event cleanup_host_task_event = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(red_ev); - const sycl::context &ctx = exec_q.get_context(); + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, tmp] { sycl_free_noexcept(tmp, ctx); }); - }); return cleanup_host_task_event; } else { assert(reduction_groups > 1); - resTy *partially_reduced_tmp = sycl::malloc_device( + const std::size_t tmp_alloc_size = iter_nelems * (/* temp */ reduction_nelems + - /* first reduction temp */ reduction_groups), - exec_q); - resTy *partially_reduced_tmp2 = nullptr; + /* first reduction temp */ reduction_groups); + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); - if (partially_reduced_tmp == nullptr) { - throw std::runtime_error("Unable to allocate device_memory"); - } - else { - partially_reduced_tmp2 = - partially_reduced_tmp + reduction_nelems * iter_nelems; - } + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_nelems * iter_nelems; using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; constexpr ResIndexerT res_indexer{}; @@ -3857,15 +3779,8 @@ sycl::event gemm_tree_nm_impl(sycl::queue &exec_q, res_nd, 0, res_shapes_strides, {gemm_ev}); sycl::event cleanup_host_task_event = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(red_ev); - const sycl::context &ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, partially_reduced_tmp] { - sycl_free_noexcept(partially_reduced_tmp, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); return cleanup_host_task_event; } @@ -4042,12 +3957,10 @@ sycl::event gemm_contig_tree_k_impl(sycl::queue &exec_q, std::size_t max_wg = 
reduction_detail::get_work_group_size(dev); if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { - resTy *tmp = sycl::malloc_device( - iter_nelems * reduction_nelems, exec_q); - - if (!tmp) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * reduction_nelems, exec_q); + resTy *tmp = tmp_owner.get(); sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step< lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, @@ -4063,31 +3976,23 @@ sycl::event gemm_contig_tree_k_impl(sycl::queue &exec_q, preferred_reductions_per_wi, reductions_per_wi, {gemm_ev}); sycl::event cleanup_host_task_event = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(red_ev); - const sycl::context &ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, tmp] { sycl_free_noexcept(tmp, ctx); }); - }); + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); return cleanup_host_task_event; } else { assert(reduction_groups > 1); - resTy *partially_reduced_tmp = sycl::malloc_device( + const std::size_t tmp_alloc_size = iter_nelems * (/* temp */ reduction_nelems + - /* first reduction temp */ reduction_groups), - exec_q); - resTy *partially_reduced_tmp2 = nullptr; + /* first reduction temp */ reduction_groups); + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); - if (partially_reduced_tmp == nullptr) { - throw std::runtime_error("Unable to allocate device_memory"); - } - else { - partially_reduced_tmp2 = - partially_reduced_tmp + reduction_nelems * iter_nelems; - } + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_nelems * iter_nelems; sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step< lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, @@ -4106,15 +4011,8 @@ sycl::event 
gemm_contig_tree_k_impl(sycl::queue &exec_q, reductions_per_wi, {gemm_ev}); sycl::event cleanup_host_task_event = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(red_ev); - const sycl::context &ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, partially_reduced_tmp] { - sycl_free_noexcept(partially_reduced_tmp, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); return cleanup_host_task_event; } @@ -4197,12 +4095,10 @@ sycl::event gemm_contig_tree_nm_impl(sycl::queue &exec_q, std::size_t max_wg = reduction_detail::get_work_group_size(dev); if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { - resTy *tmp = sycl::malloc_device( - iter_nelems * reduction_nelems, exec_q); - - if (!tmp) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * reduction_nelems, exec_q); + resTy *tmp = tmp_owner.get(); sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step< lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, @@ -4219,31 +4115,23 @@ sycl::event gemm_contig_tree_nm_impl(sycl::queue &exec_q, preferred_reductions_per_wi, reductions_per_wi, {gemm_ev}); sycl::event cleanup_host_task_event = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(red_ev); - const sycl::context &ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, tmp] { sycl_free_noexcept(tmp, ctx); }); - }); + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); return cleanup_host_task_event; } else { assert(reduction_groups > 1); - resTy *partially_reduced_tmp = sycl::malloc_device( + const std::size_t tmp_alloc_size = iter_nelems * (/* temp */ reduction_nelems + - /* first reduction temp */ reduction_groups), - exec_q); - resTy *partially_reduced_tmp2 = nullptr; + /* first reduction temp */ 
reduction_groups); - if (partially_reduced_tmp == nullptr) { - throw std::runtime_error("Unable to allocate device_memory"); - } - else { - partially_reduced_tmp2 = - partially_reduced_tmp + reduction_nelems * iter_nelems; - } + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_nelems * iter_nelems; sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step< lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, @@ -4261,15 +4149,8 @@ sycl::event gemm_contig_tree_nm_impl(sycl::queue &exec_q, reductions_per_wi, {gemm_ev}); sycl::event cleanup_host_task_event = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(red_ev); - const sycl::context &ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, partially_reduced_tmp] { - sycl_free_noexcept(partially_reduced_tmp, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); return cleanup_host_task_event; } diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index 8e41ce5bf1..2e6ab27f08 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -1246,18 +1246,15 @@ sycl::event reduction_over_group_temps_strided_impl( (reduction_groups + preferred_reductions_per_wi * wg - 1) / (preferred_reductions_per_wi * wg); - resTy *partially_reduced_tmp = sycl::malloc_device( - iter_nelems * (reduction_groups + second_iter_reduction_groups_), - exec_q); - resTy *partially_reduced_tmp2 = nullptr; + const std::size_t tmp_alloc_size = + iter_nelems * (reduction_groups + second_iter_reduction_groups_); + auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); - if (partially_reduced_tmp == nullptr) { 
- throw std::runtime_error("Unable to allocate device_memory"); - } - else { - partially_reduced_tmp2 = - partially_reduced_tmp + reduction_groups * iter_nelems; - } + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + ; sycl::event first_reduction_ev; { @@ -1376,15 +1373,8 @@ sycl::event reduction_over_group_temps_strided_impl( in_out_iter_indexer, reduction_indexer, {dependent_ev}); sycl::event cleanup_host_task_event = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(final_reduction_ev); - const sycl::context &ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, partially_reduced_tmp] { - sycl_free_noexcept(partially_reduced_tmp, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {final_reduction_ev}, tmp_owner); // FIXME: do not return host-task event // Instead collect all host-tasks to a list @@ -1503,18 +1493,13 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( (reduction_groups + preferred_reductions_per_wi * wg - 1) / (preferred_reductions_per_wi * wg); - resTy *partially_reduced_tmp = sycl::malloc_device( - iter_nelems * (reduction_groups + second_iter_reduction_groups_), - exec_q); - resTy *partially_reduced_tmp2 = nullptr; - - if (partially_reduced_tmp == nullptr) { - throw std::runtime_error("Unable to allocate device_memory"); - } - else { - partially_reduced_tmp2 = - partially_reduced_tmp + reduction_groups * iter_nelems; - } + const std::size_t tmp_alloc_size = + iter_nelems * (reduction_groups + second_iter_reduction_groups_); + auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; sycl::event first_reduction_ev; { @@ -1618,15 +1603,8 @@ sycl::event 
reduction_axis1_over_group_temps_contig_impl( in_out_iter_indexer, reduction_indexer, {dependent_ev}); sycl::event cleanup_host_task_event = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(final_reduction_ev); - const sycl::context &ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, partially_reduced_tmp] { - sycl_free_noexcept(partially_reduced_tmp, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {final_reduction_ev}, tmp_owner); // FIXME: do not return host-task event // Instead collect all host-tasks to a list @@ -1744,18 +1722,15 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( (reduction_groups + preferred_reductions_per_wi * wg - 1) / (preferred_reductions_per_wi * wg); - resTy *partially_reduced_tmp = sycl::malloc_device( - iter_nelems * (reduction_groups + second_iter_reduction_groups_), - exec_q); - resTy *partially_reduced_tmp2 = nullptr; + const std::size_t tmp_alloc_size = + iter_nelems * (reduction_groups + second_iter_reduction_groups_); - if (partially_reduced_tmp == nullptr) { - throw std::runtime_error("Unable to allocate device_memory"); - } - else { - partially_reduced_tmp2 = - partially_reduced_tmp + reduction_groups * iter_nelems; - } + auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; sycl::event first_reduction_ev; { @@ -1860,15 +1835,8 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( in_out_iter_indexer, reduction_indexer, {dependent_ev}); sycl::event cleanup_host_task_event = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(final_reduction_ev); - const sycl::context &ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, partially_reduced_tmp] { - 
sycl_free_noexcept(partially_reduced_tmp, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {final_reduction_ev}, tmp_owner); // FIXME: do not return host-task event // Instead collect all host-tasks to a list @@ -2651,31 +2619,22 @@ sycl::event search_over_group_temps_strided_impl( (reduction_groups + preferred_reductions_per_wi * wg - 1) / (preferred_reductions_per_wi * wg); - resTy *partially_reduced_tmp = sycl::malloc_device( - iter_nelems * (reduction_groups + second_iter_reduction_groups_), - exec_q); - resTy *partially_reduced_tmp2 = nullptr; + const std::size_t tmp_alloc_size = + iter_nelems * (reduction_groups + second_iter_reduction_groups_); + auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); - if (partially_reduced_tmp == nullptr) { - throw std::runtime_error("Unable to allocate device_memory"); - } - else { - partially_reduced_tmp2 = - partially_reduced_tmp + reduction_groups * iter_nelems; - } + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; - argTy *partially_reduced_vals_tmp = sycl::malloc_device( - iter_nelems * (reduction_groups + second_iter_reduction_groups_), - exec_q); - argTy *partially_reduced_vals_tmp2 = nullptr; + auto val_tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); - if (partially_reduced_vals_tmp == nullptr) { - throw std::runtime_error("Unable to allocate device_memory"); - } - else { - partially_reduced_vals_tmp2 = - partially_reduced_vals_tmp + reduction_groups * iter_nelems; - } + argTy *partially_reduced_vals_tmp = val_tmp_owner.get(); + argTy *partially_reduced_vals_tmp2 = + partially_reduced_vals_tmp + reduction_groups * iter_nelems; sycl::event first_reduction_ev; { @@ -2799,17 +2758,8 @@ sycl::event search_over_group_temps_strided_impl( reduction_indexer, {dependent_ev}); sycl::event cleanup_host_task_event = - 
exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(final_reduction_ev); - sycl::context ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task( - [ctx, partially_reduced_tmp, partially_reduced_vals_tmp] { - sycl_free_noexcept(partially_reduced_tmp, ctx); - sycl_free_noexcept(partially_reduced_vals_tmp, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {final_reduction_ev}, tmp_owner, val_tmp_owner); // FIXME: do not return host-task event // Instead collect all host-tasks to a list @@ -2951,31 +2901,20 @@ sycl::event search_axis1_over_group_temps_contig_impl( (reduction_groups + preferred_reductions_per_wi * wg - 1) / (preferred_reductions_per_wi * wg); - resTy *partially_reduced_tmp = sycl::malloc_device( - iter_nelems * (reduction_groups + second_iter_reduction_groups_), - exec_q); - resTy *partially_reduced_tmp2 = nullptr; - - if (partially_reduced_tmp == nullptr) { - throw std::runtime_error("Unable to allocate device_memory"); - } - else { - partially_reduced_tmp2 = - partially_reduced_tmp + reduction_groups * iter_nelems; - } - - argTy *partially_reduced_vals_tmp = sycl::malloc_device( - iter_nelems * (reduction_groups + second_iter_reduction_groups_), - exec_q); - argTy *partially_reduced_vals_tmp2 = nullptr; - - if (partially_reduced_vals_tmp == nullptr) { - throw std::runtime_error("Unable to allocate device_memory"); - } - else { - partially_reduced_vals_tmp2 = - partially_reduced_vals_tmp + reduction_groups * iter_nelems; - } + const std::size_t tmp_alloc_size = + iter_nelems * (reduction_groups + second_iter_reduction_groups_); + auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + + auto val_tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + argTy 
*partially_reduced_vals_tmp = val_tmp_owner.get(); + argTy *partially_reduced_vals_tmp2 = + partially_reduced_vals_tmp + reduction_groups * iter_nelems; sycl::event first_reduction_ev; { @@ -3090,17 +3029,8 @@ sycl::event search_axis1_over_group_temps_contig_impl( reduction_indexer, {dependent_ev}); sycl::event cleanup_host_task_event = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(final_reduction_ev); - sycl::context ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task( - [ctx, partially_reduced_tmp, partially_reduced_vals_tmp] { - sycl_free_noexcept(partially_reduced_tmp, ctx); - sycl_free_noexcept(partially_reduced_vals_tmp, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {final_reduction_ev}, tmp_owner, val_tmp_owner); // FIXME: do not return host-task event // Instead collect all host-tasks to a list @@ -3235,31 +3165,21 @@ sycl::event search_axis0_over_group_temps_contig_impl( (reduction_groups + preferred_reductions_per_wi * wg - 1) / (preferred_reductions_per_wi * wg); - resTy *partially_reduced_tmp = sycl::malloc_device( - iter_nelems * (reduction_groups + second_iter_reduction_groups_), - exec_q); - resTy *partially_reduced_tmp2 = nullptr; + const std::size_t tmp_alloc_size = + iter_nelems * (reduction_groups + second_iter_reduction_groups_); + auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); - if (partially_reduced_tmp == nullptr) { - throw std::runtime_error("Unable to allocate device_memory"); - } - else { - partially_reduced_tmp2 = - partially_reduced_tmp + reduction_groups * iter_nelems; - } + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; - argTy *partially_reduced_vals_tmp = sycl::malloc_device( - iter_nelems * (reduction_groups + second_iter_reduction_groups_), - exec_q); - argTy *partially_reduced_vals_tmp2 = nullptr; - - 
if (partially_reduced_vals_tmp == nullptr) { - throw std::runtime_error("Unable to allocate device_memory"); - } - else { - partially_reduced_vals_tmp2 = - partially_reduced_vals_tmp + reduction_groups * iter_nelems; - } + auto vals_tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + argTy *partially_reduced_vals_tmp = vals_tmp_owner.get(); + argTy *partially_reduced_vals_tmp2 = + partially_reduced_vals_tmp + reduction_groups * iter_nelems; sycl::event first_reduction_ev; { @@ -3375,17 +3295,8 @@ sycl::event search_axis0_over_group_temps_contig_impl( reduction_indexer, {dependent_ev}); sycl::event cleanup_host_task_event = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(final_reduction_ev); - sycl::context ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task( - [ctx, partially_reduced_tmp, partially_reduced_vals_tmp] { - sycl_free_noexcept(partially_reduced_tmp, ctx); - sycl_free_noexcept(partially_reduced_vals_tmp, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {final_reduction_ev}, tmp_owner, vals_tmp_owner); // FIXME: do not return host-task event // Instead collect all host-tasks to a list From 9841f9e14ae48091b5e393caccc26a38ae6401a0 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 9 Jan 2025 14:55:33 -0600 Subject: [PATCH 09/11] Change signature of copy_and_cast_from_host_impl and copy_for_reshape_generic_impl to take packed shape/strides as const pointer --- .../include/kernels/copy_and_cast.hpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp index bafa6ee7ec..3b977679e8 100644 --- a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp +++ b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp @@ -551,7 +551,7 
@@ typedef void (*copy_and_cast_from_host_blocking_fn_ptr_t)( sycl::queue &, std::size_t, int, - ssize_t *, + const ssize_t *, const char *, ssize_t, ssize_t, @@ -604,7 +604,7 @@ void copy_and_cast_from_host_impl( sycl::queue &q, std::size_t nelems, int nd, - ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *host_src_p, ssize_t src_offset, ssize_t src_min_nelem_offset, @@ -797,12 +797,12 @@ class GenericCopyForReshapeFunctor // define function type typedef sycl::event (*copy_for_reshape_fn_ptr_t)( sycl::queue &, - std::size_t, // num_elements - int, // src_nd - int, // dst_nd - ssize_t *, // packed shapes and strides - const char *, // src_data_ptr - char *, // dst_data_ptr + std::size_t, // num_elements + int, // src_nd + int, // dst_nd + const ssize_t *, // packed shapes and strides + const char *, // src_data_ptr + char *, // dst_data_ptr const std::vector &); /*! @@ -832,7 +832,7 @@ copy_for_reshape_generic_impl(sycl::queue &q, std::size_t nelems, int src_nd, int dst_nd, - ssize_t *packed_shapes_and_strides, + const ssize_t *packed_shapes_and_strides, const char *src_p, char *dst_p, const std::vector &depends) From 9d77fafcc5b6d0975af9ee938acf21597b7ab129 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 9 Jan 2025 14:56:02 -0600 Subject: [PATCH 10/11] Change to device_allocate_and_pack to return unique_ptr The unique_ptr owns the allocation ensuring no leaks during exception handling. This also allows async_smart_free to be used to schedule asynchronous deallocation of USM temporaries. 
--- .../libtensor/include/utils/offset_utils.hpp | 22 +- .../tensor/libtensor/source/accumulators.cpp | 35 ++-- .../accumulators/accumulate_over_axis.hpp | 48 ++--- .../source/boolean_advanced_indexing.cpp | 141 +++++-------- dpctl/tensor/libtensor/source/clip.cpp | 19 +- .../source/copy_and_cast_usm_to_usm.cpp | 19 +- .../libtensor/source/copy_as_contig.cpp | 84 +++----- .../libtensor/source/copy_for_reshape.cpp | 20 +- .../tensor/libtensor/source/copy_for_roll.cpp | 41 ++-- .../copy_numpy_ndarray_into_usm_ndarray.cpp | 12 +- .../source/elementwise_functions/add.cpp | 8 +- .../elementwise_functions.hpp | 64 ++---- .../elementwise_functions/true_divide.cpp | 23 +-- dpctl/tensor/libtensor/source/full_ctor.cpp | 21 +- .../source/integer_advanced_indexing.cpp | 189 +++++------------- .../libtensor/source/linalg_functions/dot.cpp | 95 ++++----- .../source/reductions/reduction_over_axis.hpp | 95 +++------ dpctl/tensor/libtensor/source/repeat.cpp | 143 +++++-------- .../libtensor/source/sorting/searchsorted.cpp | 20 +- dpctl/tensor/libtensor/source/triul_ctor.cpp | 13 +- dpctl/tensor/libtensor/source/where.cpp | 25 +-- 21 files changed, 392 insertions(+), 745 deletions(-) diff --git a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp b/dpctl/tensor/libtensor/include/utils/offset_utils.hpp index bd2b67afcb..1438def12a 100644 --- a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/offset_utils.hpp @@ -28,10 +28,13 @@ #include #include -#include +#include // for std::make_shared, std::unique_ptr #include +#include // for std::move, std::forward #include +#include + #include "kernels/dpctl_tensor_types.hpp" #include "utils/strided_iters.hpp" #include "utils/sycl_alloc_utils.hpp" @@ -84,7 +87,9 @@ std::vector concat(std::vector lhs, Vs &&...vs) } // namespace detail template -std::tuple +std::tuple, + std::size_t, + sycl::event> device_allocate_and_pack(sycl::queue &q, std::vector &host_task_events, Vs &&...vs) @@ -105,25 
+110,24 @@ device_allocate_and_pack(sycl::queue &q, std::make_shared(std::move(packed_shape_strides)); auto sz = packed_shape_strides_owner->size(); - indT *shape_strides = sycl::malloc_device(sz, q); - - if (shape_strides == nullptr) { - return std::make_tuple(shape_strides, 0, sycl::event()); - } + auto shape_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(sz, q); + indT *shape_strides = shape_strides_owner.get(); sycl::event copy_ev = q.copy(packed_shape_strides_owner->data(), shape_strides, sz); sycl::event cleanup_host_task_ev = q.submit([&](sycl::handler &cgh) { cgh.depends_on(copy_ev); - cgh.host_task([packed_shape_strides_owner] { + cgh.host_task([packed_shape_strides_owner = + std::move(packed_shape_strides_owner)] { // increment shared pointer ref-count to keep it alive // till copy operation completes; }); }); host_task_events.push_back(cleanup_host_task_ev); - return std::make_tuple(shape_strides, sz, copy_ev); + return std::make_tuple(std::move(shape_strides_owner), sz, copy_ev); } struct NoOpIndexer diff --git a/dpctl/tensor/libtensor/source/accumulators.cpp b/dpctl/tensor/libtensor/source/accumulators.cpp index 5430fcc58c..9ab2b3c659 100644 --- a/dpctl/tensor/libtensor/source/accumulators.cpp +++ b/dpctl/tensor/libtensor/source/accumulators.cpp @@ -196,14 +196,11 @@ std::size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask, : mask_positions_strided_i64_dispatch_vector[mask_typeid]; using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple = device_allocate_and_pack( + auto ptr_size_event_tuple = device_allocate_and_pack( exec_q, host_task_events, compact_shape, compact_strides); - py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple); - if (shape_strides == nullptr) { - sycl::event::wait(host_task_events); - throw std::runtime_error("Unexpected error"); - } + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); sycl::event copy_shape_ev = 
std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides = shape_strides_owner.get(); if (2 * static_cast(nd) != std::get<1>(ptr_size_event_tuple)) { { @@ -212,8 +209,8 @@ std::size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask, copy_shape_ev.wait(); sycl::event::wait(host_task_events); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - sycl_free_noexcept(shape_strides, exec_q); + // ensure deleter of smart pointer is invoked with GIL released + shape_strides_owner.reset(); } throw std::runtime_error("Unexpected error"); } @@ -233,8 +230,8 @@ std::size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask, cumsum_data, host_task_events, dependent_events); sycl::event::wait(host_task_events); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - sycl_free_noexcept(shape_strides, exec_q); + // ensure deleter of smart pointer is invoked with GIL released + shape_strides_owner.reset(); } return total_set; @@ -356,14 +353,11 @@ std::size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src, } using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple = device_allocate_and_pack( + auto ptr_size_event_tuple = device_allocate_and_pack( exec_q, host_task_events, compact_shape, compact_strides); - py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple); - if (shape_strides == nullptr) { - sycl::event::wait(host_task_events); - throw std::runtime_error("Unexpected error"); - } + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides = shape_strides_owner.get(); if (2 * static_cast(nd) != std::get<1>(ptr_size_event_tuple)) { { @@ -371,9 +365,10 @@ std::size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src, copy_shape_ev.wait(); sycl::event::wait(host_task_events); + + // ensure USM deleter is called with GIL released + shape_strides_owner.reset(); } - using
dpctl::tensor::alloc_utils::sycl_free_noexcept; - sycl_free_noexcept(shape_strides, exec_q); throw std::runtime_error("Unexpected error"); } @@ -391,8 +386,8 @@ std::size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src, py::gil_scoped_release release; sycl::event::wait(host_task_events); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - sycl_free_noexcept(shape_strides, exec_q); + // ensure USM deleter is called with GIL released + shape_strides_owner.reset(); } return total; diff --git a/dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp b/dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp index bc8306a5a4..2352b6ab30 100644 --- a/dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp +++ b/dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp @@ -200,18 +200,18 @@ py_accumulate_over_axis(const dpctl::tensor::usm_ndarray &src, } using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple = device_allocate_and_pack( + auto ptr_size_event_tuple = device_allocate_and_pack( exec_q, host_task_events, simplified_iter_shape, simplified_iter_src_strides, simplified_iter_dst_strides, acc_shape, acc_src_strides, acc_dst_strides); - py::ssize_t *packed_shapes_and_strides = std::get<0>(ptr_size_event_tuple); - if (packed_shapes_and_strides == nullptr) { - throw std::runtime_error("Unexpected error"); - } + auto packed_shapes_and_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); const auto &copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shapes_and_strides = + packed_shapes_and_strides_owner.get(); - py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides; - py::ssize_t *acc_shapes_and_strides = + const py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides; + const py::ssize_t *acc_shapes_and_strides = packed_shapes_and_strides + 3 * simplified_iter_shape.size(); std::vector all_deps; @@ -224,14
+224,8 @@ py_accumulate_over_axis(const dpctl::tensor::usm_ndarray &src, iter_shape_and_strides, iter_src_offset, iter_dst_offset, acc_nd, acc_shapes_and_strides, dst_data, host_task_events, all_deps); - sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(acc_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, packed_shapes_and_strides] { - sycl_free_noexcept(packed_shapes_and_strides, ctx); - }); - }); + sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {acc_ev}, packed_shapes_and_strides_owner); host_task_events.push_back(temp_cleanup_ev); return std::make_pair( @@ -384,18 +378,18 @@ std::pair py_accumulate_final_axis_include_initial( } using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple = device_allocate_and_pack( + auto ptr_size_event_tuple = device_allocate_and_pack( exec_q, host_task_events, simplified_iter_shape, simplified_iter_src_strides, simplified_iter_dst_strides, acc_shape, acc_src_strides, acc_dst_strides); - py::ssize_t *packed_shapes_and_strides = std::get<0>(ptr_size_event_tuple); - if (packed_shapes_and_strides == nullptr) { - throw std::runtime_error("Unexpected error"); - } + auto packed_shapes_and_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); const auto ©_shapes_strides_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shapes_and_strides = + packed_shapes_and_strides_owner.get(); - py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides; - py::ssize_t *acc_shapes_and_strides = + const py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides; + const py::ssize_t *acc_shapes_and_strides = packed_shapes_and_strides + 3 * simplified_iter_shape.size(); std::vector all_deps; @@ -408,14 +402,8 @@ std::pair py_accumulate_final_axis_include_initial( iter_shape_and_strides, iter_src_offset, iter_dst_offset, acc_nd, 
acc_shapes_and_strides, dst_data, host_task_events, all_deps); - sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(acc_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, packed_shapes_and_strides] { - sycl_free_noexcept(packed_shapes_and_strides, ctx); - }); - }); + sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {acc_ev}, packed_shapes_and_strides_owner); host_task_events.push_back(temp_cleanup_ev); return std::make_pair( diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp index 843474a265..b4ec15c96f 100644 --- a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp @@ -35,8 +35,6 @@ #include #include -#include "boolean_advanced_indexing.hpp" -#include "kernels/boolean_advanced_indexing.hpp" #include "simplify_iteration_space.hpp" #include "utils/memory_overlap.hpp" #include "utils/offset_utils.hpp" @@ -44,6 +42,9 @@ #include "utils/sycl_alloc_utils.hpp" #include "utils/type_dispatch.hpp" +#include "boolean_advanced_indexing.hpp" +#include "kernels/boolean_advanced_indexing.hpp" + namespace dpctl { namespace tensor @@ -278,16 +279,14 @@ py_extract(const dpctl::tensor::usm_ndarray &src, [src_typeid]; using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple1 = - device_allocate_and_pack( - exec_q, host_task_events, src_shape_vec, src_strides_vec); - py::ssize_t *packed_src_shape_strides = - std::get<0>(ptr_size_event_tuple1); - if (packed_src_shape_strides == nullptr) { - throw std::runtime_error("Unable to allocated device memory"); - } + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + auto packed_src_shape_strides_owner = + 
std::move(std::get<0>(ptr_size_event_tuple1)); sycl::event copy_src_shape_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_src_shape_strides = + packed_src_shape_strides_owner.get(); std::vector all_deps; all_deps.reserve(depends.size() + 1); @@ -301,14 +300,8 @@ py_extract(const dpctl::tensor::usm_ndarray &src, dst_shape_vec[0], dst_strides_vec[0], all_deps); sycl::event cleanup_tmp_allocations_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(extract_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, packed_src_shape_strides] { - sycl_free_noexcept(packed_src_shape_strides, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {extract_ev}, packed_src_shape_strides_owner); host_task_events.push_back(cleanup_tmp_allocations_ev); } } @@ -370,19 +363,19 @@ py_extract(const dpctl::tensor::usm_ndarray &src, assert(masked_dst_strides.size() == 1); using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple1 = - device_allocate_and_pack( - exec_q, host_task_events, simplified_ortho_shape, - simplified_ortho_src_strides, simplified_ortho_dst_strides, - masked_src_shape, masked_src_strides); - py::ssize_t *packed_shapes_strides = std::get<0>(ptr_size_event_tuple1); - if (packed_shapes_strides == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, simplified_ortho_shape, + simplified_ortho_src_strides, simplified_ortho_dst_strides, + masked_src_shape, masked_src_strides); + auto packed_shapes_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_shapes_strides = + packed_shapes_strides_owner.get(); - py::ssize_t *packed_ortho_src_dst_shape_strides = packed_shapes_strides; - 
py::ssize_t *packed_masked_src_shape_strides = + const py::ssize_t *packed_ortho_src_dst_shape_strides = + packed_shapes_strides; + const py::ssize_t *packed_masked_src_shape_strides = packed_shapes_strides + (3 * ortho_nd); std::vector all_deps; @@ -405,14 +398,8 @@ py_extract(const dpctl::tensor::usm_ndarray &src, masked_dst_shape[0], masked_dst_strides[0], all_deps); sycl::event cleanup_tmp_allocations_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(extract_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, packed_shapes_strides] { - sycl_free_noexcept(packed_shapes_strides, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {extract_ev}, packed_shapes_strides_owner); host_task_events.push_back(cleanup_tmp_allocations_ev); } @@ -601,16 +588,14 @@ py_place(const dpctl::tensor::usm_ndarray &dst, assert(rhs_strides_vec.size() == 1); using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple1 = - device_allocate_and_pack( - exec_q, host_task_events, dst_shape_vec, dst_strides_vec); - py::ssize_t *packed_dst_shape_strides = - std::get<0>(ptr_size_event_tuple1); - if (packed_dst_shape_strides == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, dst_shape_vec, dst_strides_vec); + auto packed_dst_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); sycl::event copy_dst_shape_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_dst_shape_strides = + packed_dst_shape_strides_owner.get(); std::vector all_deps; all_deps.reserve(depends.size() + 1); @@ -624,14 +609,8 @@ py_place(const dpctl::tensor::usm_ndarray &dst, rhs_strides_vec[0], all_deps); sycl::event cleanup_tmp_allocations_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(place_ev); - const auto 
&ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, packed_dst_shape_strides] { - sycl_free_noexcept(packed_dst_shape_strides, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {place_ev}, packed_dst_shape_strides_owner); host_task_events.push_back(cleanup_tmp_allocations_ev); } else { @@ -691,19 +670,19 @@ py_place(const dpctl::tensor::usm_ndarray &dst, assert(masked_rhs_strides.size() == 1); using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple1 = - device_allocate_and_pack( - exec_q, host_task_events, simplified_ortho_shape, - simplified_ortho_dst_strides, simplified_ortho_rhs_strides, - masked_dst_shape, masked_dst_strides); - py::ssize_t *packed_shapes_strides = std::get<0>(ptr_size_event_tuple1); - if (packed_shapes_strides == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, simplified_ortho_shape, + simplified_ortho_dst_strides, simplified_ortho_rhs_strides, + masked_dst_shape, masked_dst_strides); + auto packed_shapes_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_shapes_strides = + packed_shapes_strides_owner.get(); - py::ssize_t *packed_ortho_dst_rhs_shape_strides = packed_shapes_strides; - py::ssize_t *packed_masked_dst_shape_strides = + const py::ssize_t *packed_ortho_dst_rhs_shape_strides = + packed_shapes_strides; + const py::ssize_t *packed_masked_dst_shape_strides = packed_shapes_strides + (3 * ortho_nd); std::vector all_deps; @@ -724,14 +703,8 @@ py_place(const dpctl::tensor::usm_ndarray &dst, masked_rhs_shape[0], masked_rhs_strides[0], all_deps); sycl::event cleanup_tmp_allocations_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(place_ev); - const auto &ctx = exec_q.get_context(); 
- using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, packed_shapes_strides] { - sycl_free_noexcept(packed_shapes_strides, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {place_ev}, packed_shapes_strides_owner); host_task_events.push_back(cleanup_tmp_allocations_ev); } @@ -828,15 +801,12 @@ py_nonzero(const dpctl::tensor::usm_ndarray host_task_events.reserve(2); using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &mask_shape_copying_tuple = - device_allocate_and_pack(exec_q, host_task_events, - mask_shape); - py::ssize_t *src_shape_device_ptr = std::get<0>(mask_shape_copying_tuple); - if (src_shape_device_ptr == nullptr) { - sycl::event::wait(host_task_events); - throw std::runtime_error("Device allocation failed"); - } + auto mask_shape_copying_tuple = device_allocate_and_pack( + exec_q, host_task_events, mask_shape); + auto src_shape_device_owner = + std::move(std::get<0>(mask_shape_copying_tuple)); sycl::event copy_ev = std::get<2>(mask_shape_copying_tuple); + const py::ssize_t *src_shape_device_ptr = src_shape_device_owner.get(); std::vector all_deps; all_deps.reserve(depends.size() + 1); @@ -860,14 +830,9 @@ py_nonzero(const dpctl::tensor::usm_ndarray fn(exec_q, cumsum_sz, nz_elems, ndim, cumsum.get_data(), indexes.get_data(), src_shape_device_ptr, all_deps); - sycl::event temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(non_zero_indexes_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, src_shape_device_ptr] { - sycl_free_noexcept(src_shape_device_ptr, ctx); - }); - }); + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {non_zero_indexes_ev}, src_shape_device_owner); host_task_events.push_back(temporaries_cleanup_ev); sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( diff --git 
a/dpctl/tensor/libtensor/source/clip.cpp b/dpctl/tensor/libtensor/source/clip.cpp index e381ea7b96..1149e26bd1 100644 --- a/dpctl/tensor/libtensor/source/clip.cpp +++ b/dpctl/tensor/libtensor/source/clip.cpp @@ -228,11 +228,10 @@ py_clip(const dpctl::tensor::usm_ndarray &src, // common shape and strides simplified_shape, simplified_src_strides, simplified_min_strides, simplified_max_strides, simplified_dst_strides); - py::ssize_t *packed_shape_strides = std::get<0>(ptr_size_event_tuple); - if (!packed_shape_strides) { - throw std::runtime_error("USM-host memory allocation failure"); - } + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); sycl::event copy_shape_strides_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get(); std::vector all_deps; all_deps.reserve(depends.size() + 1); @@ -246,15 +245,9 @@ py_clip(const dpctl::tensor::usm_ndarray &src, min_offset, max_offset, dst_offset, all_deps); // free packed temporaries - sycl::event temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(clip_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([packed_shape_strides, ctx]() { - sycl_free_noexcept(packed_shape_strides, ctx); - }); - }); - + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {clip_ev}, packed_shape_strides_owner); host_task_events.push_back(temporaries_cleanup_ev); sycl::event arg_cleanup_ev = diff --git a/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp b/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp index 86bb0ac064..4c6946505b 100644 --- a/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp +++ b/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp @@ -251,28 +251,21 @@ copy_usm_ndarray_into_usm_ndarray(const dpctl::tensor::usm_ndarray &src, host_task_events.reserve(2); using 
dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple = device_allocate_and_pack( + auto ptr_size_event_tuple = device_allocate_and_pack( exec_q, host_task_events, simplified_shape, simplified_src_strides, simplified_dst_strides); - py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple); - if (shape_strides == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides = shape_strides_owner.get(); const sycl::event ©_and_cast_generic_ev = copy_and_cast_fn( exec_q, src_nelems, nd, shape_strides, src_data, src_offset, dst_data, dst_offset, depends, {copy_shape_ev}); // async free of shape_strides temporary - const auto &ctx = exec_q.get_context(); - const auto &temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(copy_and_cast_generic_ev); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task( - [ctx, shape_strides]() { sycl_free_noexcept(shape_strides, ctx); }); - }); - + const auto &temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {copy_and_cast_generic_ev}, shape_strides_owner); host_task_events.push_back(temporaries_cleanup_ev); return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), diff --git a/dpctl/tensor/libtensor/source/copy_as_contig.cpp b/dpctl/tensor/libtensor/source/copy_as_contig.cpp index 3eba902e14..04ddef3495 100644 --- a/dpctl/tensor/libtensor/source/copy_as_contig.cpp +++ b/dpctl/tensor/libtensor/source/copy_as_contig.cpp @@ -222,15 +222,12 @@ py_as_c_contig(const dpctl::tensor::usm_ndarray &src, } std::vector host_task_events{}; - const auto &ptr_size_event_tuple = + auto ptr_size_event_tuple = dpctl::tensor::offset_utils::device_allocate_and_pack( exec_q, host_task_events, simplified_shape, 
simplified_src_strides); - - py::ssize_t *shape_stride = std::get<0>(ptr_size_event_tuple); - if (shape_stride == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto shape_stride_owner = std::move(std::get<0>(ptr_size_event_tuple)); const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_stride = shape_stride_owner.get(); auto ascontig_fn = as_c_contig_array_dispatch_vector[src_type_id]; @@ -244,14 +241,9 @@ py_as_c_contig(const dpctl::tensor::usm_ndarray &src, ascontig_fn(exec_q, nelems, nd, shape_stride, src.get_data(), dst.get_data(), all_depends); - const auto &ctx = exec_q.get_context(); - const auto &temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(ascontig_ev); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task( - [ctx, shape_stride]() { sycl_free_noexcept(shape_stride, ctx); }); - }); - + const auto &temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {ascontig_ev}, + shape_stride_owner); host_task_events.push_back(temporaries_cleanup_ev); return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), @@ -358,15 +350,12 @@ py_as_f_contig(const dpctl::tensor::usm_ndarray &src, } std::vector host_task_events{}; - const auto &ptr_size_event_tuple = + auto ptr_size_event_tuple = dpctl::tensor::offset_utils::device_allocate_and_pack( exec_q, host_task_events, simplified_shape, simplified_src_strides); - - py::ssize_t *shape_stride = std::get<0>(ptr_size_event_tuple); - if (shape_stride == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto shape_stride_owner = std::move(std::get<0>(ptr_size_event_tuple)); const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_stride = shape_stride_owner.get(); auto ascontig_fn = as_c_contig_array_dispatch_vector[src_type_id]; @@ -380,14 +369,9 @@ py_as_f_contig(const dpctl::tensor::usm_ndarray 
&src, ascontig_fn(exec_q, nelems, nd, shape_stride, src.get_data(), dst.get_data(), all_depends); - const auto &ctx = exec_q.get_context(); - const auto &temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(ascontig_ev); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task( - [ctx, shape_stride]() { sycl_free_noexcept(shape_stride, ctx); }); - }); - + const auto &temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {ascontig_ev}, + shape_stride_owner); host_task_events.push_back(temporaries_cleanup_ev); return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), @@ -551,13 +535,12 @@ py_as_c_contig_f2c(const dpctl::tensor::usm_ndarray &src, host_task_events.reserve(2); using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple = device_allocate_and_pack( + auto ptr_size_event_tuple = device_allocate_and_pack( exec_q, host_task_events, simplified_shape, simplified_src_strides); - py::ssize_t *packed_shape_strides = std::get<0>(ptr_size_event_tuple); - if (nullptr == packed_shape_strides) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get(); std::vector all_depends; all_depends.reserve(depends.size() + 1); @@ -571,15 +554,9 @@ py_as_c_contig_f2c(const dpctl::tensor::usm_ndarray &src, dst_strides_vec[src_nd - 2], all_depends); // async free of shape_strides temporary - const auto &ctx = exec_q.get_context(); - const auto &temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(ascontig_ev); - - cgh.host_task([ctx, packed_shape_strides]() { - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - sycl_free_noexcept(packed_shape_strides, ctx); - }); - }); + sycl::event 
temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {ascontig_ev}, packed_shape_strides_owner); host_task_events.push_back(temporaries_cleanup_ev); return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), @@ -737,13 +714,12 @@ py_as_f_contig_c2f(const dpctl::tensor::usm_ndarray &src, host_task_events.reserve(2); using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple = device_allocate_and_pack( + auto ptr_size_event_tuple = device_allocate_and_pack( exec_q, host_task_events, simplified_shape, simplified_src_strides); - py::ssize_t *packed_shape_strides = std::get<0>(ptr_size_event_tuple); - if (nullptr == packed_shape_strides) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get(); std::vector all_depends; all_depends.reserve(depends.size() + 1); @@ -756,16 +732,10 @@ py_as_f_contig_c2f(const dpctl::tensor::usm_ndarray &src, n, src.get_data(), src_strides_vec.front(), dst.get_data(), dst_strides_vec[1], all_depends); - // async free of shape_strides temporary - const auto &ctx = exec_q.get_context(); - const auto &temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(ascontig_ev); - - cgh.host_task([ctx, packed_shape_strides]() { - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - sycl_free_noexcept(packed_shape_strides, ctx); - }); - }); + // async free of shape_strides + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {ascontig_ev}, packed_shape_strides_owner); host_task_events.push_back(temporaries_cleanup_ev); return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), diff --git a/dpctl/tensor/libtensor/source/copy_for_reshape.cpp 
b/dpctl/tensor/libtensor/source/copy_for_reshape.cpp index 8fee94dcb0..eb404a4543 100644 --- a/dpctl/tensor/libtensor/source/copy_for_reshape.cpp +++ b/dpctl/tensor/libtensor/source/copy_for_reshape.cpp @@ -133,14 +133,12 @@ copy_usm_ndarray_for_reshape(const dpctl::tensor::usm_ndarray &src, // shape_strides = [src_shape, src_strides, dst_shape, dst_strides] using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple = device_allocate_and_pack( + auto ptr_size_event_tuple = device_allocate_and_pack( exec_q, host_task_events, src_shape, src_strides, dst_shape, dst_strides); - py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple); - if (shape_strides == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } - sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + auto copy_shape_ev = std::get<2>(ptr_size_event_tuple); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const py::ssize_t *shape_strides = shape_strides_owner.get(); const char *src_data = src.get_data(); char *dst_data = dst.get_data(); @@ -153,13 +151,9 @@ copy_usm_ndarray_for_reshape(const dpctl::tensor::usm_ndarray &src, fn(exec_q, src_nelems, src_nd, dst_nd, shape_strides, src_data, dst_data, all_deps); - auto temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(copy_for_reshape_event); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task( - [shape_strides, ctx]() { sycl_free_noexcept(shape_strides, ctx); }); - }); + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {copy_for_reshape_event}, shape_strides_owner); host_task_events.push_back(temporaries_cleanup_ev); diff --git a/dpctl/tensor/libtensor/source/copy_for_roll.cpp b/dpctl/tensor/libtensor/source/copy_for_roll.cpp index 4d72df907b..ef63060b4f 100644 --- a/dpctl/tensor/libtensor/source/copy_for_roll.cpp +++ 
b/dpctl/tensor/libtensor/source/copy_for_roll.cpp @@ -218,15 +218,12 @@ copy_usm_ndarray_for_roll_1d(const dpctl::tensor::usm_ndarray &src, // shape_strides = [src_shape, src_strides, dst_strides] using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple = device_allocate_and_pack( + auto ptr_size_event_tuple = device_allocate_and_pack( exec_q, host_task_events, simplified_shape, simplified_src_strides, simplified_dst_strides); - - py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple); - if (shape_strides == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides = shape_strides_owner.get(); std::vector all_deps(depends.size() + 1); all_deps.push_back(copy_shape_ev); @@ -236,14 +233,9 @@ copy_usm_ndarray_for_roll_1d(const dpctl::tensor::usm_ndarray &src, fn(exec_q, offset, src_nelems, src_nd, shape_strides, src_data, src_offset, dst_data, dst_offset, all_deps); - auto temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(copy_for_roll_event); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task( - [shape_strides, ctx]() { sycl_free_noexcept(shape_strides, ctx); }); - }); - + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {copy_for_roll_event}, shape_strides_owner); host_task_events.push_back(temporaries_cleanup_ev); return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), @@ -349,15 +341,13 @@ copy_usm_ndarray_for_roll_nd(const dpctl::tensor::usm_ndarray &src, // shape_strides = [src_shape, src_strides, dst_strides] using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple = device_allocate_and_pack( + auto ptr_size_event_tuple = 
device_allocate_and_pack( exec_q, host_task_events, common_shape, src_strides, dst_strides, normalized_shifts); - - py::ssize_t *shape_strides_shifts = std::get<0>(ptr_size_event_tuple); - if (shape_strides_shifts == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto shape_strides_shifts_owner = + std::move(std::get<0>(ptr_size_event_tuple)); sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides_shifts = shape_strides_shifts_owner.get(); std::vector all_deps(depends.size() + 1); all_deps.push_back(copy_shape_ev); @@ -367,15 +357,8 @@ copy_usm_ndarray_for_roll_nd(const dpctl::tensor::usm_ndarray &src, fn(exec_q, src_nelems, src_nd, shape_strides_shifts, src_data, src_offset, dst_data, dst_offset, all_deps); - auto temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(copy_for_roll_event); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([shape_strides_shifts, ctx]() { - sycl_free_noexcept(shape_strides_shifts, ctx); - }); - }); - + auto temporaries_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {copy_for_roll_event}, shape_strides_shifts_owner); host_task_events.push_back(temporaries_cleanup_ev); return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), diff --git a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp index 23a392397f..9b7894eb4c 100644 --- a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp +++ b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp @@ -303,14 +303,12 @@ void copy_numpy_ndarray_into_usm_ndarray( // Copy shape strides into device memory using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple = device_allocate_and_pack( + auto ptr_size_event_tuple = 
device_allocate_and_pack( exec_q, host_task_events, simplified_shape, simplified_src_strides, simplified_dst_strides); - py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple); - if (shape_strides == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); const sycl::event &copy_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides = shape_strides_owner.get(); { // release GIL for the blocking call @@ -326,8 +324,8 @@ void copy_numpy_ndarray_into_usm_ndarray( npy_src_min_nelem_offset, npy_src_max_nelem_offset, dst_data, dst_offset, depends, {copy_shape_ev}); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - sycl_free_noexcept(shape_strides, exec_q); + // invoke USM deleter in smart pointer while GIL is held + shape_strides_owner.reset(); } return; } diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/add.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/add.cpp index 9133b2bc26..31a0b7f053 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/add.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/add.cpp @@ -30,13 +30,13 @@ #include #include -#include "add.hpp" -#include "elementwise_functions.hpp" -#include "utils/type_dispatch.hpp" - #include "kernels/elementwise_functions/add.hpp" #include "kernels/elementwise_functions/common.hpp" #include "kernels/elementwise_functions/common_inplace.hpp" +#include "utils/type_dispatch.hpp" + +#include "add.hpp" +#include "elementwise_functions.hpp" namespace py = pybind11; diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp index 7339597d73..c046321006 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp @@
-223,28 +223,21 @@ py_unary_ufunc(const dpctl::tensor::usm_ndarray &src, std::vector host_tasks{}; host_tasks.reserve(2); - const auto &ptr_size_event_triple_ = device_allocate_and_pack( + auto ptr_size_event_triple_ = device_allocate_and_pack( q, host_tasks, simplified_shape, simplified_src_strides, simplified_dst_strides); - py::ssize_t *shape_strides = std::get<0>(ptr_size_event_triple_); - const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_triple_); - - if (shape_strides == nullptr) { - throw std::runtime_error("Device memory allocation failed"); - } + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_triple_)); + const auto ©_shape_ev = std::get<2>(ptr_size_event_triple_); + const py::ssize_t *shape_strides = shape_strides_owner.get(); sycl::event strided_fn_ev = strided_fn(q, src_nelems, nd, shape_strides, src_data, src_offset, dst_data, dst_offset, depends, {copy_shape_ev}); // async free of shape_strides temporary - auto ctx = q.get_context(); - sycl::event tmp_cleanup_ev = q.submit([&](sycl::handler &cgh) { - cgh.depends_on(strided_fn_ev); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task( - [ctx, shape_strides]() { sycl_free_noexcept(shape_strides, ctx); }); - }); + sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + q, {strided_fn_ev}, shape_strides_owner); + host_tasks.push_back(tmp_cleanup_ev); return std::make_pair( @@ -548,31 +541,21 @@ std::pair py_binary_ufunc( } using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_sz_event_triple_ = device_allocate_and_pack( + auto ptr_sz_event_triple_ = device_allocate_and_pack( exec_q, host_tasks, simplified_shape, simplified_src1_strides, simplified_src2_strides, simplified_dst_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_sz_event_triple_)); + auto ©_shape_ev = std::get<2>(ptr_sz_event_triple_); - py::ssize_t *shape_strides = std::get<0>(ptr_sz_event_triple_); - const sycl::event ©_shape_ev = 
std::get<2>(ptr_sz_event_triple_); - - if (shape_strides == nullptr) { - throw std::runtime_error("Unabled to allocate device memory"); - } + const py::ssize_t *shape_strides = shape_strides_owner.get(); sycl::event strided_fn_ev = strided_fn( exec_q, src_nelems, nd, shape_strides, src1_data, src1_offset, src2_data, src2_offset, dst_data, dst_offset, depends, {copy_shape_ev}); // async free of shape_strides temporary - auto ctx = exec_q.get_context(); - - sycl::event tmp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(strided_fn_ev); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task( - [ctx, shape_strides]() { sycl_free_noexcept(shape_strides, ctx); }); - }); - + sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {strided_fn_ev}, shape_strides_owner); host_tasks.push_back(tmp_cleanup_ev); return std::make_pair( @@ -802,30 +785,21 @@ py_binary_inplace_ufunc(const dpctl::tensor::usm_ndarray &lhs, } using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_sz_event_triple_ = device_allocate_and_pack( + auto ptr_sz_event_triple_ = device_allocate_and_pack( exec_q, host_tasks, simplified_shape, simplified_rhs_strides, simplified_lhs_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_sz_event_triple_)); + auto copy_shape_ev = std::get<2>(ptr_sz_event_triple_); - py::ssize_t *shape_strides = std::get<0>(ptr_sz_event_triple_); - const sycl::event ©_shape_ev = std::get<2>(ptr_sz_event_triple_); - - if (shape_strides == nullptr) { - throw std::runtime_error("Unabled to allocate device memory"); - } + const py::ssize_t *shape_strides = shape_strides_owner.get(); sycl::event strided_fn_ev = strided_fn(exec_q, rhs_nelems, nd, shape_strides, rhs_data, rhs_offset, lhs_data, lhs_offset, depends, {copy_shape_ev}); // async free of shape_strides temporary - auto ctx = exec_q.get_context(); - - sycl::event tmp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - 
cgh.depends_on(strided_fn_ev); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task( - [ctx, shape_strides]() { sycl_free_noexcept(shape_strides, ctx); }); - }); + sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {strided_fn_ev}, shape_strides_owner); host_tasks.push_back(tmp_cleanup_ev); diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp index 9b11aa022e..0e3fb38015 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp @@ -28,7 +28,7 @@ #include #include #include -#include +#include // for std::ignore #include #include "dpctl4pybind11.hpp" @@ -379,12 +379,13 @@ py_divide_by_scalar(const dpctl::tensor::usm_ndarray &src, } using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_sz_event_triple_ = device_allocate_and_pack( + auto ptr_sz_event_triple_ = device_allocate_and_pack( exec_q, host_tasks, simplified_shape, simplified_src_strides, simplified_dst_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_sz_event_triple_)); + auto ©_metadata_ev = std::get<2>(ptr_sz_event_triple_); - py::ssize_t *shape_strides = std::get<0>(ptr_sz_event_triple_); - const sycl::event ©_metadata_ev = std::get<2>(ptr_sz_event_triple_); + const py::ssize_t *shape_strides = shape_strides_owner.get(); std::vector all_deps; all_deps.reserve(depends.size() + 1); @@ -392,23 +393,13 @@ py_divide_by_scalar(const dpctl::tensor::usm_ndarray &src, std::copy(depends.begin(), depends.end(), all_deps.begin()); all_deps.push_back(copy_metadata_ev); - if (shape_strides == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } - sycl::event div_ev = fn(exec_q, src_nelems, nd, shape_strides, src_data, src_offset, scalar_alloc, dst_data, dst_offset, all_deps); // async free of shape_strides 
temporary - auto ctx = exec_q.get_context(); - - sycl::event tmp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(div_ev); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task( - [ctx, shape_strides]() { sycl_free_noexcept(shape_strides, ctx); }); - }); + sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {div_ev}, shape_strides_owner); host_tasks.push_back(tmp_cleanup_ev); diff --git a/dpctl/tensor/libtensor/source/full_ctor.cpp b/dpctl/tensor/libtensor/source/full_ctor.cpp index 393455b23f..fe668422a0 100644 --- a/dpctl/tensor/libtensor/source/full_ctor.cpp +++ b/dpctl/tensor/libtensor/source/full_ctor.cpp @@ -261,29 +261,20 @@ usm_ndarray_full(const py::object &py_value, std::vector host_task_events; host_task_events.reserve(2); using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple = - device_allocate_and_pack(exec_q, host_task_events, - dst_shape, dst_strides); - py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple); - if (shape_strides == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, dst_shape, dst_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + py::ssize_t *shape_strides = shape_strides_owner.get(); const sycl::event &full_strided_ev = fn(exec_q, nd, dst_nelems, shape_strides, py_value, dst_data, {copy_shape_ev}); // free shape_strides - const auto &ctx = exec_q.get_context(); const auto &temporaries_cleanup_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(full_strided_ev); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, shape_strides]() { - sycl_free_noexcept(shape_strides, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {full_strided_ev}, 
shape_strides_owner); host_task_events.push_back(temporaries_cleanup_ev); return std::make_pair(keep_args_alive(exec_q, {dst}, host_task_events), diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp index f1790ec6be..5eb54bbe70 100644 --- a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -193,9 +193,12 @@ _populate_kernel_params(sycl::queue &exec_q, device_orthog_sh_st_copy_ev, device_ind_offsets_copy_ev, device_ind_sh_st_copy_ev, device_ind_ptrs_copy_ev}); - cgh.host_task([host_ind_offsets_shp, host_ind_sh_st_shp, - host_ind_ptrs_shp, host_orthog_sh_st_shp, - host_along_sh_st_shp]() {}); + cgh.host_task( + [host_ind_offsets_shp = std::move(host_ind_offsets_shp), + host_ind_sh_st_shp = std::move(host_ind_sh_st_shp), + host_ind_ptrs_shp = std::move(host_ind_ptrs_shp), + host_orthog_sh_st_shp = std::move(host_orthog_sh_st_shp), + host_along_sh_st_shp = std::move(host_along_sh_st_shp)] {}); }); host_task_events.push_back(shared_ptr_cleanup_ev); @@ -424,38 +427,24 @@ usm_ndarray_take(const dpctl::tensor::usm_ndarray &src, return std::make_pair(sycl::event{}, sycl::event{}); } - char **packed_ind_ptrs = sycl::malloc_device(k, exec_q); - - if (packed_ind_ptrs == nullptr) { - throw std::runtime_error( - "Unable to allocate packed_ind_ptrs device memory"); - } + auto packed_ind_ptrs_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + char **packed_ind_ptrs = packed_ind_ptrs_owner.get(); // rearrange to past where indices shapes are checked // packed_ind_shapes_strides = [ind_shape, // ind[0] strides, // ..., // ind[k] strides] + auto packed_ind_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + (k + 1) * ind_sh_elems, exec_q); py::ssize_t *packed_ind_shapes_strides = - sycl::malloc_device((k + 1) * ind_sh_elems, exec_q); - - if (packed_ind_shapes_strides == nullptr) { 
- using dpctl::tensor::alloc_utils::sycl_free_noexcept; - sycl_free_noexcept(packed_ind_ptrs, exec_q); - throw std::runtime_error( - "Unable to allocate packed_ind_shapes_strides device memory"); - } - - py::ssize_t *packed_ind_offsets = - sycl::malloc_device(k, exec_q); + packed_ind_shapes_strides_owner.get(); - if (packed_ind_offsets == nullptr) { - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - sycl_free_noexcept(packed_ind_ptrs, exec_q); - sycl_free_noexcept(packed_ind_shapes_strides, exec_q); - throw std::runtime_error( - "Unable to allocate packed_ind_offsets device memory"); - } + auto packed_ind_offsets_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + py::ssize_t *packed_ind_offsets = packed_ind_offsets_owner.get(); int orthog_sh_elems = std::max(src_nd - k, 1); @@ -463,34 +452,20 @@ usm_ndarray_take(const dpctl::tensor::usm_ndarray &src, // src_strides[:axis] + src_strides[axis+k:], // dst_strides[:axis] + // dst_strides[axis+ind.ndim:]] - py::ssize_t *packed_shapes_strides = - sycl::malloc_device(3 * orthog_sh_elems, exec_q); - - if (packed_shapes_strides == nullptr) { - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - sycl_free_noexcept(packed_ind_ptrs, exec_q); - sycl_free_noexcept(packed_ind_shapes_strides, exec_q); - sycl_free_noexcept(packed_ind_offsets, exec_q); - throw std::runtime_error( - "Unable to allocate packed_shapes_strides device memory"); - } + auto packed_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 3 * orthog_sh_elems, exec_q); + py::ssize_t *packed_shapes_strides = packed_shapes_strides_owner.get(); // packed_axes_shapes_strides = [src_shape[axis:axis+k], // src_strides[axis:axis+k], // dst_shape[axis:axis+ind.ndim], // dst_strides[axis:axis+ind.ndim]] + auto packed_axes_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 2 * (k + ind_sh_elems), exec_q); py::ssize_t *packed_axes_shapes_strides = - sycl::malloc_device(2 * (k + ind_sh_elems), 
exec_q); - - if (packed_axes_shapes_strides == nullptr) { - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - sycl_free_noexcept(packed_ind_ptrs, exec_q); - sycl_free_noexcept(packed_ind_shapes_strides, exec_q); - sycl_free_noexcept(packed_ind_offsets, exec_q); - sycl_free_noexcept(packed_shapes_strides, exec_q); - throw std::runtime_error( - "Unable to allocate packed_axes_shapes_strides device memory"); - } + packed_axes_shapes_strides_owner.get(); auto src_strides = src.get_strides_vector(); auto dst_strides = dst.get_strides_vector(); @@ -515,12 +490,6 @@ usm_ndarray_take(const dpctl::tensor::usm_ndarray &src, if (fn == nullptr) { sycl::event::wait(host_task_events); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - sycl_free_noexcept(packed_ind_ptrs, exec_q); - sycl_free_noexcept(packed_ind_shapes_strides, exec_q); - sycl_free_noexcept(packed_ind_offsets, exec_q); - sycl_free_noexcept(packed_shapes_strides, exec_q); - sycl_free_noexcept(packed_axes_shapes_strides, exec_q); throw std::runtime_error("Indices must be integer type, got " + std::to_string(ind_type_id)); } @@ -532,21 +501,11 @@ usm_ndarray_take(const dpctl::tensor::usm_ndarray &src, src_offset, dst_offset, packed_ind_offsets, all_deps); // free packed temporaries - sycl::event temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(take_generic_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([packed_shapes_strides, packed_axes_shapes_strides, - packed_ind_shapes_strides, packed_ind_ptrs, - packed_ind_offsets, ctx]() { - sycl_free_noexcept(packed_shapes_strides, ctx); - sycl_free_noexcept(packed_axes_shapes_strides, ctx); - sycl_free_noexcept(packed_ind_shapes_strides, ctx); - sycl_free_noexcept(packed_ind_ptrs, ctx); - sycl_free_noexcept(packed_ind_offsets, ctx); - }); - }); - + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {take_generic_ev}, 
packed_shapes_strides_owner, + packed_axes_shapes_strides_owner, packed_ind_shapes_strides_owner, + packed_ind_ptrs_owner, packed_ind_offsets_owner); host_task_events.push_back(temporaries_cleanup_ev); sycl::event arg_cleanup_ev = @@ -738,37 +697,23 @@ usm_ndarray_put(const dpctl::tensor::usm_ndarray &dst, return std::make_pair(sycl::event{}, sycl::event{}); } - char **packed_ind_ptrs = sycl::malloc_device(k, exec_q); - - if (packed_ind_ptrs == nullptr) { - throw std::runtime_error( - "Unable to allocate packed_ind_ptrs device memory"); - } + auto packed_ind_ptrs_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + char **packed_ind_ptrs = packed_ind_ptrs_owner.get(); // packed_ind_shapes_strides = [ind_shape, // ind[0] strides, // ..., // ind[k] strides] + auto packed_ind_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + (k + 1) * ind_sh_elems, exec_q); py::ssize_t *packed_ind_shapes_strides = - sycl::malloc_device((k + 1) * ind_sh_elems, exec_q); - - if (packed_ind_shapes_strides == nullptr) { - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - sycl_free_noexcept(packed_ind_ptrs, exec_q); - throw std::runtime_error( - "Unable to allocate packed_ind_shapes_strides device memory"); - } - - py::ssize_t *packed_ind_offsets = - sycl::malloc_device(k, exec_q); + packed_ind_shapes_strides_owner.get(); - if (packed_ind_offsets == nullptr) { - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - sycl_free_noexcept(packed_ind_ptrs, exec_q); - sycl_free_noexcept(packed_ind_shapes_strides, exec_q); - throw std::runtime_error( - "Unable to allocate packed_ind_offsets device memory"); - } + auto packed_ind_offsets_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + py::ssize_t *packed_ind_offsets = packed_ind_offsets_owner.get(); int orthog_sh_elems = std::max(dst_nd - k, 1); @@ -776,34 +721,20 @@ usm_ndarray_put(const dpctl::tensor::usm_ndarray &dst, // dst_strides[:axis] + dst_strides[axis+k:], // 
val_strides[:axis] + // val_strides[axis+ind.ndim:]] - py::ssize_t *packed_shapes_strides = - sycl::malloc_device(3 * orthog_sh_elems, exec_q); - - if (packed_shapes_strides == nullptr) { - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - sycl_free_noexcept(packed_ind_ptrs, exec_q); - sycl_free_noexcept(packed_ind_shapes_strides, exec_q); - sycl_free_noexcept(packed_ind_offsets, exec_q); - throw std::runtime_error( - "Unable to allocate packed_shapes_strides device memory"); - } + auto packed_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 3 * orthog_sh_elems, exec_q); + py::ssize_t *packed_shapes_strides = packed_shapes_strides_owner.get(); // packed_axes_shapes_strides = [dst_shape[axis:axis+k], // dst_strides[axis:axis+k], // val_shape[axis:axis+ind.ndim], // val_strides[axis:axis+ind.ndim]] + auto packed_axes_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 2 * (k + ind_sh_elems), exec_q); py::ssize_t *packed_axes_shapes_strides = - sycl::malloc_device(2 * (k + ind_sh_elems), exec_q); - - if (packed_axes_shapes_strides == nullptr) { - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - sycl_free_noexcept(packed_ind_ptrs, exec_q); - sycl_free_noexcept(packed_ind_shapes_strides, exec_q); - sycl_free_noexcept(packed_ind_offsets, exec_q); - sycl_free_noexcept(packed_shapes_strides, exec_q); - throw std::runtime_error( - "Unable to allocate packed_axes_shapes_strides device memory"); - } + packed_axes_shapes_strides_owner.get(); auto dst_strides = dst.get_strides_vector(); auto val_strides = val.get_strides_vector(); @@ -828,12 +759,6 @@ usm_ndarray_put(const dpctl::tensor::usm_ndarray &dst, if (fn == nullptr) { sycl::event::wait(host_task_events); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - sycl_free_noexcept(packed_ind_ptrs, exec_q); - sycl_free_noexcept(packed_ind_shapes_strides, exec_q); - sycl_free_noexcept(packed_ind_offsets, exec_q); - sycl_free_noexcept(packed_shapes_strides, 
exec_q); - sycl_free_noexcept(packed_axes_shapes_strides, exec_q); throw std::runtime_error("Indices must be integer type, got " + std::to_string(ind_type_id)); } @@ -845,21 +770,11 @@ usm_ndarray_put(const dpctl::tensor::usm_ndarray &dst, dst_offset, val_offset, packed_ind_offsets, all_deps); // free packed temporaries - sycl::event temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(put_generic_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([packed_shapes_strides, packed_axes_shapes_strides, - packed_ind_shapes_strides, packed_ind_ptrs, - packed_ind_offsets, ctx]() { - sycl_free_noexcept(packed_shapes_strides, ctx); - sycl_free_noexcept(packed_axes_shapes_strides, ctx); - sycl_free_noexcept(packed_ind_shapes_strides, ctx); - sycl_free_noexcept(packed_ind_ptrs, ctx); - sycl_free_noexcept(packed_ind_offsets, ctx); - }); - }); - + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {put_generic_ev}, packed_shapes_strides_owner, + packed_axes_shapes_strides_owner, packed_ind_shapes_strides_owner, + packed_ind_ptrs_owner, packed_ind_offsets_owner); host_task_events.push_back(temporaries_cleanup_ev); sycl::event arg_cleanup_ev = diff --git a/dpctl/tensor/libtensor/source/linalg_functions/dot.cpp b/dpctl/tensor/libtensor/source/linalg_functions/dot.cpp index bb79a19789..ce267baa1b 100644 --- a/dpctl/tensor/libtensor/source/linalg_functions/dot.cpp +++ b/dpctl/tensor/libtensor/source/linalg_functions/dot.cpp @@ -477,7 +477,7 @@ py_dot(const dpctl::tensor::usm_ndarray &x1, } using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &arrays_metainfo_packing_triple_ = + auto arrays_metainfo_packing_triple_ = device_allocate_and_pack( exec_q, host_task_events, // iteration metadata @@ -486,16 +486,14 @@ py_dot(const dpctl::tensor::usm_ndarray &x1, // reduction metadata simplified_inner_shape, 
simplified_inner_x1_strides, simplified_inner_x2_strides); - py::ssize_t *temp_allocation_ptr = - std::get<0>(arrays_metainfo_packing_triple_); - if (temp_allocation_ptr == nullptr) { - throw std::runtime_error("Unable to allocate memory on device"); - } + auto tmp_alloc_owner = + std::move(std::get<0>(arrays_metainfo_packing_triple_)); const auto ©_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_); + const py::ssize_t *temp_allocation_ptr = tmp_alloc_owner.get(); - py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; - py::ssize_t *inner_shape_stride = + const py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; + const py::ssize_t *inner_shape_stride = temp_allocation_ptr + 4 * simplified_batch_shape.size(); std::vector all_deps; @@ -511,14 +509,9 @@ py_dot(const dpctl::tensor::usm_ndarray &x1, inner_nd, // number dimensions being reduced inner_shape_stride, inner_x1_offset, inner_x2_offset, all_deps); - sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(dot_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, temp_allocation_ptr] { - sycl_free_noexcept(temp_allocation_ptr, ctx); - }); - }); + sycl::event temp_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {dot_ev}, + tmp_alloc_owner); host_task_events.push_back(temp_cleanup_ev); } else { // if (!call_vecdot) @@ -557,18 +550,16 @@ py_dot(const dpctl::tensor::usm_ndarray &x1, } } using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple1 = - device_allocate_and_pack( - exec_q, host_task_events, x1_shape_vec, x1_strides_vec, - x2_shape_vec, x2_strides_vec, dst_shape_vec, - dst_strides_vec); - py::ssize_t *packed_shapes_strides = - std::get<0>(ptr_size_event_tuple1); - if (packed_shapes_strides == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto ptr_size_event_tuple1 = device_allocate_and_pack( 
+ exec_q, host_task_events, x1_shape_vec, x1_strides_vec, + x2_shape_vec, x2_strides_vec, dst_shape_vec, dst_strides_vec); + auto packed_shapes_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_shapes_strides = + packed_shapes_strides_owner.get(); + const py::ssize_t *x1_shape_strides = packed_shapes_strides; const py::ssize_t *x2_shape_strides = packed_shapes_strides + 2 * (x1_nd); @@ -588,14 +579,8 @@ py_dot(const dpctl::tensor::usm_ndarray &x1, x1_outer_dims + x2_outer_dims, dst_shape_strides, all_deps); sycl::event cleanup_tmp_allocations_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(dot_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, packed_shapes_strides] { - sycl_free_noexcept(packed_shapes_strides, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {dot_ev}, packed_shapes_strides_owner); host_task_events.push_back(cleanup_tmp_allocations_ev); } else { // if (call_batched) @@ -751,25 +736,23 @@ py_dot(const dpctl::tensor::usm_ndarray &x1, } } using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple1 = - device_allocate_and_pack( - exec_q, host_task_events, simplified_batch_shape, - simplified_batch_x1_strides, simplified_batch_x2_strides, - simplified_batch_dst_strides, outer_inner_x1_shape, - outer_inner_x1_strides, outer_inner_x2_shape, - outer_inner_x2_strides, outer_inner_dst_shape, - outer_inner_dst_strides, - // full shape and strides of the result array - // necessary for reduction and initialization - simplified_batch_shape, outer_inner_dst_shape, - simplified_batch_dst_strides, outer_inner_dst_strides); - py::ssize_t *packed_shapes_strides = - std::get<0>(ptr_size_event_tuple1); - if (packed_shapes_strides == nullptr) { - throw std::runtime_error("Unable to allocate device 
memory"); - } + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, simplified_batch_shape, + simplified_batch_x1_strides, simplified_batch_x2_strides, + simplified_batch_dst_strides, outer_inner_x1_shape, + outer_inner_x1_strides, outer_inner_x2_shape, + outer_inner_x2_strides, outer_inner_dst_shape, + outer_inner_dst_strides, + // full shape and strides of the result array + // necessary for reduction and initialization + simplified_batch_shape, outer_inner_dst_shape, + simplified_batch_dst_strides, outer_inner_dst_strides); + auto packed_shapes_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_shapes_strides = + packed_shapes_strides_owner.get(); const auto batch_shape_strides = packed_shapes_strides; const auto x1_outer_inner_shapes_strides = @@ -799,14 +782,8 @@ py_dot(const dpctl::tensor::usm_ndarray &x1, dst_outer_shapes_strides, dst_full_shape_strides, all_deps); sycl::event cleanup_tmp_allocations_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(dot_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, packed_shapes_strides] { - sycl_free_noexcept(packed_shapes_strides, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {dot_ev}, packed_shapes_strides_owner); host_task_events.push_back(cleanup_tmp_allocations_ev); } } diff --git a/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp b/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp index f5d70e4f2a..9458374482 100644 --- a/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp +++ b/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp @@ -459,7 +459,7 @@ std::pair py_reduction_over_axis( std::vector host_task_events{}; using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto 
&arrays_metainfo_packing_triple_ = + auto arrays_metainfo_packing_triple_ = device_allocate_and_pack( exec_q, host_task_events, // iteration metadata @@ -467,15 +467,13 @@ std::pair py_reduction_over_axis( simplified_iteration_dst_strides, // reduction metadata simplified_reduction_shape, simplified_reduction_src_strides); - py::ssize_t *temp_allocation_ptr = - std::get<0>(arrays_metainfo_packing_triple_); - if (temp_allocation_ptr == nullptr) { - throw std::runtime_error("Unable to allocate memory on device"); - } + auto tmp_alloc_owner = + std::move(std::get<0>(arrays_metainfo_packing_triple_)); const auto ©_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_); + const py::ssize_t *temp_allocation_ptr = tmp_alloc_owner.get(); - py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; - py::ssize_t *reduction_shape_stride = + const py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; + const py::ssize_t *reduction_shape_stride = temp_allocation_ptr + 3 * simplified_iteration_shape.size(); std::vector all_deps; @@ -491,14 +489,8 @@ std::pair py_reduction_over_axis( reduction_nd, // number dimensions being reduced reduction_shape_stride, reduction_src_offset, all_deps); - sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(reduction_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, temp_allocation_ptr] { - sycl_free_noexcept(temp_allocation_ptr, ctx); - }); - }); + sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {reduction_ev}, tmp_alloc_owner); host_task_events.push_back(temp_cleanup_ev); sycl::event keep_args_event = @@ -750,7 +742,7 @@ std::pair py_tree_reduction_over_axis( std::vector host_task_events{}; using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &arrays_metainfo_packing_triple_ = + auto arrays_metainfo_packing_triple_ = device_allocate_and_pack( exec_q, host_task_events, 
// iteration metadata @@ -758,15 +750,12 @@ std::pair py_tree_reduction_over_axis( simplified_iteration_dst_strides, // reduction metadata simplified_reduction_shape, simplified_reduction_src_strides); - py::ssize_t *temp_allocation_ptr = - std::get<0>(arrays_metainfo_packing_triple_); - if (temp_allocation_ptr == nullptr) { - throw std::runtime_error("Unable to allocate memory on device"); - } + auto tmp_owner = std::move(std::get<0>(arrays_metainfo_packing_triple_)); const auto ©_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_); + const py::ssize_t *temp_allocation_ptr = tmp_owner.get(); - py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; - py::ssize_t *reduction_shape_stride = + const py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; + const py::ssize_t *reduction_shape_stride = temp_allocation_ptr + 3 * simplified_iteration_shape.size(); std::vector all_deps; @@ -782,14 +771,8 @@ std::pair py_tree_reduction_over_axis( reduction_nd, // number dimensions being reduced reduction_shape_stride, reduction_src_offset, all_deps); - sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(reduction_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, temp_allocation_ptr] { - sycl_free_noexcept(temp_allocation_ptr, ctx); - }); - }); + sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {reduction_ev}, tmp_owner); host_task_events.push_back(temp_cleanup_ev); sycl::event keep_args_event = @@ -1032,7 +1015,7 @@ std::pair py_search_over_axis( using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &arrays_metainfo_packing_triple_ = + auto arrays_metainfo_packing_triple_ = device_allocate_and_pack( exec_q, host_task_events, // iteration metadata @@ -1040,15 +1023,12 @@ std::pair py_search_over_axis( simplified_iteration_dst_strides, // reduction metadata compact_reduction_shape, 
compact_reduction_src_strides); - py::ssize_t *temp_allocation_ptr = - std::get<0>(arrays_metainfo_packing_triple_); - if (temp_allocation_ptr == nullptr) { - throw std::runtime_error("Unable to allocate memory on device"); - } + auto tmp_owner = std::move(std::get<0>(arrays_metainfo_packing_triple_)); const auto ©_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_); + const py::ssize_t *temp_allocation_ptr = tmp_owner.get(); - py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; - py::ssize_t *reduction_shape_stride = + const py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; + const py::ssize_t *reduction_shape_stride = temp_allocation_ptr + 3 * simplified_iteration_shape.size(); std::vector all_deps; @@ -1063,14 +1043,8 @@ std::pair py_search_over_axis( reduction_nd, // number dimensions being reduced reduction_shape_stride, reduction_src_offset, all_deps); - sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(comp_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, temp_allocation_ptr] { - sycl_free_noexcept(temp_allocation_ptr, ctx); - }); - }); + sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {comp_ev}, tmp_owner); host_task_events.push_back(temp_cleanup_ev); sycl::event keep_args_event = @@ -1301,21 +1275,20 @@ py_boolean_reduction(const dpctl::tensor::usm_ndarray &src, auto fn = strided_dispatch_vector[src_typeid]; std::vector host_task_events{}; - const auto &iter_red_metadata_packing_triple_ = + auto iter_red_metadata_packing_triple_ = dpctl::tensor::offset_utils::device_allocate_and_pack( exec_q, host_task_events, simplified_iter_shape, simplified_iter_src_strides, simplified_iter_dst_strides, simplified_red_shape, simplified_red_src_strides); - py::ssize_t *packed_shapes_and_strides = - std::get<0>(iter_red_metadata_packing_triple_); - if (packed_shapes_and_strides == nullptr) { - throw 
std::runtime_error("Unable to allocate memory on device"); - } + auto packed_shapes_strides_owner = + std::move(std::get<0>(iter_red_metadata_packing_triple_)); const auto ©_metadata_ev = std::get<2>(iter_red_metadata_packing_triple_); + const py::ssize_t *packed_shapes_and_strides = + packed_shapes_strides_owner.get(); - py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides; - py::ssize_t *red_shape_stride = + const py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides; + const py::ssize_t *red_shape_stride = packed_shapes_and_strides + 3 * simplified_iter_shape.size(); std::vector all_deps; @@ -1329,14 +1302,8 @@ py_boolean_reduction(const dpctl::tensor::usm_ndarray &src, iter_shape_and_strides, iter_src_offset, iter_dst_offset, simplified_red_nd, red_shape_stride, red_src_offset, all_deps); - sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(red_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, packed_shapes_and_strides] { - sycl_free_noexcept(packed_shapes_and_strides, ctx); - }); - }); + sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {red_ev}, packed_shapes_strides_owner); host_task_events.push_back(temp_cleanup_ev); sycl::event keep_args_event = diff --git a/dpctl/tensor/libtensor/source/repeat.cpp b/dpctl/tensor/libtensor/source/repeat.cpp index f0df192876..25146eac88 100644 --- a/dpctl/tensor/libtensor/source/repeat.cpp +++ b/dpctl/tensor/libtensor/source/repeat.cpp @@ -35,13 +35,14 @@ #include #include "kernels/repeat.hpp" -#include "simplify_iteration_space.hpp" #include "utils/memory_overlap.hpp" #include "utils/offset_utils.hpp" #include "utils/output_validation.hpp" #include "utils/sycl_alloc_utils.hpp" #include "utils/type_dispatch.hpp" +#include "simplify_iteration_space.hpp" + namespace dpctl { namespace tensor @@ -239,15 +240,13 @@ py_repeat_by_sequence(const 
dpctl::tensor::usm_ndarray &src, } using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple1 = - device_allocate_and_pack( - exec_q, host_task_events, src_shape_vec, src_strides_vec); - py::ssize_t *packed_src_shape_strides = - std::get<0>(ptr_size_event_tuple1); - if (packed_src_shape_strides == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + auto packed_src_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_src_shape_strides = + packed_src_shape_strides_owner.get(); std::vector all_deps; all_deps.reserve(depends.size() + 1); @@ -263,14 +262,8 @@ py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, reps_strides_vec[0], all_deps); sycl::event cleanup_tmp_allocations_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(repeat_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, packed_src_shape_strides] { - sycl_free_noexcept(packed_src_shape_strides, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, packed_src_shape_strides_owner); host_task_events.push_back(cleanup_tmp_allocations_ev); } else { @@ -318,15 +311,14 @@ py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, orthog_dst_offset); using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple1 = - device_allocate_and_pack( - exec_q, host_task_events, simplified_orthog_shape, - simplified_orthog_src_strides, simplified_orthog_dst_strides); - py::ssize_t *packed_shapes_strides = std::get<0>(ptr_size_event_tuple1); - if (packed_shapes_strides == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto 
ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, simplified_orthog_shape, + simplified_orthog_src_strides, simplified_orthog_dst_strides); + auto packed_shapes_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_shapes_strides = + packed_shapes_strides_owner.get(); std::vector all_deps; all_deps.reserve(depends.size() + 1); @@ -348,14 +340,8 @@ py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, reps_shape_vec[0], reps_strides_vec[0], all_deps); sycl::event cleanup_tmp_allocations_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(repeat_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, packed_shapes_strides] { - sycl_free_noexcept(packed_shapes_strides, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, packed_shapes_strides_owner); host_task_events.push_back(cleanup_tmp_allocations_ev); } @@ -475,13 +461,13 @@ py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, auto fn = repeat_by_sequence_1d_dispatch_vector[src_typeid]; using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple1 = device_allocate_and_pack( + auto ptr_size_event_tuple1 = device_allocate_and_pack( exec_q, host_task_events, src_shape_vec, src_strides_vec); - py::ssize_t *packed_src_shapes_strides = std::get<0>(ptr_size_event_tuple1); - if (packed_src_shapes_strides == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto packed_src_shapes_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_src_shapes_strides = + packed_src_shapes_strides_owner.get(); std::vector all_deps; all_deps.reserve(depends.size() + 1); @@ -496,14 +482,8 @@ 
py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, reps_shape_vec[0], reps_strides_vec[0], all_deps); sycl::event cleanup_tmp_allocations_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(repeat_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, packed_src_shapes_strides] { - sycl_free_noexcept(packed_src_shapes_strides, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, packed_src_shapes_strides_owner); host_task_events.push_back(cleanup_tmp_allocations_ev); sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( @@ -617,15 +597,13 @@ py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, } using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple1 = - device_allocate_and_pack( - exec_q, host_task_events, src_shape_vec, src_strides_vec); - py::ssize_t *packed_src_shape_strides = - std::get<0>(ptr_size_event_tuple1); - if (packed_src_shape_strides == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + auto packed_src_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_src_shape_strides = + packed_src_shape_strides_owner.get(); std::vector all_deps; all_deps.reserve(depends.size() + 1); @@ -639,14 +617,9 @@ py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, dst_strides_vec[0], all_deps); sycl::event cleanup_tmp_allocations_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(repeat_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, packed_src_shape_strides] { - sycl_free_noexcept(packed_src_shape_strides, ctx); - }); - 
}); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, packed_src_shape_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); } else { @@ -695,15 +668,14 @@ py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, orthog_dst_offset); using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple1 = - device_allocate_and_pack( - exec_q, host_task_events, simplified_orthog_shape, - simplified_orthog_src_strides, simplified_orthog_dst_strides); - py::ssize_t *packed_shapes_strides = std::get<0>(ptr_size_event_tuple1); - if (packed_shapes_strides == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, simplified_orthog_shape, + simplified_orthog_src_strides, simplified_orthog_dst_strides); + auto packed_shapes_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_shapes_strides = + packed_shapes_strides_owner.get(); std::vector all_deps; all_deps.reserve(depends.size() + 1); @@ -723,14 +695,8 @@ py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, axis_dst_shape[0], axis_dst_stride[0], all_deps); sycl::event cleanup_tmp_allocations_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(repeat_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, packed_shapes_strides] { - sycl_free_noexcept(packed_shapes_strides, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, packed_shapes_strides_owner); host_task_events.push_back(cleanup_tmp_allocations_ev); } @@ -814,13 +780,13 @@ py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, auto fn = repeat_by_scalar_1d_dispatch_vector[src_typeid]; using dpctl::tensor::offset_utils::device_allocate_and_pack; - const 
auto &ptr_size_event_tuple1 = device_allocate_and_pack( + auto ptr_size_event_tuple1 = device_allocate_and_pack( exec_q, host_task_events, src_shape_vec, src_strides_vec); - py::ssize_t *packed_src_shape_strides = std::get<0>(ptr_size_event_tuple1); - if (packed_src_shape_strides == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto packed_src_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_src_shape_strides = + packed_src_shape_strides_owner.get(); std::vector all_deps; all_deps.reserve(depends.size() + 1); @@ -834,15 +800,8 @@ py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, dst_shape_vec[0], dst_strides_vec[0], all_deps); sycl::event cleanup_tmp_allocations_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(repeat_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, packed_src_shape_strides] { - sycl_free_noexcept(packed_src_shape_strides, ctx); - }); - }); - + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, packed_src_shape_strides_owner); host_task_events.push_back(cleanup_tmp_allocations_ev); sycl::event py_obj_management_host_task_ev = diff --git a/dpctl/tensor/libtensor/source/sorting/searchsorted.cpp b/dpctl/tensor/libtensor/source/sorting/searchsorted.cpp index 76cb41595d..174214e4c9 100644 --- a/dpctl/tensor/libtensor/source/sorting/searchsorted.cpp +++ b/dpctl/tensor/libtensor/source/sorting/searchsorted.cpp @@ -376,14 +376,11 @@ py_searchsorted(const dpctl::tensor::usm_ndarray &hay, // vectors being packed simplified_common_shape, simplified_needles_strides, simplified_positions_strides); - - py::ssize_t *packed_shape_strides = std::get<0>(ptr_size_event_tuple); + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); const sycl::event ©_shape_strides_ev = 
std::get<2>(ptr_size_event_tuple); - - if (!packed_shape_strides) { - throw std::runtime_error("USM-host allocation failure"); - } + const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get(); std::vector all_deps; all_deps.reserve(depends.size() + 1); @@ -411,14 +408,9 @@ py_searchsorted(const dpctl::tensor::usm_ndarray &hay, simplified_nd, packed_shape_strides, all_deps); // free packed temporaries - sycl::event temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(comp_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([packed_shape_strides, ctx]() { - sycl_free_noexcept(packed_shape_strides, ctx); - }); - }); + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {comp_ev}, packed_shape_strides_owner); host_task_events.push_back(temporaries_cleanup_ev); const sycl::event &ht_ev = dpctl::utils::keep_args_alive( diff --git a/dpctl/tensor/libtensor/source/triul_ctor.cpp b/dpctl/tensor/libtensor/source/triul_ctor.cpp index 264de8f36d..24bf7b322f 100644 --- a/dpctl/tensor/libtensor/source/triul_ctor.cpp +++ b/dpctl/tensor/libtensor/source/triul_ctor.cpp @@ -176,11 +176,11 @@ usm_ndarray_triul(sycl::queue &exec_q, (*shp_host_shape_and_strides)[3 * nd - 2] = dst_strides[src_nd - 2]; (*shp_host_shape_and_strides)[3 * nd - 1] = dst_strides[src_nd - 1]; - py::ssize_t *dev_shape_and_strides = - sycl::malloc_device(3 * nd, exec_q); - if (dev_shape_and_strides == nullptr) { - throw std::runtime_error("Unabled to allocate device memory"); - } + auto dev_shape_and_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(3 * nd, + exec_q); + py::ssize_t *dev_shape_and_strides = dev_shape_and_strides_owner.get(); + const sycl::event ©_shape_and_strides = exec_q.copy( shp_host_shape_and_strides->data(), dev_shape_and_strides, 3 * nd); @@ -212,6 +212,9 @@ usm_ndarray_triul(sycl::queue &exec_q, 
sycl_free_noexcept(dev_shape_and_strides, ctx); }); }); + // since host_task now owns USM allocation, release ownership by smart + // pointer + dev_shape_and_strides_owner.release(); return std::make_pair( keep_args_alive(exec_q, {src, dst}, {temporaries_cleanup_ev}), tri_ev); diff --git a/dpctl/tensor/libtensor/source/where.cpp b/dpctl/tensor/libtensor/source/where.cpp index 3a50eb309c..9825b65901 100644 --- a/dpctl/tensor/libtensor/source/where.cpp +++ b/dpctl/tensor/libtensor/source/where.cpp @@ -27,21 +27,23 @@ #include #include #include -#include #include +#include + #include "dpctl4pybind11.hpp" #include #include #include #include "kernels/where.hpp" -#include "simplify_iteration_space.hpp" #include "utils/memory_overlap.hpp" #include "utils/offset_utils.hpp" #include "utils/output_validation.hpp" #include "utils/sycl_alloc_utils.hpp" #include "utils/type_dispatch.hpp" + +#include "simplify_iteration_space.hpp" #include "where.hpp" namespace dpctl @@ -211,11 +213,10 @@ py_where(const dpctl::tensor::usm_ndarray &condition, // common shape and strides simplified_shape, simplified_cond_strides, simplified_x1_strides, simplified_x2_strides, simplified_dst_strides); - py::ssize_t *packed_shape_strides = std::get<0>(ptr_size_event_tuple); - if (!packed_shape_strides) { - throw std::runtime_error("USM-host memory allocation failure"); - } + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); sycl::event copy_shape_strides_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get(); std::vector all_deps; all_deps.reserve(depends.size() + 1); @@ -229,15 +230,9 @@ py_where(const dpctl::tensor::usm_ndarray &condition, x1_offset, x2_offset, dst_offset, all_deps); // free packed temporaries - sycl::event temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(where_ev); - const auto &ctx = exec_q.get_context(); - using 
dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([packed_shape_strides, ctx]() { - sycl_free_noexcept(packed_shape_strides, ctx); - }); - }); - + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {where_ev}, packed_shape_strides_owner); host_task_events.push_back(temporaries_cleanup_ev); sycl::event arg_cleanup_ev = From 065413e947d86f37c5bacb447e28b17cb42abfbf Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 9 Jan 2025 16:35:02 -0600 Subject: [PATCH 11/11] Merge _sorting_impl and _sorting_radix_impl modules Doing so reduces the binary size. Previously, '_tensor_sorting_impl' module has size 22'448'920 bytes, and '_tensor_sorting_radix_impl' has size 31'927'256 bytes. Total size was 54'376'176 bytes. After this change, the total size of the new '_tensor_sorting_impl' is 49'790'872, which is about 4Mb of savings. --- dpctl/tensor/CMakeLists.txt | 16 ++-------------- dpctl/tensor/_sorting.py | 8 +++----- dpctl/tensor/libtensor/source/sorting/topk.cpp | 5 +++-- dpctl/tensor/libtensor/source/tensor_sorting.cpp | 4 ++++ 4 files changed, 12 insertions(+), 21 deletions(-) diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index 75976f1b1f..e7d3896680 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -114,12 +114,10 @@ set(_reduction_sources set(_sorting_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_sort.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_argsort.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/searchsorted.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/topk.cpp -) -set(_sorting_radix_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_sort.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_argsort.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/searchsorted.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/topk.cpp ) set(_static_lib_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp @@ -156,10 +154,6 @@ set(_tensor_sorting_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_sorting.cpp ${_sorting_sources} ) -set(_tensor_sorting_radix_impl_sources - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_sorting_radix.cpp - ${_sorting_radix_sources} -) set(_linalg_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linalg_functions/dot.cpp @@ -214,12 +208,6 @@ add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_sorting_impl_s target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt}) list(APPEND _py_trgts ${python_module_name}) -set(python_module_name _tensor_sorting_radix_impl) -pybind11_add_module(${python_module_name} MODULE ${_tensor_sorting_radix_impl_sources}) -add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_sorting_radix_impl_sources}) -target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt}) -list(APPEND _py_trgts ${python_module_name}) - set(python_module_name _tensor_linalg_impl) pybind11_add_module(${python_module_name} MODULE ${_tensor_linalg_impl_sources}) add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_linalg_impl_sources}) diff --git a/dpctl/tensor/_sorting.py b/dpctl/tensor/_sorting.py index 9354e383db..8ac623da38 100644 --- a/dpctl/tensor/_sorting.py +++ b/dpctl/tensor/_sorting.py @@ -25,16 +25,14 @@ from ._tensor_sorting_impl import ( _argsort_ascending, _argsort_descending, - _sort_ascending, - _sort_descending, - _topk, -) -from ._tensor_sorting_radix_impl import ( _radix_argsort_ascending, _radix_argsort_descending, _radix_sort_ascending, _radix_sort_descending, _radix_sort_dtype_supported, + _sort_ascending, + _sort_descending, + _topk, ) __all__ = ["sort", "argsort"] diff 
--git a/dpctl/tensor/libtensor/source/sorting/topk.cpp b/dpctl/tensor/libtensor/source/sorting/topk.cpp index 84108fd6cd..caef97f4c4 100644 --- a/dpctl/tensor/libtensor/source/sorting/topk.cpp +++ b/dpctl/tensor/libtensor/source/sorting/topk.cpp @@ -31,18 +31,19 @@ #include #include +#include + #include "dpctl4pybind11.hpp" #include #include -#include +#include "kernels/sorting/topk.hpp" #include "utils/math_utils.hpp" #include "utils/memory_overlap.hpp" #include "utils/output_validation.hpp" #include "utils/type_dispatch.hpp" #include "utils/type_utils.hpp" -#include "kernels/sorting/topk.hpp" #include "rich_comparisons.hpp" #include "topk.hpp" diff --git a/dpctl/tensor/libtensor/source/tensor_sorting.cpp b/dpctl/tensor/libtensor/source/tensor_sorting.cpp index 7ed908c38b..1a264c780b 100644 --- a/dpctl/tensor/libtensor/source/tensor_sorting.cpp +++ b/dpctl/tensor/libtensor/source/tensor_sorting.cpp @@ -27,6 +27,8 @@ #include "sorting/merge_argsort.hpp" #include "sorting/merge_sort.hpp" +#include "sorting/radix_argsort.hpp" +#include "sorting/radix_sort.hpp" #include "sorting/searchsorted.hpp" #include "sorting/topk.hpp" @@ -37,5 +39,7 @@ PYBIND11_MODULE(_tensor_sorting_impl, m) dpctl::tensor::py_internal::init_merge_sort_functions(m); dpctl::tensor::py_internal::init_merge_argsort_functions(m); dpctl::tensor::py_internal::init_searchsorted_functions(m); + dpctl::tensor::py_internal::init_radix_sort_functions(m); + dpctl::tensor::py_internal::init_radix_argsort_functions(m); dpctl::tensor::py_internal::init_topk_functions(m); }