
Commit 361d03d

Adds simple _divide_by_scalar to _tensor_elementwise_impl
1 parent: 46dc288

2 files changed: 213 additions & 0 deletions


dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp

Lines changed: 5 additions & 0 deletions
```diff
@@ -84,6 +84,11 @@ struct TrueDivideFunctor
 
             return in1 / exprm_ns::complex<realT2>(in2);
         }
+        else if constexpr (std::is_floating_point_v<argT1> &&
+                           std::is_integral_v<argT2>)
+        {
+            return in1 / static_cast<argT1>(in2);
+        }
         else {
             return in1 / in2;
         }
```
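
Note on the change: the new `constexpr` branch covers a floating-point dividend paired with an integral divisor, casting the integer to the dividend's type (`argT1`) so the division runs entirely in the array's precision. A minimal sketch of the intended Python-level effect, assuming `dpctl.tensor`'s division routes such cases through this functor (the sketch itself is not part of the diff):

```python
# Hedged sketch: a floating-point array divided by an integer scalar
# should stay in the array's dtype, since the functor now computes
# in1 / static_cast<argT1>(in2) instead of relying on implicit promotion.
import dpctl.tensor as dpt

x = dpt.ones(8, dtype="f4")  # float32 dividend
y = x / 3                    # integral divisor
print(y.dtype)               # expected: float32, not float64
```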

dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp

Lines changed: 208 additions & 0 deletions
```diff
@@ -24,14 +24,19 @@
 //===----------------------------------------------------------------------===//
 
 #include "dpctl4pybind11.hpp"
+#include <cstdint>
 #include <pybind11/numpy.h>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 #include <sycl/sycl.hpp>
 #include <vector>
 
 #include "elementwise_functions.hpp"
+#include "simplify_iteration_space.hpp"
 #include "true_divide.hpp"
+#include "utils/memory_overlap.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/output_validation.hpp"
 #include "utils/type_dispatch.hpp"
 
 #include "kernels/elementwise_functions/common.hpp"
```
```diff
@@ -165,6 +170,204 @@ void populate_true_divide_dispatch_tables(void)
     dtb9.populate_dispatch_table(true_divide_inplace_row_matrix_dispatch_table);
 };
 
+template <typename T> class divide_by_scalar_krn;
+
+typedef sycl::event (*divide_by_scalar_fn_ptr_t)(
+    sycl::queue &,
+    size_t,
+    int,
+    const ssize_t *,
+    const char *,
+    py::ssize_t,
+    std::int64_t,
+    char *,
+    py::ssize_t,
+    const std::vector<sycl::event> &);
+
+template <typename T>
+sycl::event divide_by_scalar(sycl::queue &exec_q,
+                             size_t nelems,
+                             int nd,
+                             const ssize_t *shape_and_strides,
+                             const char *arg_p,
+                             py::ssize_t arg_offset,
+                             std::int64_t scalar,
+                             char *res_p,
+                             py::ssize_t res_offset,
+                             const std::vector<sycl::event> &depends = {})
+{
+    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        using BinOpT = dpctl::tensor::kernels::true_divide::TrueDivideFunctor<
+            T, std::int64_t, T>;
+
+        auto op = BinOpT();
+
+        using IndexerT =
+            typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
+
+        const IndexerT two_offsets_indexer{nd, arg_offset, res_offset,
+                                           shape_and_strides};
+
+        const T *arg_tp = reinterpret_cast<const T *>(arg_p);
+        T *res_tp = reinterpret_cast<T *>(res_p);
+
+        cgh.parallel_for<divide_by_scalar_krn<T>>(
+            {nelems}, [=](sycl::id<1> id) {
+                const auto &two_offsets_ =
+                    two_offsets_indexer(static_cast<ssize_t>(id.get(0)));
+
+                const auto &arg_i = two_offsets_.get_first_offset();
+                const auto &res_i = two_offsets_.get_second_offset();
+                res_tp[res_i] = op(arg_tp[arg_i], scalar);
+            });
+    });
+    return comp_ev;
+}
+
+std::pair<sycl::event, sycl::event>
+py_divide_by_scalar(const dpctl::tensor::usm_ndarray &src,
+                    const std::int64_t scalar,
+                    const dpctl::tensor::usm_ndarray &dst,
+                    sycl::queue &exec_q,
+                    const std::vector<sycl::event> &depends = {})
+{
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    auto array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    if (src_typeid != dst_typeid) {
+        throw py::value_error(
+            "Destination array has unexpected elemental data type.");
+    }
+
+    // check that queues are compatible
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+    // check shapes, broadcasting is assumed done by caller
+    // check that dimensions are the same
+    int dst_nd = dst.get_ndim();
+    if (dst_nd != src.get_ndim()) {
+        throw py::value_error("Array dimensions are not the same.");
+    }
+
+    // check that shapes are the same
+    const py::ssize_t *src_shape = src.get_shape_raw();
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+    bool shapes_equal(true);
+    size_t src_nelems(1);
+
+    for (int i = 0; i < dst_nd; ++i) {
+        src_nelems *= static_cast<size_t>(src_shape[i]);
+        shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]);
+    }
+    if (!shapes_equal) {
+        throw py::value_error("Array shapes are not the same.");
+    }
+
+    // if nelems is zero, return
+    if (src_nelems == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems);
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    auto const &same_logical_tensors =
+        dpctl::tensor::overlap::SameLogicalTensors();
+    if ((overlap(src, dst) && !same_logical_tensors(src, dst))) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    const char *src_data = src.get_data();
+    char *dst_data = dst.get_data();
+
+    constexpr int float16_typeid = static_cast<int>(td_ns::typenum_t::HALF);
+    constexpr int float32_typeid = static_cast<int>(td_ns::typenum_t::FLOAT);
+    constexpr int float64_typeid = static_cast<int>(td_ns::typenum_t::DOUBLE);
+
+    divide_by_scalar_fn_ptr_t fn;
+    switch (src_typeid) {
+    case float16_typeid:
+        fn = divide_by_scalar<sycl::half>;
+        break;
+    case float32_typeid:
+        fn = divide_by_scalar<float>;
+        break;
+    case float64_typeid:
+        fn = divide_by_scalar<double>;
+        break;
+    default:
+        throw std::runtime_error("Implementation is missing for typeid=" +
+                                 std::to_string(src_typeid));
+    }
+
+    // simplify strides
+    auto const &src_strides = src.get_strides_vector();
+    auto const &dst_strides = dst.get_strides_vector();
+
+    using shT = std::vector<py::ssize_t>;
+    shT simplified_shape;
+    shT simplified_src_strides;
+    shT simplified_dst_strides;
+    py::ssize_t src_offset(0);
+    py::ssize_t dst_offset(0);
+
+    int nd = dst_nd;
+    const py::ssize_t *shape = src_shape;
+
+    std::vector<sycl::event> host_tasks{};
+    dpctl::tensor::py_internal::simplify_iteration_space(
+        nd, shape, src_strides, dst_strides,
+        // outputs
+        simplified_shape, simplified_src_strides, simplified_dst_strides,
+        src_offset, dst_offset);
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    const auto &ptr_sz_event_triple_ = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_tasks, simplified_shape, simplified_src_strides,
+        simplified_dst_strides);
+
+    py::ssize_t *shape_strides = std::get<0>(ptr_sz_event_triple_);
+    const sycl::event &copy_metadata_ev = std::get<2>(ptr_sz_event_triple_);
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.resize(depends.size());
+    std::copy(depends.begin(), depends.end(), all_deps.begin());
+    all_deps.push_back(copy_metadata_ev);
+
+    if (shape_strides == nullptr) {
+        throw std::runtime_error("Unable to allocate device memory");
+    }
+
+    sycl::event div_ev = fn(exec_q, src_nelems, nd, shape_strides, src_data,
+                            src_offset, scalar, dst_data, dst_offset, all_deps);
+
+    // async free of shape_strides temporary
+    auto ctx = exec_q.get_context();
+
+    sycl::event tmp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(div_ev);
+        using dpctl::tensor::alloc_utils::sycl_free_noexcept;
+        cgh.host_task(
+            [ctx, shape_strides]() { sycl_free_noexcept(shape_strides, ctx); });
+    });
+
+    host_tasks.push_back(tmp_cleanup_ev);
+
+    return std::make_pair(
+        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_tasks), div_ev);
+}
+
 } // namespace impl
 
 void init_divide(py::module_ m)
```
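
The wrapper validates its contract before launching anything: matching shapes and dtypes, compatible queues, a writable non-overlapping destination, and one of the three floating-point types. A hedged sketch of how a violation would surface in Python, assuming the `dpctl.tensor._tensor_elementwise_impl` module path implied by the commit title (pybind11 maps `py::value_error` to `ValueError`):

```python
# Hedged sketch: the validation in py_divide_by_scalar surfaces as Python
# exceptions before any kernel is launched. Module path is inferred from
# the commit title, not stated in the diff.
import dpctl.tensor as dpt
import dpctl.tensor._tensor_elementwise_impl as ti  # private module

x = dpt.ones((4, 6), dtype="f4")
bad_out = dpt.empty((4, 5), dtype="f4")  # shape mismatch

try:
    ti._divide_by_scalar(src=x, scalar=2, dst=bad_out, sycl_queue=x.sycl_queue)
except ValueError as e:
    print(e)  # "Array shapes are not the same."
```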
```diff
@@ -233,6 +436,11 @@ void init_divide(py::module_ m)
     m.def("_divide_inplace", divide_inplace_pyapi, "", py::arg("lhs"),
           py::arg("rhs"), py::arg("sycl_queue"),
           py::arg("depends") = py::list());
+
+    using impl::py_divide_by_scalar;
+    m.def("_divide_by_scalar", &py_divide_by_scalar, "", py::arg("src"),
+          py::arg("scalar"), py::arg("dst"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
 }
 }
```
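
A hedged end-to-end sketch of the new binding. Keyword names follow the `m.def` above and the returned event pair mirrors `py_divide_by_scalar`'s return value; the module path is inferred from the commit title, and this private entry point is shown for illustration, not as documented public API:

```python
# Hedged sketch: calling the private _divide_by_scalar binding directly.
# src and dst must have identical shapes and a matching floating-point
# dtype (f2/f4/f8); broadcasting is assumed to be done by the caller.
import dpctl.tensor as dpt
import dpctl.tensor._tensor_elementwise_impl as ti  # private module

x = dpt.arange(10, dtype="f4")
out = dpt.empty_like(x)

# Returns (host-task event keeping args alive, computation event).
ht_ev, div_ev = ti._divide_by_scalar(
    src=x, scalar=7, dst=out, sycl_queue=x.sycl_queue
)
ht_ev.wait()  # wait before inspecting the result
print(dpt.asnumpy(out)[:3])
```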
