diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index fe7f85be09..6639c09f9b 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -158,6 +158,17 @@ set(_tensor_linalg_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp ${_linalg_sources} ) +set(_accumulator_sources +${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/accumulators_common.cpp +${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_logsumexp.cpp +${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_prod.cpp +${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_sum.cpp +) +set(_tensor_accumulation_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_accumulation.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp + ${_accumulator_sources} +) set(_py_trgts) @@ -186,6 +197,11 @@ pybind11_add_module(${python_module_name} MODULE ${_tensor_linalg_impl_sources}) add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_linalg_impl_sources}) list(APPEND _py_trgts ${python_module_name}) +set(python_module_name _tensor_accumulation_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_accumulation_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_accumulation_impl_sources}) +list(APPEND _py_trgts ${python_module_name}) + set(_clang_prefix "") if (WIN32) set(_clang_prefix "/clang:") @@ -203,6 +219,7 @@ list(APPEND _no_fast_math_sources ${_reduction_sources} ${_sorting_sources} ${_linalg_sources} + ${_accumulator_sources} ) foreach(_src_fn ${_no_fast_math_sources}) diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index 6652243128..414333bf5c 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -96,6 +96,7 @@ from dpctl.tensor._usmarray import usm_ndarray from dpctl.tensor._utility_functions import all, any +from ._accumulation import cumulative_logsumexp, cumulative_prod, cumulative_sum from ._array_api import __array_api_version__, __array_namespace_info__ from ._clip import clip from ._constants import e, inf, nan, newaxis, pi @@ -367,4 +368,7 @@ "tensordot", "vecdot", "searchsorted", + "cumulative_logsumexp", + "cumulative_prod", + "cumulative_sum", ] diff --git a/dpctl/tensor/_accumulation.py b/dpctl/tensor/_accumulation.py new file mode 100644 index 0000000000..5232740b4f --- /dev/null +++ b/dpctl/tensor/_accumulation.py @@ -0,0 +1,469 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
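# A minimal usage sketch of the accumulation API introduced by this patch.
# This is an illustration only (not part of the module): it assumes a
# default-selected SYCL device and shows the values expected from the
# docstrings below.
#
#     import dpctl.tensor as dpt
#
#     x = dpt.asarray([1, 2, 3])
#     dpt.cumulative_sum(x)                        # [1, 3, 6]
#     dpt.cumulative_sum(x, include_initial=True)  # [0, 1, 3, 6], size N+1
#     dpt.cumulative_prod(x, dtype="f4")           # [1., 2., 6.]
#     x2 = dpt.reshape(dpt.arange(6), (2, 3))
#     dpt.cumulative_sum(x2, axis=1)               # cumulative sums along rows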
+ +from numpy.core.numeric import normalize_axis_index + +import dpctl +import dpctl.tensor as dpt +import dpctl.tensor._tensor_accumulation_impl as tai +import dpctl.tensor._tensor_impl as ti +from dpctl.tensor._type_utils import _to_device_supported_dtype +from dpctl.utils import ExecutionPlacementError + + +def _default_accumulation_dtype(inp_dt, q): + """Gives default output data type for given input data + type `inp_dt` when accumulation is performed on queue `q` + """ + inp_kind = inp_dt.kind + if inp_kind in "bi": + res_dt = dpt.dtype(ti.default_device_int_type(q)) + if inp_dt.itemsize > res_dt.itemsize: + res_dt = inp_dt + elif inp_kind in "u": + res_dt = dpt.dtype(ti.default_device_int_type(q).upper()) + res_ii = dpt.iinfo(res_dt) + inp_ii = dpt.iinfo(inp_dt) + if inp_ii.min >= res_ii.min and inp_ii.max <= res_ii.max: + pass + else: + res_dt = inp_dt + elif inp_kind in "fc": + res_dt = inp_dt + + return res_dt + + +def _default_accumulation_dtype_fp_types(inp_dt, q): + """Gives default output data type for given input data + type `inp_dt` when accumulation is performed on queue `q` + and the accumulation supports only floating-point data types + """ + inp_kind = inp_dt.kind + if inp_kind in "biu": + res_dt = dpt.dtype(ti.default_device_fp_type(q)) + can_cast_v = dpt.can_cast(inp_dt, res_dt) + if not can_cast_v: + _fp64 = q.sycl_device.has_aspect_fp64 + res_dt = dpt.float64 if _fp64 else dpt.float32 + elif inp_kind in "f": + res_dt = inp_dt + elif inp_kind in "c": + raise ValueError("function not defined for complex types") + + return res_dt + + +def _accumulate_common( + x, + axis, + dtype, + include_initial, + out, + _accumulate_fn, + _accumulate_include_initial_fn, + _dtype_supported, + _default_accumulation_type_fn, +): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + appended_axis = False + if x.ndim == 0: + x = x[dpt.newaxis] + appended_axis = True + nd = x.ndim + if axis is None: + if nd > 1: + raise ValueError( + "`axis` cannot be `None` for array of dimension `{}`".format(nd) + ) + axis = 0 + else: + axis = normalize_axis_index(axis, nd, "axis") + sh = x.shape + res_sh = ( + sh[:axis] + (sh[axis] + 1,) + sh[axis + 1 :] if include_initial else sh + ) + a1 = axis + 1 + if a1 == nd: + perm = list(range(nd)) + arr = x + else: + perm = [i for i in range(nd) if i != axis] + [ + axis, + ] + arr = dpt.permute_dims(x, perm) + q = x.sycl_queue + inp_dt = x.dtype + res_usm_type = x.usm_type + if dtype is None: + res_dt = _default_accumulation_type_fn(inp_dt, q) + else: + res_dt = dpt.dtype(dtype) + res_dt = _to_device_supported_dtype(res_dt, q.sycl_device) + + # checking now avoids unnecessary allocations + implemented_types = _dtype_supported(inp_dt, res_dt) + if dtype is None and not implemented_types: + raise RuntimeError( + "Automatically determined accumulation data type does not " + "have direct implementation" + ) + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + out_sh = out.shape + # append an axis to `out` if scalar + if appended_axis and not include_initial: + out = out[dpt.newaxis, ...] + orig_out = out + final_res_sh = res_sh[1:] + else: + final_res_sh = res_sh + if not out_sh == final_res_sh: + raise ValueError( + "The shape of input and output arrays are inconsistent. 
" + f"Expected output shape is {final_res_sh}, got {out_sh}" + ) + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, " f"got {out.dtype}" + ) + if dpctl.utils.get_execution_queue((q, out.sycl_queue)) is None: + raise ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + # permute out array dims if necessary + if a1 != nd: + out = dpt.permute_dims(out, perm) + orig_out = out + if ti._array_overlap(x, out) and implemented_types: + out = dpt.empty_like(out) + else: + out = dpt.empty( + res_sh, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + if a1 != nd: + out = dpt.permute_dims(out, perm) + + host_tasks_list = [] + if implemented_types: + if not include_initial: + ht_e, acc_ev = _accumulate_fn( + src=arr, + trailing_dims_to_accumulate=1, + dst=out, + sycl_queue=q, + ) + else: + ht_e, acc_ev = _accumulate_include_initial_fn( + src=arr, + dst=out, + sycl_queue=q, + ) + host_tasks_list.append(ht_e) + if not (orig_out is None or out is orig_out): + # Copy the out data from temporary buffer to original memory + ht_e_cpy, _ = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, dst=orig_out, sycl_queue=q, depends=[acc_ev] + ) + host_tasks_list.append(ht_e_cpy) + out = orig_out + else: + if _dtype_supported(res_dt, res_dt): + tmp = dpt.empty( + arr.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=q + ) + host_tasks_list.append(ht_e_cpy) + if not include_initial: + ht_e, acc_ev = _accumulate_fn( + src=tmp, + trailing_dims_to_accumulate=1, + dst=out, + sycl_queue=q, + depends=[cpy_e], + ) + else: + ht_e, acc_ev = _accumulate_include_initial_fn( + src=tmp, + dst=out, + sycl_queue=q, + depends=[cpy_e], + ) + else: + buf_dt = _default_accumulation_type_fn(inp_dt, q) + tmp = dpt.empty( + arr.shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=q + ) + tmp_res = dpt.empty( + res_sh, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q + ) + if a1 != nd: + tmp_res = dpt.permute_dims(tmp_res, perm) + host_tasks_list.append(ht_e_cpy) + if not include_initial: + ht_e, a_e = _accumulate_fn( + src=tmp, + trailing_dims_to_accumulate=1, + dst=tmp_res, + sycl_queue=q, + depends=[cpy_e], + ) + else: + ht_e, a_e = _accumulate_include_initial_fn( + src=tmp, + dst=tmp_res, + sycl_queue=q, + depends=[cpy_e], + ) + host_tasks_list.append(ht_e) + ht_e_cpy2, _ = ti._copy_usm_ndarray_into_usm_ndarray( + src=tmp_res, dst=out, sycl_queue=q, depends=[a_e] + ) + host_tasks_list.append(ht_e_cpy2) + + if appended_axis: + out = dpt.squeeze(out) + if a1 != nd: + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + out = dpt.permute_dims(out, inv_perm) + dpctl.SyclEvent.wait_for(host_tasks_list) + + return out + + +def cumulative_sum( + x, /, *, axis=None, dtype=None, include_initial=False, out=None +): + """ + cumulative_sum(x, /, *, axis=None, dtype=None, include_initial=False, + out=None) + + Calculates the cumulative sum of elements in the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis along which cumulative sum must be computed. + If `None`, the sum is computed over the entire array. + Default: `None`. + dtype (Optional[dtype]): + data type of the returned array. If `None`, the default data + type is inferred from the "kind" of the input array data type. 
+ + * If `x` has a real- or complex-valued floating-point data + type, the returned array will have the same data type as + `x`. + * If `x` has signed integral data type, the returned array + will have the default signed integral type for the device + where input array `x` is allocated. + * If `x` has unsigned integral data type, the returned array + will have the default unsigned integral type for the device + where input array `x` is allocated. + * If `x` has a boolean data type, the returned array will + have the default signed integral type for the device + where input array `x` is allocated. + + If the data type (either specified or resolved) differs from the + data type of `x`, the input array elements are cast to the + specified data type before computing the cumulative sum. + Default: `None`. + include_initial (bool): + boolean indicating whether to include the initial value (i.e., the + additive identity, zero) as the first value along the provided axis + in the output. Default: `False`. + + Returns: + usm_ndarray: + an array containing cumulative sums. The returned array has the data + type as described in the `dtype` parameter description above. + + The returned array shape is determined as follows: + + * If `include_initial` is `False`, the returned array will + have the same shape as `x` + * If `include_initial` is `True`, the returned array will + have the same shape as `x` except the axis along which the + cumulative sum is calculated, which will have size `N+1` + + where `N` is the size of the axis the cumulative sums are computed + along. + """ + return _accumulate_common( + x, + axis, + dtype, + include_initial, + out, + tai._cumsum_over_axis, + tai._cumsum_final_axis_include_initial, + tai._cumsum_dtype_supported, + _default_accumulation_dtype, + ) + + +def cumulative_prod( + x, /, *, axis=None, dtype=None, include_initial=False, out=None +): + """ + cumulative_prod(x, /, *, axis=None, dtype=None, include_initial=False, + out=None) + + Calculates the cumulative product of elements in the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis along which cumulative product must be computed. + If `None`, the product is computed over the entire array. + Default: `None`. + dtype (Optional[dtype]): + data type of the returned array. If `None`, the default data + type is inferred from the "kind" of the input array data type. + + * If `x` has a real- or complex-valued floating-point data + type, the returned array will have the same data type as + `x`. + * If `x` has signed integral data type, the returned array + will have the default signed integral type for the device + where input array `x` is allocated. + * If `x` has unsigned integral data type, the returned array + will have the default unsigned integral type for the device + where input array `x` is allocated. + * If `x` has a boolean data type, the returned array will + have the default signed integral type for the device + where input array `x` is allocated. + + If the data type (either specified or resolved) differs from the + data type of `x`, the input array elements are cast to the + specified data type before computing the cumulative product. + Default: `None`. + include_initial (bool): + boolean indicating whether to include the initial value (i.e., the + multiplicative identity, one) as the first value along the provided + axis in the output. Default: `False`. + + Returns: + usm_ndarray: + an array containing cumulative products.
The returned array has + the data type as described in the `dtype` parameter description + above. + + The returned array shape is determined as follows: + + * If `include_initial` is `False`, the returned array will + have the same shape as `x` + * If `include_initial` is `True`, the returned array will + have the same shape as `x` except the axis along which the + cumulative product is calculated, which will have size `N+1` + + where `N` is the size of the axis the cumulative products are + computed along. + """ + return _accumulate_common( + x, + axis, + dtype, + include_initial, + out, + tai._cumprod_over_axis, + tai._cumprod_final_axis_include_initial, + tai._cumprod_dtype_supported, + _default_accumulation_dtype, + ) + + +def cumulative_logsumexp( + x, /, *, axis=None, dtype=None, include_initial=False, out=None +): + """ + cumulative_logsumexp(x, /, *, axis=None, dtype=None, include_initial=False, + out=None) + + Calculates the cumulative logsumexp of elements in the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis along which cumulative logsumexp must be computed. + If `None`, the logsumexp is computed over the entire array. + Default: `None`. + dtype (Optional[dtype]): + data type of the returned array. If `None`, the default data + type is inferred from the "kind" of the input array data type. + + * If `x` has a real-valued floating-point data type, the + returned array will have the same data type as `x`. + * If `x` has a boolean, signed integral, or unsigned integral + data type, the returned array will have the default + real-valued floating-point data type for the device where + input array `x` is allocated. + * If `x` has a complex-valued floating-point data type, an + error is raised, as this function is not defined for + complex-valued data types. + + If the data type (either specified or resolved) differs from the + data type of `x`, the input array elements are cast to the + specified data type before computing the cumulative logsumexp. + Default: `None`. + include_initial (bool): + boolean indicating whether to include the initial value (i.e., + negative infinity) as the first value along the provided axis + in the output. Default: `False`. + + Returns: + usm_ndarray: + an array containing cumulative logsumexp results. The returned + array has the data type as described in the `dtype` parameter + description above.
+ + The returned array shape is determined as follows: + + * If `include_initial` is `False`, the returned array will + have the same shape as `x` + * If `include_initial` is `True`, the returned array will + have the same shape as `x` except the axis along which the + cumulative logsumexp is calculated, which will have size + `N+1` + """ + return _accumulate_common( + x, + axis, + dtype, + include_initial, + out, + tai._cumlogsumexp_over_axis, + tai._cumlogsumexp_final_axis_include_initial, + tai._cumlogsumexp_dtype_supported, + _default_accumulation_dtype_fp_types, + ) diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py index 8586ed935c..2494e130ab 100644 --- a/dpctl/tensor/_reduction.py +++ b/dpctl/tensor/_reduction.py @@ -164,6 +164,7 @@ def _reduction_over_axis( sycl_queue=q, depends=[cpy_e], ) + host_tasks_list.append(ht_e_red) ht_e_cpy2, _ = ti._copy_usm_ndarray_into_usm_ndarray( src=tmp_res, dst=res, sycl_queue=q, depends=[r_e] ) diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp index dbd105ac95..18e0e1bc8a 100644 --- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -32,7 +32,9 @@ #include "dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" +#include "utils/sycl_utils.hpp" #include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" namespace dpctl { @@ -52,7 +54,7 @@ template T ceiling_quotient(T n, T m) template struct NonZeroIndicator { - NonZeroIndicator() {} + constexpr NonZeroIndicator() {} outputT operator()(const inputT &val) const { @@ -66,7 +68,7 @@ template struct NonZeroIndicator template struct NoOpTransformer { - NoOpTransformer() {} + constexpr NoOpTransformer() {} T operator()(const T &val) const { @@ -74,104 +76,238 @@ template struct NoOpTransformer } }; +template struct CastTransformer +{ + constexpr CastTransformer() {} + + dstTy operator()(const srcTy &val) const + { + using dpctl::tensor::type_utils::convert_impl; + return convert_impl(val); + } +}; + +template struct can_use_inclusive_scan_over_group +{ + static constexpr bool value = sycl::has_known_identity::value; +}; + +namespace +{ +template class stack_t +{ + T *src_; + size_t size_; + T *local_scans_; + +public: + stack_t() : src_{}, size_{}, local_scans_{} {} + stack_t(T *src, size_t sz, T *local_scans) + : src_(src), size_(sz), local_scans_(local_scans) + { + } + ~stack_t(){}; + + T *get_src_ptr() const + { + return src_; + } + + size_t get_size() const + { + return size_; + } + + T *get_local_scans_ptr() const + { + return local_scans_; + } +}; + +template class stack_strided_t +{ + T *src_; + size_t size_; + T *local_scans_; + size_t local_stride_; + +public: + stack_strided_t() : src_{}, size_{}, local_scans_{}, local_stride_{} {} + stack_strided_t(T *src, size_t sz, T *local_scans, size_t local_stride) + : src_(src), size_(sz), local_scans_(local_scans), + local_stride_(local_stride) + { + } + ~stack_strided_t(){}; + + T *get_src_ptr() const + { + return src_; + } + + size_t get_size() const + { + return size_; + } + + T *get_local_scans_ptr() const + { + return local_scans_; + } + + size_t get_local_stride() const + { + return local_stride_; + } +}; + +} // end of anonymous namespace + // Iterative cumulative summation +namespace su_ns = dpctl::tensor::sycl_utils; + using nwiT = std::uint16_t; template + typename IterIndexerT, + typename InpIndexerT, + typename OutIndexerT, + typename TransformerT, + 
typename ScanOpT, + bool include_initial> class inclusive_scan_iter_local_scan_krn; template + typename OtherTransformerT, + typename ScanOpT, + bool include_initial> class inclusive_scan_iter_chunk_update_krn; template + typename IterIndexerT, + typename InpIndexerT, + typename OutIndexerT, + typename TransformerT, + typename ScanOpT, + bool include_initial = false> sycl::event inclusive_scan_base_step(sycl::queue &exec_q, const size_t wg_size, - const size_t n_elems, + const size_t iter_nelems, + const size_t acc_nelems, const inputT *input, outputT *output, const size_t s0, const size_t s1, - const IndexerT &indexer, - const TransformerT &transformer, - size_t &n_groups, + const IterIndexerT &iter_indexer, + const InpIndexerT &inp_indexer, + const OutIndexerT &out_indexer, + TransformerT transformer, + const ScanOpT &scan_op, + outputT identity, + size_t &acc_groups, const std::vector &depends = {}) { - n_groups = ceiling_quotient(n_elems, n_wi * wg_size); + acc_groups = ceiling_quotient(acc_nelems, n_wi * wg_size); sycl::event inc_scan_phase1_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); using slmT = sycl::local_accessor; + auto gws = sycl::range<1>(iter_nelems * acc_groups * wg_size); auto lws = sycl::range<1>(wg_size); - auto gws = sycl::range<1>(n_groups * wg_size); auto ndRange = sycl::nd_range<1>(gws, lws); slmT slm_iscan_tmp(lws, cgh); - using KernelName = - inclusive_scan_iter_local_scan_krn; + using KernelName = inclusive_scan_iter_local_scan_krn< + inputT, outputT, n_wi, IterIndexerT, InpIndexerT, OutIndexerT, + TransformerT, ScanOpT, include_initial>; cgh.parallel_for(ndRange, [=, slm_iscan_tmp = std::move(slm_iscan_tmp)]( sycl::nd_item<1> it) { - auto chunk_gid = it.get_global_id(0); - auto lid = it.get_local_id(0); + const size_t gid = it.get_global_id(0); + const size_t lid = it.get_local_id(0); + + const size_t iter_gid = gid / (acc_groups * wg_size); + const size_t chunk_gid = gid - (iter_gid * acc_groups * wg_size); - std::array local_isum; + std::array local_iscan; size_t i = chunk_gid * n_wi; + const auto &iter_offsets = iter_indexer(iter_gid); + const auto &inp_iter_offset = iter_offsets.get_first_offset(); + const auto &out_iter_offset = iter_offsets.get_second_offset(); #pragma unroll for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { - constexpr outputT out_zero(0); - - local_isum[m_wi] = - (i + m_wi < n_elems) - ? transformer(input[indexer(s0 + s1 * (i + m_wi))]) - : out_zero; + if constexpr (!include_initial) { + local_iscan[m_wi] = + (i + m_wi < acc_nelems) + ? transformer( + input[inp_iter_offset + + inp_indexer(s0 + s1 * (i + m_wi))]) + : identity; + } + else { + // shift input to the left by a single element relative to + // output + local_iscan[m_wi] = + (i + m_wi < acc_nelems && i + m_wi > 0) + ? 
transformer( + input[inp_iter_offset + + inp_indexer((s0 + s1 * (i + m_wi)) - + 1)]) + : identity; + } } #pragma unroll for (nwiT m_wi = 1; m_wi < n_wi; ++m_wi) { - local_isum[m_wi] += local_isum[m_wi - 1]; + local_iscan[m_wi] = + scan_op(local_iscan[m_wi], local_iscan[m_wi - 1]); } - // local_isum is now result of + // local_iscan is now result of // inclusive scan of locally stored inputs - outputT wg_iscan_val = sycl::inclusive_scan_over_group( - it.get_group(), local_isum.back(), sycl::plus(), - outputT(0)); + outputT wg_iscan_val; + if constexpr (can_use_inclusive_scan_over_group::value) { + wg_iscan_val = sycl::inclusive_scan_over_group( + it.get_group(), local_iscan.back(), scan_op, identity); + } + else { + wg_iscan_val = su_ns::custom_inclusive_scan_over_group( + it.get_group(), slm_iscan_tmp, local_iscan.back(), scan_op); + // ensure all finished reading from SLM, to avoid race condition + // with subsequent writes into SLM + it.barrier(sycl::access::fence_space::local_space); + } slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val; it.barrier(sycl::access::fence_space::local_space); - outputT addand = (lid == 0) ? outputT(0) : slm_iscan_tmp[lid]; + outputT addand = (lid == 0) ? identity : slm_iscan_tmp[lid]; #pragma unroll for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { - local_isum[m_wi] += addand; + local_iscan[m_wi] = scan_op(local_iscan[m_wi], addand); } - for (nwiT m_wi = 0; (m_wi < n_wi) && (i + m_wi < n_elems); ++m_wi) { - output[i + m_wi] = local_isum[m_wi]; + for (nwiT m_wi = 0; (m_wi < n_wi) && (i + m_wi < acc_nelems); + ++m_wi) { + output[out_iter_offset + out_indexer(i + m_wi)] = + local_iscan[m_wi]; } }); }); @@ -179,75 +315,62 @@ inclusive_scan_base_step(sycl::queue &exec_q, return inc_scan_phase1_ev; } -namespace -{ -template class stack_t -{ - T *src_; - size_t size_; - T *local_scans_; - -public: - stack_t() : src_{}, size_{}, local_scans_{} {} - stack_t(T *src, size_t sz, T *local_scans) - : src_(src), size_(sz), local_scans_(local_scans) - { - } - ~stack_t(){}; - - T *get_src_ptr() const - { - return src_; - } - - const T *get_src_const_ptr() const - { - return src_; - } - - size_t get_size() const - { - return size_; - } - - T *get_local_scans_ptr() const - { - return local_scans_; - } -}; -} // end of anonymous namespace +template +class inclusive_scan_1d_iter_chunk_update_krn; /* - * for integer type maskT, - * output[j] = sum( input[s0 + i * s1], 0 <= i <= j) + * output[j] = sum( input[s0 + i * s1], 0 <= i <= j) * for 0 <= j < n_elems */ template -sycl::event inclusive_scan_iter(sycl::queue &exec_q, - const size_t wg_size, - const size_t n_elems, - const inputT *input, - outputT *output, - const size_t s0, - const size_t s1, - const IndexerT &indexer, - const TransformerT &transformer, - std::vector &host_tasks, - const std::vector &depends = {}) + typename TransformerT, + typename ScanOpT, + bool include_initial> +sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q, + const size_t wg_size, + const size_t n_elems, + const inputT *input, + outputT *output, + const size_t s0, + const size_t s1, + const IndexerT &indexer, + const TransformerT &transformer, + std::vector &host_tasks, + const std::vector &depends = {}) { + ScanOpT scan_op = ScanOpT(); + constexpr outputT identity = su_ns::Identity::value; + + constexpr size_t _iter_nelems = 1; + + using IterIndexerT = dpctl::tensor::offset_utils::TwoZeroOffsets_Indexer; + constexpr IterIndexerT _no_op_iter_indexer{}; + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + constexpr NoOpIndexerT 
_no_op_indexer{}; + size_t n_groups; sycl::event inc_scan_phase1_ev = - inclusive_scan_base_step( - exec_q, wg_size, n_elems, input, output, s0, s1, indexer, - transformer, n_groups, depends); + inclusive_scan_base_step( + exec_q, wg_size, _iter_nelems, n_elems, input, output, s0, s1, + _no_op_iter_indexer, indexer, _no_op_indexer, transformer, scan_op, + identity, n_groups, depends); sycl::event dependent_event = inc_scan_phase1_ev; if (n_groups > 1) { - auto chunk_size = wg_size * n_wi; + const size_t chunk_size = wg_size * n_wi; // how much of temporary allocation do we need size_t n_groups_ = n_groups; @@ -272,19 +395,20 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q, outputT *src = output; outputT *local_scans = temp; - NoOpIndexer _no_op_indexer{}; using NoOpTransformerT = NoOpTransformer; - NoOpTransformerT _no_op_transformer{}; + constexpr NoOpTransformerT _no_op_transformer{}; size_t size_to_update = n_elems; while (n_groups_ > 1) { - size_t src_size = n_groups_ - 1; + const size_t src_size = n_groups_ - 1; dependent_event = - inclusive_scan_base_step( - exec_q, wg_size, src_size, src, local_scans, chunk_size - 1, - chunk_size, _no_op_indexer, _no_op_transformer, - n_groups_, // n_groups_ is modified in place + inclusive_scan_base_step( + exec_q, wg_size, _iter_nelems, src_size, src, local_scans, + chunk_size - 1, chunk_size, _no_op_iter_indexer, + _no_op_indexer, _no_op_indexer, _no_op_transformer, scan_op, + identity, n_groups_, // n_groups_ is modified in place {dependent_event}); stack.push_back({src, size_to_update, local_scans}); src = local_scans; @@ -295,43 +419,469 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q, for (size_t reverse_stack_id = 0; reverse_stack_id < stack.size(); ++reverse_stack_id) { - auto stack_id = stack.size() - 1 - reverse_stack_id; + const size_t stack_id = stack.size() - 1 - reverse_stack_id; - auto stack_elem = stack[stack_id]; + const auto &stack_elem = stack[stack_id]; outputT *src = stack_elem.get_src_ptr(); - size_t src_size = stack_elem.get_size(); + const size_t src_size = stack_elem.get_size(); outputT *local_scans = stack_elem.get_local_scans_ptr(); // output[ chunk_size * (i + 1) + j] += temp[i] dependent_event = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(dependent_event); + constexpr nwiT updates_per_wi = n_wi; + const size_t n_items = ceiling_quotient(src_size, n_wi); + using UpdateKernelName = - class inclusive_scan_iter_chunk_update_krn< + class inclusive_scan_1d_iter_chunk_update_krn< inputT, outputT, n_wi, IndexerT, TransformerT, - NoOpIndexer, NoOpTransformerT>; + NoOpTransformerT, ScanOpT, include_initial>; + + cgh.parallel_for( + {n_items}, [chunk_size, src, src_size, local_scans, scan_op, + identity](auto wiid) { + const size_t gid = n_wi * wiid[0]; +#pragma unroll + for (size_t i = 0; i < updates_per_wi; ++i) { + const size_t src_id = gid + i; + if (src_id < src_size) { + const size_t scan_id = (src_id / chunk_size); + src[src_id] = + (scan_id > 0) + ? 
scan_op(src[src_id], + local_scans[scan_id - 1]) + : scan_op(src[src_id], identity); + } + } + }); + }); + } + + sycl::event free_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_event); + const auto &ctx = exec_q.get_context(); + cgh.host_task([ctx, temp]() { sycl::free(temp, ctx); }); + }); + host_tasks.push_back(free_ev); + } + + return dependent_event; +} + +typedef sycl::event (*accumulate_1d_contig_impl_fn_ptr_t)( + sycl::queue &, + size_t, + const char *, + char *, + std::vector &, + const std::vector &); + +template +sycl::event +accumulate_1d_contig_impl(sycl::queue &q, + size_t n_elems, + const char *src, + char *dst, + std::vector &host_tasks, + const std::vector &depends = {}) +{ + const srcT *src_data_ptr = reinterpret_cast(src); + dstT *dst_data_ptr = reinterpret_cast(dst); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + constexpr NoOpIndexerT flat_indexer{}; + constexpr transformerT transformer{}; + + constexpr size_t s0 = 0; + constexpr size_t s1 = 1; + + sycl::event comp_ev; + const sycl::device &dev = q.get_device(); + if (dev.has(sycl::aspect::cpu)) { + constexpr nwiT n_wi_for_cpu = 8; + const size_t wg_size = 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, src_data_ptr, dst_data_ptr, s0, s1, + flat_indexer, transformer, host_tasks, depends); + } + else { + constexpr nwiT n_wi_for_gpu = 4; + const size_t wg_size = 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, src_data_ptr, dst_data_ptr, s0, s1, + flat_indexer, transformer, host_tasks, depends); + } + return comp_ev; +} + +template +class inclusive_scan_final_chunk_update_krn; + +template +sycl::event inclusive_scan_iter(sycl::queue &exec_q, + const size_t wg_size, + const size_t iter_nelems, + const size_t acc_nelems, + const inputT *input, + outputT *output, + const size_t s0, + const size_t s1, + const InpIterIndexerT &inp_iter_indexer, + const OutIterIndexerT &out_iter_indexer, + const InpIndexerT &inp_indexer, + const OutIndexerT &out_indexer, + const TransformerT &transformer, + std::vector &host_tasks, + const std::vector &depends = {}) +{ + ScanOpT scan_op = ScanOpT(); + constexpr outputT identity = su_ns::Identity::value; + + using IterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InpIterIndexerT, OutIterIndexerT>; + const IterIndexerT iter_indexer{inp_iter_indexer, out_iter_indexer}; + + size_t acc_groups; + sycl::event inc_scan_phase1_ev = + inclusive_scan_base_step( + exec_q, wg_size, iter_nelems, acc_nelems, input, output, s0, s1, + iter_indexer, inp_indexer, out_indexer, transformer, scan_op, + identity, acc_groups, depends); + + sycl::event dependent_event = inc_scan_phase1_ev; + if (acc_groups > 1) { + const size_t chunk_size = wg_size * n_wi; + + // how much of temporary allocation do we need + size_t acc_groups_ = acc_groups; + size_t temp_size = 0; + while (acc_groups_ > 1) { + const size_t this_size = (acc_groups_ - 1); + temp_size += this_size; + acc_groups_ = ceiling_quotient(this_size, chunk_size); + } + + // allocate + outputT *temp = + sycl::malloc_device(iter_nelems * temp_size, exec_q); + + if (!temp) { + throw std::bad_alloc(); + } + + std::vector> stack{}; + + // inclusive scans over blocks + acc_groups_ = acc_groups; + outputT *src = output; + outputT *local_scans = temp; + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + constexpr NoOpIndexerT _no_op_indexer{}; + using NoOpTransformerT = NoOpTransformer; + constexpr NoOpTransformerT _no_op_transformer{}; + size_t 
size_to_update = acc_nelems; + + { + size_t src_size = acc_groups - 1; + using LocalScanIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + const LocalScanIndexerT scan_iter_indexer{ + 0, static_cast(iter_nelems), + static_cast(src_size)}; + + using IterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + OutIterIndexerT, LocalScanIndexerT>; + const IterIndexerT iter_indexer_{out_iter_indexer, + scan_iter_indexer}; + + dependent_event = + inclusive_scan_base_step( + exec_q, wg_size, iter_nelems, src_size, src, local_scans, + chunk_size - 1, chunk_size, iter_indexer_, out_indexer, + _no_op_indexer, _no_op_transformer, scan_op, identity, + acc_groups_, // acc_groups_ is modified in place + {dependent_event}); + stack.push_back({src, size_to_update, local_scans, src_size}); + src = local_scans; + local_scans += src_size * iter_nelems; + size_to_update = src_size; + } + + while (acc_groups_ > 1) { + size_t src_size = acc_groups_ - 1; + + using LocalScanIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + const LocalScanIndexerT scan1_iter_indexer{ + 0, static_cast(iter_nelems), + static_cast(size_to_update)}; + const LocalScanIndexerT scan2_iter_indexer{ + 0, static_cast(iter_nelems), + static_cast(src_size)}; + + using IterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + LocalScanIndexerT, LocalScanIndexerT>; + const IterIndexerT iter_indexer_{scan1_iter_indexer, + scan2_iter_indexer}; + + dependent_event = + inclusive_scan_base_step( + exec_q, wg_size, iter_nelems, src_size, src, local_scans, + chunk_size - 1, chunk_size, iter_indexer_, _no_op_indexer, + _no_op_indexer, _no_op_transformer, scan_op, identity, + acc_groups_, // acc_groups_ is modified in place + {dependent_event}); + stack.push_back({src, size_to_update, local_scans, src_size}); + src = local_scans; + local_scans += src_size * iter_nelems; + size_to_update = src_size; + } + + for (size_t reverse_stack_id = 0; reverse_stack_id < stack.size() - 1; + ++reverse_stack_id) + { + const size_t stack_id = stack.size() - 1 - reverse_stack_id; + + const auto &stack_elem = stack[stack_id]; + outputT *src = stack_elem.get_src_ptr(); + size_t src_size = stack_elem.get_size(); + outputT *local_scans = stack_elem.get_local_scans_ptr(); + size_t local_stride = stack_elem.get_local_stride(); + + constexpr nwiT updates_per_wi = n_wi; + const size_t update_nelems = + ceiling_quotient(src_size, updates_per_wi); + + dependent_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_event); + + using UpdateKernelName = + class inclusive_scan_iter_chunk_update_krn< + inputT, outputT, n_wi, TransformerT, NoOpTransformerT, + ScanOpT, include_initial>; + + cgh.parallel_for( + {iter_nelems * update_nelems}, + [chunk_size, update_nelems, src_size, local_stride, src, + local_scans, scan_op, identity](auto wiid) { + const size_t gid = wiid[0]; + + const size_t iter_gid = gid / update_nelems; + const size_t axis_gid = + gid - (iter_gid * update_nelems); + + const size_t src_axis_id0 = axis_gid * updates_per_wi; + const size_t src_iter_id = iter_gid * src_size; +#pragma unroll + for (nwiT i = 0; i < updates_per_wi; ++i) { + const size_t src_axis_id = src_axis_id0 + i; + const size_t src_id = src_axis_id + src_iter_id; + + if (src_axis_id < src_size) { + const size_t scan_axis_id = + src_axis_id / chunk_size; + const size_t scan_id = + scan_axis_id + iter_gid * local_stride; + + src[src_id] = + (scan_axis_id > 0) + ? 
scan_op(src[src_id], + local_scans[scan_id - 1]) + : scan_op(src[src_id], identity); + } + } + }); + }); + } + + // last stack element is always directly to output + { + const auto &stack_elem = stack[0]; + outputT *src = stack_elem.get_src_ptr(); + const size_t src_size = stack_elem.get_size(); + outputT *local_scans = stack_elem.get_local_scans_ptr(); + const size_t local_stride = stack_elem.get_local_stride(); + + constexpr nwiT updates_per_wi = n_wi; + const size_t update_nelems = + ceiling_quotient(src_size, updates_per_wi); + + dependent_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_event); + + using UpdateKernelName = + class inclusive_scan_final_chunk_update_krn< + inputT, outputT, n_wi, OutIterIndexerT, OutIndexerT, + TransformerT, NoOpTransformerT, ScanOpT, + include_initial>; cgh.parallel_for( - {src_size}, [chunk_size, src, local_scans](auto wiid) { - auto gid = wiid[0]; - auto i = (gid / chunk_size); - src[gid] += (i > 0) ? local_scans[i - 1] : outputT(0); + {iter_nelems * update_nelems}, + [chunk_size, update_nelems, src_size, local_stride, src, + local_scans, scan_op, identity, out_iter_indexer, + out_indexer](auto wiid) { + const size_t gid = wiid[0]; + + const size_t iter_gid = gid / update_nelems; + const size_t axis_gid = + gid - (iter_gid * update_nelems); + + const size_t src_axis_id0 = axis_gid * updates_per_wi; + const size_t src_iter_id = out_iter_indexer(iter_gid); +#pragma unroll + for (nwiT i = 0; i < updates_per_wi; ++i) { + const size_t src_axis_id = src_axis_id0 + i; + const size_t src_id = + out_indexer(src_axis_id) + src_iter_id; + + if (src_axis_id < src_size) { + const size_t scan_axis_id = + src_axis_id / chunk_size; + const size_t scan_id = + scan_axis_id + iter_gid * local_stride; + + src[src_id] = + (scan_axis_id > 0) + ? 
scan_op(src[src_id], + local_scans[scan_id - 1]) + : scan_op(src[src_id], identity); + } + } }); }); } - sycl::event e4 = exec_q.submit([&](sycl::handler &cgh) { + sycl::event free_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(dependent_event); const auto &ctx = exec_q.get_context(); cgh.host_task([ctx, temp]() { sycl::free(temp, ctx); }); }); - host_tasks.push_back(e4); + host_tasks.push_back(free_ev); } return dependent_event; } -typedef size_t (*accumulate_contig_impl_fn_ptr_t)( +typedef sycl::event (*accumulate_strided_impl_fn_ptr_t)( + sycl::queue &, + size_t, + size_t, + const char *, + int, + const ssize_t *, + ssize_t, + ssize_t, + int, + const ssize_t *, + char *, + std::vector &, + const std::vector &); + +template +sycl::event +accumulate_strided_impl(sycl::queue &q, + size_t iter_nelems, + size_t acc_nelems, + const char *src, + int iter_nd, + const ssize_t *iter_shape_strides, + ssize_t inp_iter_offset, + ssize_t out_iter_offset, + int acc_nd, + const ssize_t *acc_shape_strides, + char *dst, + std::vector &host_tasks, + const std::vector &depends = {}) +{ + const srcT *src_data_ptr = reinterpret_cast(src); + dstT *dst_data_ptr = reinterpret_cast(dst); + + using InpIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const InpIndexerT inp_axis_indexer{acc_nd, 0, acc_shape_strides}; + const InpIndexerT inp_iter_indexer{iter_nd, inp_iter_offset, + iter_shape_strides}; + + using OutIndexerT = dpctl::tensor::offset_utils::UnpackedStridedIndexer; + const OutIndexerT out_axis_indexer{acc_nd, 0, acc_shape_strides, + acc_shape_strides + 2 * acc_nd}; + const OutIndexerT out_iter_indexer{iter_nd, out_iter_offset, + iter_shape_strides, + iter_shape_strides + 2 * iter_nd}; + + constexpr transformerT transformer{}; + + constexpr size_t s0 = 0; + constexpr size_t s1 = 1; + + const sycl::device &dev = q.get_device(); + sycl::event comp_ev; + if (dev.has(sycl::aspect::cpu)) { + constexpr nwiT n_wi_for_cpu = 8; + const size_t wg_size = 256; + comp_ev = + inclusive_scan_iter( + q, wg_size, iter_nelems, acc_nelems, src_data_ptr, dst_data_ptr, + s0, s1, inp_iter_indexer, out_iter_indexer, inp_axis_indexer, + out_axis_indexer, transformer, host_tasks, depends); + } + else { + constexpr nwiT n_wi_for_gpu = 4; + const size_t wg_size = 256; + comp_ev = + inclusive_scan_iter( + q, wg_size, iter_nelems, acc_nelems, src_data_ptr, dst_data_ptr, + s0, s1, inp_iter_indexer, out_iter_indexer, inp_axis_indexer, + out_axis_indexer, transformer, host_tasks, depends); + } + + return comp_ev; +} + +typedef size_t (*cumsum_val_contig_impl_fn_ptr_t)( sycl::queue &, size_t, const char *, @@ -340,7 +890,7 @@ typedef size_t (*accumulate_contig_impl_fn_ptr_t)( const std::vector &); template -size_t accumulate_contig_impl(sycl::queue &q, +size_t cumsum_val_contig_impl(sycl::queue &q, size_t n_elems, const char *mask, char *cumsum, @@ -350,29 +900,34 @@ size_t accumulate_contig_impl(sycl::queue &q, const maskT *mask_data_ptr = reinterpret_cast(mask); cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); - NoOpIndexer flat_indexer{}; - transformerT non_zero_indicator{}; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + constexpr NoOpIndexerT flat_indexer{}; + constexpr transformerT transformer{}; - sycl::event comp_ev; + constexpr size_t s0 = 0; + constexpr size_t s1 = 1; + constexpr bool include_initial = false; + using AccumulateOpT = sycl::plus; + sycl::event comp_ev; const sycl::device &dev = q.get_device(); if (dev.has(sycl::aspect::cpu)) { constexpr nwiT n_wi_for_cpu = 8; - size_t 
wg_size = 256; - comp_ev = inclusive_scan_iter( - q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, 0, 1, - flat_indexer, non_zero_indicator, host_tasks, depends); + const size_t wg_size = 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, s0, s1, + flat_indexer, transformer, host_tasks, depends); } else { constexpr nwiT n_wi_for_gpu = 4; - size_t wg_size = 256; - comp_ev = inclusive_scan_iter( - q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, 0, 1, - flat_indexer, non_zero_indicator, host_tasks, depends); + const size_t wg_size = 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, s0, s1, + flat_indexer, transformer, host_tasks, depends); } cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); @@ -398,7 +953,7 @@ template struct MaskPositionsContigFactoryForInt32 { using cumsumT = std::int32_t; fnT fn = - accumulate_contig_impl>; + cumsum_val_contig_impl>; return fn; } }; @@ -409,7 +964,7 @@ template struct MaskPositionsContigFactoryForInt64 { using cumsumT = std::int64_t; fnT fn = - accumulate_contig_impl>; + cumsum_val_contig_impl>; return fn; } }; @@ -421,7 +976,7 @@ template struct Cumsum1DContigFactory if constexpr (std::is_integral_v) { using cumsumT = std::int64_t; fnT fn = - accumulate_contig_impl>; + cumsum_val_contig_impl>; return fn; } else { @@ -430,7 +985,7 @@ template struct Cumsum1DContigFactory } }; -typedef size_t (*accumulate_strided_impl_fn_ptr_t)( +typedef size_t (*cumsum_val_strided_impl_fn_ptr_t)( sycl::queue &, size_t, const char *, @@ -441,7 +996,7 @@ typedef size_t (*accumulate_strided_impl_fn_ptr_t)( const std::vector &); template -size_t accumulate_strided_impl(sycl::queue &q, +size_t cumsum_val_strided_impl(sycl::queue &q, size_t n_elems, const char *mask, int nd, @@ -453,28 +1008,34 @@ size_t accumulate_strided_impl(sycl::queue &q, const maskT *mask_data_ptr = reinterpret_cast(mask); cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); - StridedIndexer strided_indexer{nd, 0, shape_strides}; - transformerT non_zero_indicator{}; + using StridedIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const StridedIndexerT strided_indexer{nd, 0, shape_strides}; + constexpr transformerT transformer{}; + + constexpr size_t s0 = 0; + constexpr size_t s1 = 1; + constexpr bool include_initial = false; + using AccumulateOpT = sycl::plus; const sycl::device &dev = q.get_device(); sycl::event comp_ev; if (dev.has(sycl::aspect::cpu)) { constexpr nwiT n_wi_for_cpu = 8; - size_t wg_size = 256; - comp_ev = inclusive_scan_iter( - q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, 0, 1, - strided_indexer, non_zero_indicator, host_tasks, depends); + const size_t wg_size = 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, s0, s1, + strided_indexer, transformer, host_tasks, depends); } else { constexpr nwiT n_wi_for_gpu = 4; - size_t wg_size = 256; - comp_ev = inclusive_scan_iter( - q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, 0, 1, - strided_indexer, non_zero_indicator, host_tasks, depends); + const size_t wg_size = 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, s0, s1, + strided_indexer, transformer, host_tasks, depends); } cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); @@ -501,7 +1062,7 @@ template struct MaskPositionsStridedFactoryForInt32 { using cumsumT = std::int32_t; fnT fn = - accumulate_strided_impl>; + cumsum_val_strided_impl>; return fn; } }; @@ -512,7 +1073,7 @@ 
template struct MaskPositionsStridedFactoryForInt64 { using cumsumT = std::int64_t; fnT fn = - accumulate_strided_impl>; + cumsum_val_strided_impl>; return fn; } }; @@ -524,7 +1085,7 @@ template struct Cumsum1DStridedFactory if constexpr (std::is_integral_v) { using cumsumT = std::int64_t; fnT fn = - accumulate_strided_impl>; + cumsum_val_strided_impl>; return fn; } else { diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp index 6301b3b9bc..75ab1d9341 100644 --- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp @@ -25,6 +25,7 @@ #pragma once #include #include +#include #include #include #include @@ -154,6 +155,31 @@ T custom_reduce_over_group(const GroupT &wg, return sycl::group_broadcast(wg, red_val_over_wg); } +template +T custom_inclusive_scan_over_group(const GroupT &wg, + LocAccT local_mem_acc, + const T local_val, + const OpT &op) +{ + const std::uint32_t local_id = wg.get_local_id(0); + const std::uint32_t wgs = wg.get_local_range(0); + local_mem_acc[local_id] = local_val; + + sycl::group_barrier(wg, sycl::memory_scope::work_group); + + if (wg.leader()) { + T scan_val = local_mem_acc[0]; + for (std::uint32_t i = 1; i < wgs; ++i) { + scan_val = op(local_mem_acc[i], scan_val); + local_mem_acc[i] = scan_val; + } + } + + // ensure all work-items see the same SLM that leader updated + sycl::group_barrier(wg, sycl::memory_scope::work_group); + return local_mem_acc[local_id]; +} + // Reduction functors // Maximum diff --git a/dpctl/tensor/libtensor/source/accumulators.cpp b/dpctl/tensor/libtensor/source/accumulators.cpp index b82154558d..3a5a93620e 100644 --- a/dpctl/tensor/libtensor/source/accumulators.cpp +++ b/dpctl/tensor/libtensor/source/accumulators.cpp @@ -49,23 +49,23 @@ namespace py_internal namespace td_ns = dpctl::tensor::type_dispatch; -using dpctl::tensor::kernels::accumulators::accumulate_contig_impl_fn_ptr_t; -static accumulate_contig_impl_fn_ptr_t +using dpctl::tensor::kernels::accumulators::cumsum_val_contig_impl_fn_ptr_t; +static cumsum_val_contig_impl_fn_ptr_t mask_positions_contig_i64_dispatch_vector[td_ns::num_types]; -static accumulate_contig_impl_fn_ptr_t +static cumsum_val_contig_impl_fn_ptr_t mask_positions_contig_i32_dispatch_vector[td_ns::num_types]; -using dpctl::tensor::kernels::accumulators::accumulate_strided_impl_fn_ptr_t; -static accumulate_strided_impl_fn_ptr_t +using dpctl::tensor::kernels::accumulators::cumsum_val_strided_impl_fn_ptr_t; +static cumsum_val_strided_impl_fn_ptr_t mask_positions_strided_i64_dispatch_vector[td_ns::num_types]; -static accumulate_strided_impl_fn_ptr_t +static cumsum_val_strided_impl_fn_ptr_t mask_positions_strided_i32_dispatch_vector[td_ns::num_types]; void populate_mask_positions_dispatch_vectors(void) { using dpctl::tensor::kernels::accumulators:: MaskPositionsContigFactoryForInt64; - td_ns::DispatchVectorBuilder dvb1; @@ -73,7 +73,7 @@ void populate_mask_positions_dispatch_vectors(void) using dpctl::tensor::kernels::accumulators:: MaskPositionsContigFactoryForInt32; - td_ns::DispatchVectorBuilder dvb2; @@ -81,7 +81,7 @@ void populate_mask_positions_dispatch_vectors(void) using dpctl::tensor::kernels::accumulators:: MaskPositionsStridedFactoryForInt64; - td_ns::DispatchVectorBuilder dvb3; @@ -89,7 +89,7 @@ void populate_mask_positions_dispatch_vectors(void) using dpctl::tensor::kernels::accumulators:: MaskPositionsStridedFactoryForInt32; - td_ns::DispatchVectorBuilder dvb4; @@ -226,23 +226,23 @@ 
size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask, return total_set; } -using dpctl::tensor::kernels::accumulators::accumulate_strided_impl_fn_ptr_t; -static accumulate_strided_impl_fn_ptr_t +using dpctl::tensor::kernels::accumulators::cumsum_val_strided_impl_fn_ptr_t; +static cumsum_val_strided_impl_fn_ptr_t cumsum_1d_strided_dispatch_vector[td_ns::num_types]; -using dpctl::tensor::kernels::accumulators::accumulate_contig_impl_fn_ptr_t; -static accumulate_contig_impl_fn_ptr_t +using dpctl::tensor::kernels::accumulators::cumsum_val_contig_impl_fn_ptr_t; +static cumsum_val_contig_impl_fn_ptr_t cumsum_1d_contig_dispatch_vector[td_ns::num_types]; void populate_cumsum_1d_dispatch_vectors(void) { using dpctl::tensor::kernels::accumulators::Cumsum1DContigFactory; - td_ns::DispatchVectorBuilder dvb1; dvb1.populate_dispatch_vector(cumsum_1d_contig_dispatch_vector); using dpctl::tensor::kernels::accumulators::Cumsum1DStridedFactory; - td_ns::DispatchVectorBuilder dvb2; dvb2.populate_dispatch_vector(cumsum_1d_strided_dispatch_vector); diff --git a/dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp b/dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp new file mode 100644 index 0000000000..b051c1703d --- /dev/null +++ b/dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp @@ -0,0 +1,457 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kernels/accumulators.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +template +std::pair +py_accumulate_over_axis(const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_accumulate, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + std::vector const &depends, + const strided_fnT &strided_dispatch_table, + const contig_fnT &contig_dispatch_table) +{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + if (src_nd != dst_nd) { + throw py::value_error("The input and output arrays must have " + "the same array ranks"); + } + int iter_nd = src_nd - trailing_dims_to_accumulate; + if (trailing_dims_to_accumulate <= 0 || iter_nd < 0) { + throw py::value_error( + "trailing_dims_to_accumulate must be positive, but no " + "greater than rank of the input array"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + size_t iter_nelems(1); + for (int i = 0; same_shapes && (i < iter_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); + iter_nelems *= static_cast(src_shape_i); + } + + size_t acc_nelems(1); + for (int i = iter_nd; same_shapes && (i < src_nd); ++i) { + auto dst_shape_i = dst_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_i); + acc_nelems *= static_cast(dst_shape_i); + } + + if (!same_shapes) { + throw py::value_error( + "Destination shape does not match the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + if ((iter_nelems == 0) || (acc_nelems == 0)) { + return std::make_pair(sycl::event(), sycl::event()); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample( + dst, acc_nelems * iter_nelems); + + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + + std::vector host_task_events; + + if ((is_src_c_contig && is_dst_c_contig) && iter_nd == 0) { + auto fn = contig_dispatch_table[src_typeid][dst_typeid]; + if (fn == nullptr) { + throw std::runtime_error("Datatypes are not supported"); + } + + sycl::event acc_ev = fn(exec_q, acc_nelems, src_data, dst_data, + 
host_task_events, depends); + + return std::make_pair( + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {acc_ev}), + acc_ev); + } + + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + int acc_nd = trailing_dims_to_accumulate; + + using shT = std::vector; + shT acc_shape(std::begin(src_shape_vec) + iter_nd, std::end(src_shape_vec)); + + shT acc_src_strides(std::begin(src_strides_vec) + iter_nd, + std::end(src_strides_vec)); + + shT acc_dst_strides(std::begin(dst_strides_vec) + iter_nd, + std::end(dst_strides_vec)); + + shT iter_shape(std::begin(src_shape_vec), + std::begin(src_shape_vec) + iter_nd); + + shT iter_src_strides(std::begin(src_strides_vec), + std::begin(src_strides_vec) + iter_nd); + + shT iter_dst_strides(std::begin(dst_strides_vec), + std::begin(dst_strides_vec) + iter_nd); + + shT simplified_iter_shape; + shT simplified_iter_src_strides; + shT simplified_iter_dst_strides; + py::ssize_t iter_src_offset(0); + py::ssize_t iter_dst_offset(0); + + if (iter_nd == 0) { + iter_nd = 1; + simplified_iter_shape.push_back(1); + simplified_iter_src_strides.push_back(0); + simplified_iter_dst_strides.push_back(0); + } + else { + simplify_iteration_space( + iter_nd, src_shape_ptr, iter_src_strides, iter_dst_strides, + // output + simplified_iter_shape, simplified_iter_src_strides, + simplified_iter_dst_strides, iter_src_offset, iter_dst_offset); + } + + // Strided implementation + auto strided_fn = strided_dispatch_table[src_typeid][dst_typeid]; + if (strided_fn == nullptr) { + throw std::runtime_error("Datatypes are not supported"); + } + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_iter_shape, + simplified_iter_src_strides, simplified_iter_dst_strides, acc_shape, + acc_src_strides, acc_dst_strides); + py::ssize_t *packed_shapes_and_strides = std::get<0>(ptr_size_event_tuple); + if (packed_shapes_and_strides == nullptr) { + throw std::runtime_error("Unexpected error"); + } + const auto ©_shapes_strides_ev = std::get<2>(ptr_size_event_tuple); + + py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides; + py::ssize_t *acc_shapes_and_strides = + packed_shapes_and_strides + 3 * simplified_iter_shape.size(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), copy_shapes_strides_ev); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + + sycl::event acc_ev = strided_fn( + exec_q, iter_nelems, acc_nelems, src_data, iter_nd, + iter_shape_and_strides, iter_src_offset, iter_dst_offset, acc_nd, + acc_shapes_and_strides, dst_data, host_task_events, all_deps); + + sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(acc_ev); + const auto &ctx = exec_q.get_context(); + cgh.host_task([ctx, packed_shapes_and_strides] { + sycl::free(packed_shapes_and_strides, ctx); + }); + }); + host_task_events.push_back(temp_cleanup_ev); + + return std::make_pair( + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events), + acc_ev); +} + +template +std::pair py_accumulate_final_axis_include_initial( + const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + std::vector const &depends, + const strided_fnT &strided_dispatch_table, + const contig_fnT &contig_dispatch_table) +{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + 
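// For this include_initial entry point the accumulation always runs over the
// final axis: dst must match src in every iteration dimension, and its last
// axis is expected to be one element longer than the last axis of src (both
// properties are validated below).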
+ if (src_nd != dst_nd) { + throw py::value_error("The input and output arrays must have " + "the same array ranks"); + } + + constexpr int acc_nd = 1; + + int iter_nd = src_nd - acc_nd; + if (iter_nd < 0) { + throw py::value_error("accumulation axis must not be greater than rank " + "of the input array"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + size_t iter_nelems(1); + for (int i = 0; same_shapes && (i < iter_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); + iter_nelems *= static_cast(src_shape_i); + } + + size_t acc_nelems(1); + for (int i = iter_nd; same_shapes && (i < src_nd); ++i) { + auto dst_shape_i = dst_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_ptr[i] + 1 == dst_shape_i); + acc_nelems *= static_cast(dst_shape_i); + } + + if (!same_shapes) { + throw py::value_error( + "Destination shape does not match the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + if ((iter_nelems == 0) || (acc_nelems == 0)) { + return std::make_pair(sycl::event(), sycl::event()); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample( + dst, acc_nelems * iter_nelems); + + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + + std::vector host_task_events; + + if ((is_src_c_contig && is_dst_c_contig) && iter_nd == 0) { + auto fn = contig_dispatch_table[src_typeid][dst_typeid]; + if (fn == nullptr) { + throw std::runtime_error("Datatypes are not supported"); + } + + sycl::event acc_ev = fn(exec_q, acc_nelems, src_data, dst_data, + host_task_events, depends); + + return std::make_pair( + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {acc_ev}), + acc_ev); + } + + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + using shT = std::vector; + shT acc_shape(std::begin(src_shape_vec) + iter_nd, std::end(src_shape_vec)); + + shT acc_src_strides(std::begin(src_strides_vec) + iter_nd, + std::end(src_strides_vec)); + + shT acc_dst_strides(std::begin(dst_strides_vec) + iter_nd, + std::end(dst_strides_vec)); + + shT iter_shape(std::begin(src_shape_vec), + std::begin(src_shape_vec) + iter_nd); + + shT iter_src_strides(std::begin(src_strides_vec), + std::begin(src_strides_vec) + iter_nd); + + shT iter_dst_strides(std::begin(dst_strides_vec), + std::begin(dst_strides_vec) + iter_nd); + + shT simplified_iter_shape; + shT simplified_iter_src_strides; + shT simplified_iter_dst_strides; + py::ssize_t iter_src_offset(0); + py::ssize_t iter_dst_offset(0); + + if (iter_nd == 0) { + iter_nd = 1; + simplified_iter_shape.push_back(1); + 
        simplified_iter_src_strides.push_back(0);
+        simplified_iter_dst_strides.push_back(0);
+    }
+    else {
+        simplify_iteration_space(
+            iter_nd, src_shape_ptr, iter_src_strides, iter_dst_strides,
+            // output
+            simplified_iter_shape, simplified_iter_src_strides,
+            simplified_iter_dst_strides, iter_src_offset, iter_dst_offset);
+    }
+
+    // Strided implementation
+    auto strided_fn = strided_dispatch_table[src_typeid][dst_typeid];
+    if (strided_fn == nullptr) {
+        throw std::runtime_error("Datatypes are not supported");
+    }
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    const auto &ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events, simplified_iter_shape,
+        simplified_iter_src_strides, simplified_iter_dst_strides, acc_shape,
+        acc_src_strides, acc_dst_strides);
+    py::ssize_t *packed_shapes_and_strides = std::get<0>(ptr_size_event_tuple);
+    if (packed_shapes_and_strides == nullptr) {
+        throw std::runtime_error("Unexpected error");
+    }
+    const auto &copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple);
+
+    py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides;
+    py::ssize_t *acc_shapes_and_strides =
+        packed_shapes_and_strides + 3 * simplified_iter_shape.size();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.insert(all_deps.end(), copy_shapes_strides_ev);
+    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+
+    sycl::event acc_ev = strided_fn(
+        exec_q, iter_nelems, acc_nelems, src_data, iter_nd,
+        iter_shape_and_strides, iter_src_offset, iter_dst_offset, acc_nd,
+        acc_shapes_and_strides, dst_data, host_task_events, all_deps);
+
+    sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(acc_ev);
+        const auto &ctx = exec_q.get_context();
+        cgh.host_task([ctx, packed_shapes_and_strides] {
+            sycl::free(packed_shapes_and_strides, ctx);
+        });
+    });
+    host_task_events.push_back(temp_cleanup_ev);
+
+    return std::make_pair(
+        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events),
+        acc_ev);
+}
+
+/*!
@brief Template implementing Python API for querying accumulation + * type support */ +template +bool py_accumulate_dtype_supported(const py::dtype &input_dtype, + const py::dtype &output_dtype, + const fnT &dispatch_table) +{ + int arg_tn = + input_dtype.num(); // NumPy type numbers are the same as in dpctl + int out_tn = + output_dtype.num(); // NumPy type numbers are the same as in dpctl + int arg_typeid = -1; + int out_typeid = -1; + + auto array_types = td_ns::usm_ndarray_types(); + + try { + arg_typeid = array_types.typenum_to_lookup_id(arg_tn); + out_typeid = array_types.typenum_to_lookup_id(out_tn); + } catch (const std::exception &e) { + throw py::value_error(e.what()); + } + + if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 || + out_typeid >= td_ns::num_types) + { + throw std::runtime_error("Reduction type support check: lookup failed"); + } + + // remove_all_extents gets underlying type of table + using fn_ptrT = typename std::remove_all_extents::type; + fn_ptrT fn = nullptr; + + fn = dispatch_table[arg_typeid][out_typeid]; + + return (fn != nullptr); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/accumulators/accumulators_common.cpp b/dpctl/tensor/libtensor/source/accumulators/accumulators_common.cpp new file mode 100644 index 0000000000..4b7de25df7 --- /dev/null +++ b/dpctl/tensor/libtensor/source/accumulators/accumulators_common.cpp @@ -0,0 +1,50 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include + +#include "cumulative_logsumexp.hpp" +#include "cumulative_prod.hpp" +#include "cumulative_sum.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +/*! @brief Add accumulators to Python module */ +void init_accumulator_functions(py::module_ m) +{ + init_cumulative_logsumexp(m); + init_cumulative_prod(m); + init_cumulative_sum(m); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/accumulators/accumulators_common.hpp b/dpctl/tensor/libtensor/source/accumulators/accumulators_common.hpp new file mode 100644 index 0000000000..a61eb3a63a --- /dev/null +++ b/dpctl/tensor/libtensor/source/accumulators/accumulators_common.hpp @@ -0,0 +1,41 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_accumulator_functions(py::module_); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp b/dpctl/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp new file mode 100644 index 0000000000..4cf2ec6fe9 --- /dev/null +++ b/dpctl/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp @@ -0,0 +1,344 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include +#include + +#include "accumulate_over_axis.hpp" +#include "kernels/accumulators.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_dispatch_building.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace su_ns = dpctl::tensor::sycl_utils; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::accumulators::accumulate_1d_contig_impl_fn_ptr_t; +static accumulate_1d_contig_impl_fn_ptr_t + cumlogsumexp_1d_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; + +using dpctl::tensor::kernels::accumulators::accumulate_strided_impl_fn_ptr_t; +static accumulate_strided_impl_fn_ptr_t + cumlogsumexp_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static accumulate_1d_contig_impl_fn_ptr_t + cumlogsumexp_1d_include_initial_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +static accumulate_strided_impl_fn_ptr_t + cumlogsumexp_include_initial_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct TypePairSupportDataForLogSumExpAccumulation +{ + static constexpr bool is_defined = std::disjunction< + // disjunction is C++17 feature, supported by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct CumLogSumExp1DContigFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForLogSumExpAccumulation< + srcTy, dstTy>::is_defined) + { + using ScanOpT = su_ns::LogSumExp; + constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumLogSumExp1DIncludeInitialContigFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForLogSumExpAccumulation< + 
srcTy, dstTy>::is_defined) + { + using ScanOpT = su_ns::LogSumExp; + constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumLogSumExpStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForLogSumExpAccumulation< + srcTy, dstTy>::is_defined) + { + using ScanOpT = su_ns::LogSumExp; + constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumLogSumExpIncludeInitialStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForLogSumExpAccumulation< + srcTy, dstTy>::is_defined) + { + using ScanOpT = su_ns::LogSumExp; + constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +void populate_cumlogsumexp_dispatch_tables(void) +{ + td_ns::DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(cumlogsumexp_1d_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(cumlogsumexp_strided_dispatch_table); + + td_ns::DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table( + cumlogsumexp_1d_include_initial_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table( + cumlogsumexp_include_initial_strided_dispatch_table); + + return; +} + +} // namespace impl + +void init_cumulative_logsumexp(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + + using impl::populate_cumlogsumexp_dispatch_tables; + populate_cumlogsumexp_dispatch_tables(); + + using impl::cumlogsumexp_1d_contig_dispatch_table; + using impl::cumlogsumexp_strided_dispatch_table; + auto cumlogsumexp_pyapi = [&](const arrayT &src, + int trailing_dims_to_accumulate, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_accumulate_over_axis; + return py_accumulate_over_axis(src, trailing_dims_to_accumulate, dst, + exec_q, depends, + cumlogsumexp_strided_dispatch_table, + cumlogsumexp_1d_contig_dispatch_table); + }; + m.def("_cumlogsumexp_over_axis", cumlogsumexp_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_accumulate"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + using impl::cumlogsumexp_1d_include_initial_contig_dispatch_table; + using 
impl::cumlogsumexp_include_initial_strided_dispatch_table; + auto cumlogsumexp_include_initial_pyapi = + [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal:: + py_accumulate_final_axis_include_initial; + return py_accumulate_final_axis_include_initial( + src, dst, exec_q, depends, + cumlogsumexp_include_initial_strided_dispatch_table, + cumlogsumexp_1d_include_initial_contig_dispatch_table); + }; + m.def("_cumlogsumexp_final_axis_include_initial", + cumlogsumexp_include_initial_pyapi, "", py::arg("src"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + auto cumlogsumexp_dtype_supported = [&](const py::dtype &input_dtype, + const py::dtype &output_dtype) { + using dpctl::tensor::py_internal::py_accumulate_dtype_supported; + return py_accumulate_dtype_supported( + input_dtype, output_dtype, cumlogsumexp_strided_dispatch_table); + }; + m.def("_cumlogsumexp_dtype_supported", cumlogsumexp_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype")); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/accumulators/cumulative_logsumexp.hpp b/dpctl/tensor/libtensor/source/accumulators/cumulative_logsumexp.hpp new file mode 100644 index 0000000000..991508d5d7 --- /dev/null +++ b/dpctl/tensor/libtensor/source/accumulators/cumulative_logsumexp.hpp @@ -0,0 +1,41 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_cumulative_logsumexp(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/accumulators/cumulative_prod.cpp b/dpctl/tensor/libtensor/source/accumulators/cumulative_prod.cpp new file mode 100644 index 0000000000..7982e0ff41 --- /dev/null +++ b/dpctl/tensor/libtensor/source/accumulators/cumulative_prod.cpp @@ -0,0 +1,348 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include +#include + +#include "accumulate_over_axis.hpp" +#include "kernels/accumulators.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_dispatch_building.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace su_ns = dpctl::tensor::sycl_utils; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::accumulators::accumulate_1d_contig_impl_fn_ptr_t; +static accumulate_1d_contig_impl_fn_ptr_t + cumprod_1d_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; + +using dpctl::tensor::kernels::accumulators::accumulate_strided_impl_fn_ptr_t; +static accumulate_strided_impl_fn_ptr_t + cumprod_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static accumulate_1d_contig_impl_fn_ptr_t + cumprod_1d_include_initial_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +static accumulate_strided_impl_fn_ptr_t + cumprod_include_initial_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct TypePairSupportDataForProdAccumulation +{ + + // disjunction is C++17 feature, supported by DPC++ input bool + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct CumProd1DContigFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForProdAccumulation::is_defined) + { + using ScanOpT = sycl::multiplies; + constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using 
dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumProd1DIncludeInitialContigFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForProdAccumulation::is_defined) + { + using ScanOpT = sycl::multiplies; + constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumProdStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForProdAccumulation::is_defined) + { + using ScanOpT = sycl::multiplies; + constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumProdIncludeInitialStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForProdAccumulation::is_defined) + { + using ScanOpT = sycl::multiplies; + constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +void populate_cumprod_dispatch_tables(void) +{ + td_ns::DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(cumprod_1d_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(cumprod_strided_dispatch_table); + + td_ns::DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table( + cumprod_1d_include_initial_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table( + cumprod_include_initial_strided_dispatch_table); + + return; +} + +} // namespace impl + +void init_cumulative_prod(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + + using impl::populate_cumprod_dispatch_tables; + populate_cumprod_dispatch_tables(); + + using impl::cumprod_1d_contig_dispatch_table; + using impl::cumprod_strided_dispatch_table; + auto cumprod_pyapi = [&](const arrayT &src, int trailing_dims_to_accumulate, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_accumulate_over_axis; + return py_accumulate_over_axis( + src, trailing_dims_to_accumulate, dst, exec_q, depends, + cumprod_strided_dispatch_table, cumprod_1d_contig_dispatch_table); + }; + m.def("_cumprod_over_axis", cumprod_pyapi, "", 
py::arg("src"), + py::arg("trailing_dims_to_accumulate"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + using impl::cumprod_1d_include_initial_contig_dispatch_table; + using impl::cumprod_include_initial_strided_dispatch_table; + auto cumprod_include_initial_pyapi = + [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal:: + py_accumulate_final_axis_include_initial; + return py_accumulate_final_axis_include_initial( + src, dst, exec_q, depends, + cumprod_include_initial_strided_dispatch_table, + cumprod_1d_include_initial_contig_dispatch_table); + }; + m.def("_cumprod_final_axis_include_initial", cumprod_include_initial_pyapi, + "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + auto cumprod_dtype_supported = [&](const py::dtype &input_dtype, + const py::dtype &output_dtype) { + using dpctl::tensor::py_internal::py_accumulate_dtype_supported; + return py_accumulate_dtype_supported(input_dtype, output_dtype, + cumprod_strided_dispatch_table); + }; + m.def("_cumprod_dtype_supported", cumprod_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype")); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/accumulators/cumulative_prod.hpp b/dpctl/tensor/libtensor/source/accumulators/cumulative_prod.hpp new file mode 100644 index 0000000000..b9cced2879 --- /dev/null +++ b/dpctl/tensor/libtensor/source/accumulators/cumulative_prod.hpp @@ -0,0 +1,41 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_cumulative_prod(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/accumulators/cumulative_sum.cpp b/dpctl/tensor/libtensor/source/accumulators/cumulative_sum.cpp new file mode 100644 index 0000000000..59e5830fe6 --- /dev/null +++ b/dpctl/tensor/libtensor/source/accumulators/cumulative_sum.cpp @@ -0,0 +1,347 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include +#include + +#include "accumulate_over_axis.hpp" +#include "kernels/accumulators.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_dispatch_building.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace su_ns = dpctl::tensor::sycl_utils; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::accumulators::accumulate_1d_contig_impl_fn_ptr_t; +static accumulate_1d_contig_impl_fn_ptr_t + cumsum_1d_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; + +using dpctl::tensor::kernels::accumulators::accumulate_strided_impl_fn_ptr_t; +static accumulate_strided_impl_fn_ptr_t + cumsum_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static accumulate_1d_contig_impl_fn_ptr_t + cumsum_1d_include_initial_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +static accumulate_strided_impl_fn_ptr_t + cumsum_include_initial_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct TypePairSupportDataForSumAccumulation +{ + + // disjunction is C++17 feature, supported by DPC++ input bool + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct CumSum1DContigFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForSumAccumulation::is_defined) + { + using ScanOpT = sycl::plus; + constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using 
dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumSum1DIncludeInitialContigFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForSumAccumulation::is_defined) + { + using ScanOpT = sycl::plus; + constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumSumStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForSumAccumulation::is_defined) + { + using ScanOpT = sycl::plus; + constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumSumIncludeInitialStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForSumAccumulation::is_defined) + { + using ScanOpT = sycl::plus; + constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +void populate_cumsum_dispatch_tables(void) +{ + td_ns::DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(cumsum_1d_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(cumsum_strided_dispatch_table); + + td_ns::DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table( + cumsum_1d_include_initial_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(cumsum_include_initial_strided_dispatch_table); + + return; +} + +} // namespace impl + +void init_cumulative_sum(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + + using impl::populate_cumsum_dispatch_tables; + populate_cumsum_dispatch_tables(); + + using impl::cumsum_1d_contig_dispatch_table; + using impl::cumsum_strided_dispatch_table; + auto cumsum_pyapi = [&](const arrayT &src, int trailing_dims_to_accumulate, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_accumulate_over_axis; + return py_accumulate_over_axis( + src, trailing_dims_to_accumulate, dst, exec_q, depends, + cumsum_strided_dispatch_table, cumsum_1d_contig_dispatch_table); + }; + m.def("_cumsum_over_axis", cumsum_pyapi, "", py::arg("src"), + 
py::arg("trailing_dims_to_accumulate"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + using impl::cumsum_1d_include_initial_contig_dispatch_table; + using impl::cumsum_include_initial_strided_dispatch_table; + auto cumsum_include_initial_pyapi = + [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal:: + py_accumulate_final_axis_include_initial; + return py_accumulate_final_axis_include_initial( + src, dst, exec_q, depends, + cumsum_include_initial_strided_dispatch_table, + cumsum_1d_include_initial_contig_dispatch_table); + }; + m.def("_cumsum_final_axis_include_initial", cumsum_include_initial_pyapi, + "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + auto cumsum_dtype_supported = [&](const py::dtype &input_dtype, + const py::dtype &output_dtype) { + using dpctl::tensor::py_internal::py_accumulate_dtype_supported; + return py_accumulate_dtype_supported(input_dtype, output_dtype, + cumsum_strided_dispatch_table); + }; + m.def("_cumsum_dtype_supported", cumsum_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype")); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/accumulators/cumulative_sum.hpp b/dpctl/tensor/libtensor/source/accumulators/cumulative_sum.hpp new file mode 100644 index 0000000000..68a4d5d282 --- /dev/null +++ b/dpctl/tensor/libtensor/source/accumulators/cumulative_sum.hpp @@ -0,0 +1,41 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_cumulative_sum(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/repeat.cpp b/dpctl/tensor/libtensor/source/repeat.cpp index f3f8f0b9f1..1e0be8adb3 100644 --- a/dpctl/tensor/libtensor/source/repeat.cpp +++ b/dpctl/tensor/libtensor/source/repeat.cpp @@ -296,9 +296,9 @@ py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, assert(std::equal(orthog_src_shape.begin(), orthog_src_shape.end(), orthog_dst_shape.begin())); - std::vector simplified_orthog_shape; - std::vector simplified_orthog_src_strides; - std::vector simplified_orthog_dst_strides; + shT simplified_orthog_shape; + shT simplified_orthog_src_strides; + shT simplified_orthog_dst_strides; const py::ssize_t *_shape = orthog_src_shape.data(); @@ -671,9 +671,9 @@ py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, assert(std::equal(orthog_src_shape.begin(), orthog_src_shape.end(), orthog_dst_shape.begin())); - std::vector simplified_orthog_shape; - std::vector simplified_orthog_src_strides; - std::vector simplified_orthog_dst_strides; + shT simplified_orthog_shape; + shT simplified_orthog_src_strides; + shT simplified_orthog_dst_strides; const py::ssize_t *_shape = orthog_src_shape.data(); diff --git a/dpctl/tensor/libtensor/source/tensor_accumulation.cpp b/dpctl/tensor/libtensor/source/tensor_accumulation.cpp new file mode 100644 index 0000000000..9aaa9bbc38 --- /dev/null +++ b/dpctl/tensor/libtensor/source/tensor_accumulation.cpp @@ -0,0 +1,35 @@ +//===-- tensor_accumulation.cpp - --*-C++-*-/===// +// Implementation of _tensor_accumulation_impl module +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include + +#include "accumulators/accumulators_common.hpp" + +namespace py = pybind11; + +PYBIND11_MODULE(_tensor_accumulation_impl, m) +{ + dpctl::tensor::py_internal::init_accumulator_functions(m); +} diff --git a/dpctl/tests/test_tensor_accumulation.py b/dpctl/tests/test_tensor_accumulation.py new file mode 100644 index 0000000000..97194ed5eb --- /dev/null +++ b/dpctl/tests/test_tensor_accumulation.py @@ -0,0 +1,411 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from random import randrange + +import pytest +from helper import get_queue_or_skip, skip_if_dtype_not_supported + +import dpctl.tensor as dpt +from dpctl.utils import ExecutionPlacementError + +sint_types = [ + dpt.int8, + dpt.int16, + dpt.int32, + dpt.int64, +] +uint_types = [ + dpt.uint8, + dpt.uint16, + dpt.uint32, + dpt.uint64, +] +rfp_types = [ + dpt.float16, + dpt.float32, + dpt.float64, +] +cfp_types = [ + dpt.complex64, + dpt.complex128, +] + +no_complex_types = [dpt.bool] + sint_types + uint_types + rfp_types + +all_types = [dpt.bool] + sint_types + uint_types + rfp_types + cfp_types + + +@pytest.mark.parametrize("dt", sint_types) +def test_contig_cumsum_sint(dt): + get_queue_or_skip() + n = 10000 + x = dpt.repeat(dpt.asarray([1, -1], dtype=dt), n) + + res = dpt.cumulative_sum(x, dtype=dt) + + ar = dpt.arange(n, dtype=dt) + expected = dpt.concat((1 + ar, dpt.flip(ar))) + assert dpt.all(res == expected) + + +@pytest.mark.parametrize("dt", sint_types) +def test_strided_cumsum_sint(dt): + get_queue_or_skip() + n = 10000 + x = dpt.repeat(dpt.asarray([1, -1], dtype=dt), 2 * n)[1::2] + + res = dpt.cumulative_sum(x, dtype=dt) + + ar = dpt.arange(n, dtype=dt) + expected = dpt.concat((1 + ar, dpt.flip(ar))) + assert dpt.all(res == expected) + + x2 = dpt.repeat(dpt.asarray([-1, 1], dtype=dt), 2 * n)[-1::-2] + + res = dpt.cumulative_sum(x2, dtype=dt) + + ar = dpt.arange(n, dtype=dt) + expected = dpt.concat((1 + ar, dpt.flip(ar))) + assert dpt.all(res == expected) + + +@pytest.mark.parametrize("dt", sint_types) +def test_contig_cumsum_axis_sint(dt): + get_queue_or_skip() + n0, n1 = 1000, 173 + x = dpt.repeat(dpt.asarray([1, -1], dtype=dt), n0) + m = dpt.tile(dpt.expand_dims(x, axis=1), (1, n1)) + + res = dpt.cumulative_sum(m, dtype=dt, axis=0) + + ar = dpt.arange(n0, dtype=dt) + expected = dpt.concat((1 + ar, dpt.flip(ar))) + assert dpt.all(res == dpt.expand_dims(expected, axis=1)) + + +@pytest.mark.parametrize("dt", sint_types) +def test_strided_cumsum_axis_sint(dt): + get_queue_or_skip() + n0, n1 = 1000, 173 + x = dpt.repeat(dpt.asarray([1, -1], dtype=dt), 2 * n0) + m = dpt.tile(dpt.expand_dims(x, axis=1), (1, n1))[1::2, ::-1] + + res = dpt.cumulative_sum(m, dtype=dt, axis=0) + + ar = 
dpt.arange(n0, dtype=dt) + expected = dpt.concat((1 + ar, dpt.flip(ar))) + assert dpt.all(res == dpt.expand_dims(expected, axis=1)) + + +def test_accumulate_scalar(): + get_queue_or_skip() + + s = dpt.asarray(1, dtype="i8") + r = dpt.cumulative_sum(s) + assert r == s + assert r.ndim == s.ndim + + r = dpt.cumulative_sum(s, include_initial=True) + r_expected = dpt.asarray([0, 1], dtype="i8") + assert dpt.all(r == r_expected) + + +def test_cumulative_sum_include_initial(): + get_queue_or_skip() + + n0, n1 = 3, 5 + x = dpt.ones((n0, n1), dtype="i4") + r = dpt.cumulative_sum(x, axis=0, include_initial=True) + assert dpt.all(r[0, :] == 0) + + r = dpt.cumulative_sum(x, axis=1, include_initial=True) + assert dpt.all(r[:, 0] == 0) + + x = dpt.ones(n1, dtype="i4") + r = dpt.cumulative_sum(x, include_initial=True) + assert r.shape == (n1 + 1,) + assert r[0] == 0 + + +def test_cumulative_prod_identity(): + get_queue_or_skip() + + x = dpt.zeros(5, dtype="i4") + r = dpt.cumulative_prod(x, include_initial=True) + assert r[0] == 1 + + +def test_cumulative_logsumexp_identity(): + get_queue_or_skip() + + x = dpt.ones(5, dtype="f4") + r = dpt.cumulative_logsumexp(x, include_initial=True) + assert r[0] == -dpt.inf + + +def test_accumulate_zero_size_dims(): + get_queue_or_skip() + + n0, n1, n2 = 3, 0, 5 + x = dpt.ones((n0, n1, n2), dtype="i8") + r = dpt.cumulative_sum(x, axis=1) + assert r.shape == x.shape + assert r.size == 0 + + r = dpt.cumulative_sum(x, axis=0) + assert r.shape == x.shape + assert r.size == 0 + + r = dpt.cumulative_sum(x, axis=1, include_initial=True) + assert r.shape == (n0, n1 + 1, n2) + assert r.size == (n0 * n2) + + r = dpt.cumulative_sum(x, axis=0, include_initial=True) + assert r.shape == (n0 + 1, n1, n2) + assert r.size == 0 + + +@pytest.mark.parametrize("arg_dtype", all_types) +def test_cumsum_arg_dtype_default_output_dtype_matrix(arg_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + n = 100 + x = dpt.ones(n, dtype=arg_dtype) + r = dpt.cumulative_sum(x) + + assert isinstance(r, dpt.usm_ndarray) + if x.dtype.kind == "i": + assert r.dtype.kind == "i" + elif x.dtype.kind == "u": + assert r.dtype.kind == "u" + elif x.dtype.kind == "fc": + assert r.dtype == arg_dtype + + r_expected = dpt.arange(1, n + 1, dtype=r.dtype) + + assert dpt.all(r == r_expected) + + +@pytest.mark.parametrize("arg_dtype", all_types) +@pytest.mark.parametrize("out_dtype", all_types) +def test_cumsum_arg_out_dtype_matrix(arg_dtype, out_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + skip_if_dtype_not_supported(out_dtype, q) + + n = 100 + x = dpt.ones(n, dtype=arg_dtype) + r = dpt.cumulative_sum(x, dtype=out_dtype) + + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == dpt.dtype(out_dtype) + if out_dtype == dpt.bool: + assert dpt.all(r) + else: + r_expected = dpt.arange(1, n + 1, dtype=out_dtype) + assert dpt.all(r == r_expected) + + +def test_accumulator_out_kwarg(): + q = get_queue_or_skip() + + n = 100 + + expected = dpt.arange(1, n + 1, dtype="i4", sycl_queue=q) + x = dpt.ones(n, dtype="i4", sycl_queue=q) + out = dpt.empty_like(x, dtype="i4") + dpt.cumulative_sum(x, dtype="i4", out=out) + assert dpt.all(expected == out) + + # overlap + x = dpt.ones(n, dtype="i4", sycl_queue=q) + dpt.cumulative_sum(x, dtype="i4", out=x) + assert dpt.all(x == expected) + + # axis before final axis + expected = dpt.broadcast_to( + dpt.arange(1, n + 1, dtype="i4", sycl_queue=q), (n, n) + ).mT + x = dpt.ones((n, n), dtype="i4", sycl_queue=q) + out = 
dpt.empty_like(x, dtype="i4") + dpt.cumulative_sum(x, axis=0, dtype="i4", out=out) + assert dpt.all(expected == out) + + # scalar + x = dpt.asarray(3, dtype="i4") + out = dpt.empty((), dtype="i4") + expected = 3 + dpt.cumulative_sum(x, dtype="i4", out=out) + assert expected == out + + +def test_accumulator_arg_validation(): + q1 = get_queue_or_skip() + q2 = get_queue_or_skip() + + n = 5 + x1 = dpt.ones((n, n), dtype="f4", sycl_queue=q1) + x2 = dpt.ones(n, dtype="f4", sycl_queue=q1) + + # must be usm_ndarray + with pytest.raises(TypeError): + dpt.cumulative_sum(dict()) + + # axis must be specified when input not 1D + with pytest.raises(ValueError): + dpt.cumulative_sum(x1) + + # out must be usm_ndarray + with pytest.raises(TypeError): + dpt.cumulative_sum(x2, out=dict()) + + # out must be writable + out_not_writable = dpt.empty_like(x2) + out_not_writable.flags.writable = False + with pytest.raises(ValueError): + dpt.cumulative_sum(x2, out=out_not_writable) + + # out must be expected shape + out_wrong_shape = dpt.ones(n + 1, dtype=x2.dtype, sycl_queue=q1) + with pytest.raises(ValueError): + dpt.cumulative_sum(x2, out=out_wrong_shape) + + # out must be expected dtype + out_wrong_dtype = dpt.empty_like(x2, dtype="i4") + with pytest.raises(ValueError): + dpt.cumulative_sum(x2, out=out_wrong_dtype) + + # compute follows data + out_wrong_queue = dpt.empty_like(x2, sycl_queue=q2) + with pytest.raises(ExecutionPlacementError): + dpt.cumulative_sum(x2, out=out_wrong_queue) + + +def test_cumsum_nan_propagation(): + get_queue_or_skip() + + n = 100 + x = dpt.ones(n, dtype="f4") + i = randrange(n) + x[i] = dpt.nan + + r = dpt.cumulative_sum(x) + assert dpt.all(dpt.isnan(r[i:])) + + +def test_cumprod_nan_propagation(): + get_queue_or_skip() + + n = 100 + x = dpt.ones(n, dtype="f4") + i = randrange(n) + x[i] = dpt.nan + + r = dpt.cumulative_prod(x) + assert dpt.all(dpt.isnan(r[i:])) + + +def test_logcumsumexp_nan_propagation(): + get_queue_or_skip() + + n = 100 + x = dpt.ones(n, dtype="f4") + i = randrange(n) + x[i] = dpt.nan + + r = dpt.cumulative_logsumexp(x) + assert dpt.all(dpt.isnan(r[i:])) + + +@pytest.mark.parametrize("arg_dtype", no_complex_types) +def test_logcumsumexp_arg_dtype_default_output_dtype_matrix(arg_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + x = dpt.ones(10, dtype=arg_dtype, sycl_queue=q) + r = dpt.cumulative_logsumexp(x) + + if arg_dtype.kind in "biu": + assert r.dtype.kind == "f" + else: + assert r.dtype == arg_dtype + + +def test_logcumsumexp_complex_error(): + get_queue_or_skip() + + x = dpt.ones(10, dtype="c8") + with pytest.raises(ValueError): + dpt.cumulative_logsumexp(x) + + +def test_cumprod_basic(): + get_queue_or_skip() + + n = 50 + val = 2 + x = dpt.full(n, val, dtype="i8") + r = dpt.cumulative_prod(x) + expected = dpt.pow(val, dpt.arange(1, n + 1, dtype="i8")) + + assert dpt.all(r == expected) + + x = dpt.tile(dpt.asarray([2, 0.5], dtype="f4"), 10000) + expected = dpt.tile(dpt.asarray([2, 1], dtype="f4"), 10000) + r = dpt.cumulative_prod(x) + assert dpt.all(r == expected) + + +def test_logcumsumexp_basic(): + get_queue_or_skip() + + dt = dpt.float32 + x = dpt.ones(1000, dtype=dt) + r = dpt.cumulative_logsumexp(x) + + expected = 1 + dpt.log(dpt.arange(1, 1001, dtype=dt)) + + tol = 4 * dpt.finfo(dt).resolution + assert dpt.allclose(r, expected, atol=tol, rtol=tol) + + +def geometric_series_closed_form(n, dtype=None, device=None): + """Closed form for cumulative_logsumexp(dpt.arange(-n, 0)) + + :math:`r[k] == -n + k + log(1 - exp(-k-1)) 
- log(1-exp(-1))` + """ + x = dpt.arange(-n, 0, dtype=dtype, device=device) + y = dpt.arange(-1, -n - 1, step=-1, dtype=dtype, device=device) + y = dpt.exp(y, out=y) + y = dpt.negative(y, out=y) + y = dpt.log1p(y, out=y) + y -= y[0] + return x + y + + +@pytest.mark.parametrize("fpdt", rfp_types) +def test_cumulative_logsumexp_closed_form(fpdt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(fpdt, q) + + n = 128 + r = dpt.cumulative_logsumexp(dpt.arange(-n, 0, dtype=fpdt, device=q)) + expected = geometric_series_closed_form(n, dtype=fpdt, device=q) + + tol = 4 * dpt.finfo(fpdt).eps + assert dpt.allclose(r, expected, atol=tol, rtol=tol)
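
For quick reference, a minimal usage sketch of the Python API added by this patch, mirroring the behaviour exercised by the tests above. The default accumulation dtype is device-dependent (it follows `_default_accumulation_dtype`), so the dtypes shown here are illustrative, not prescriptive:

import dpctl.tensor as dpt

x = dpt.ones(5, dtype="i4")

# cumulative sum along the only axis: [1, 2, 3, 4, 5]
s = dpt.cumulative_sum(x)

# include_initial=True prepends the identity of the scan (0 for a sum),
# so the accumulated axis grows by one element: [0, 1, 2, 3, 4, 5]
s0 = dpt.cumulative_sum(x, include_initial=True)

# an explicit accumulation dtype may be requested
p = dpt.cumulative_prod(dpt.full(4, 2, dtype="i4"), dtype="i8")  # [2, 4, 8, 16]

# cumulative_logsumexp is defined for boolean, integer and real
# floating-point inputs; complex inputs raise ValueError
r = dpt.cumulative_logsumexp(dpt.ones(4, dtype="f4"))

# arrays with more than one dimension require an explicit axis, and a
# preallocated `out` array of the expected shape and dtype may be supplied
m = dpt.ones((3, 4), dtype="i4")
out = dpt.empty_like(m)
dpt.cumulative_sum(m, axis=0, dtype="i4", out=out)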