diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index fe7f85be09..6639c09f9b 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -158,6 +158,17 @@ set(_tensor_linalg_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp ${_linalg_sources} ) +set(_accumulator_sources +${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/accumulators_common.cpp +${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_logsumexp.cpp +${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_prod.cpp +${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_sum.cpp +) +set(_tensor_accumulation_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_accumulation.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp + ${_accumulator_sources} +) set(_py_trgts) @@ -186,6 +197,11 @@ pybind11_add_module(${python_module_name} MODULE ${_tensor_linalg_impl_sources}) add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_linalg_impl_sources}) list(APPEND _py_trgts ${python_module_name}) +set(python_module_name _tensor_accumulation_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_accumulation_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_accumulation_impl_sources}) +list(APPEND _py_trgts ${python_module_name}) + set(_clang_prefix "") if (WIN32) set(_clang_prefix "/clang:") @@ -203,6 +219,7 @@ list(APPEND _no_fast_math_sources ${_reduction_sources} ${_sorting_sources} ${_linalg_sources} + ${_accumulator_sources} ) foreach(_src_fn ${_no_fast_math_sources}) diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index 6652243128..414333bf5c 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -96,6 +96,7 @@ from dpctl.tensor._usmarray import usm_ndarray from dpctl.tensor._utility_functions import all, any +from ._accumulation import cumulative_logsumexp, cumulative_prod, cumulative_sum from ._array_api import __array_api_version__, __array_namespace_info__ from ._clip import clip from ._constants import e, inf, nan, newaxis, pi @@ -367,4 +368,7 @@ "tensordot", "vecdot", "searchsorted", + "cumulative_logsumexp", + "cumulative_prod", + "cumulative_sum", ] diff --git a/dpctl/tensor/_accumulation.py b/dpctl/tensor/_accumulation.py new file mode 100644 index 0000000000..5232740b4f --- /dev/null +++ b/dpctl/tensor/_accumulation.py @@ -0,0 +1,469 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
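# A minimal usage sketch of the accumulation API introduced by this patch.
# This is an illustration only (not part of the module): it assumes a
# default-selected SYCL device and shows the values expected from the
# docstrings below.
#
#     import dpctl.tensor as dpt
#
#     x = dpt.asarray([1, 2, 3])
#     dpt.cumulative_sum(x)                        # [1, 3, 6]
#     dpt.cumulative_sum(x, include_initial=True)  # [0, 1, 3, 6], size N+1
#     dpt.cumulative_prod(x, dtype="f4")           # [1., 2., 6.]
#     x2 = dpt.reshape(dpt.arange(6), (2, 3))
#     dpt.cumulative_sum(x2, axis=1)               # cumulative sums along rows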
+ +from numpy.core.numeric import normalize_axis_index + +import dpctl +import dpctl.tensor as dpt +import dpctl.tensor._tensor_accumulation_impl as tai +import dpctl.tensor._tensor_impl as ti +from dpctl.tensor._type_utils import _to_device_supported_dtype +from dpctl.utils import ExecutionPlacementError + + +def _default_accumulation_dtype(inp_dt, q): + """Gives default output data type for given input data + type `inp_dt` when accumulation is performed on queue `q` + """ + inp_kind = inp_dt.kind + if inp_kind in "bi": + res_dt = dpt.dtype(ti.default_device_int_type(q)) + if inp_dt.itemsize > res_dt.itemsize: + res_dt = inp_dt + elif inp_kind in "u": + res_dt = dpt.dtype(ti.default_device_int_type(q).upper()) + res_ii = dpt.iinfo(res_dt) + inp_ii = dpt.iinfo(inp_dt) + if inp_ii.min >= res_ii.min and inp_ii.max <= res_ii.max: + pass + else: + res_dt = inp_dt + elif inp_kind in "fc": + res_dt = inp_dt + + return res_dt + + +def _default_accumulation_dtype_fp_types(inp_dt, q): + """Gives default output data type for given input data + type `inp_dt` when accumulation is performed on queue `q` + and the accumulation supports only floating-point data types + """ + inp_kind = inp_dt.kind + if inp_kind in "biu": + res_dt = dpt.dtype(ti.default_device_fp_type(q)) + can_cast_v = dpt.can_cast(inp_dt, res_dt) + if not can_cast_v: + _fp64 = q.sycl_device.has_aspect_fp64 + res_dt = dpt.float64 if _fp64 else dpt.float32 + elif inp_kind in "f": + res_dt = inp_dt + elif inp_kind in "c": + raise ValueError("function not defined for complex types") + + return res_dt + + +def _accumulate_common( + x, + axis, + dtype, + include_initial, + out, + _accumulate_fn, + _accumulate_include_initial_fn, + _dtype_supported, + _default_accumulation_type_fn, +): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + appended_axis = False + if x.ndim == 0: + x = x[dpt.newaxis] + appended_axis = True + nd = x.ndim + if axis is None: + if nd > 1: + raise ValueError( + "`axis` cannot be `None` for array of dimension `{}`".format(nd) + ) + axis = 0 + else: + axis = normalize_axis_index(axis, nd, "axis") + sh = x.shape + res_sh = ( + sh[:axis] + (sh[axis] + 1,) + sh[axis + 1 :] if include_initial else sh + ) + a1 = axis + 1 + if a1 == nd: + perm = list(range(nd)) + arr = x + else: + perm = [i for i in range(nd) if i != axis] + [ + axis, + ] + arr = dpt.permute_dims(x, perm) + q = x.sycl_queue + inp_dt = x.dtype + res_usm_type = x.usm_type + if dtype is None: + res_dt = _default_accumulation_type_fn(inp_dt, q) + else: + res_dt = dpt.dtype(dtype) + res_dt = _to_device_supported_dtype(res_dt, q.sycl_device) + + # checking now avoids unnecessary allocations + implemented_types = _dtype_supported(inp_dt, res_dt) + if dtype is None and not implemented_types: + raise RuntimeError( + "Automatically determined accumulation data type does not " + "have direct implementation" + ) + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + out_sh = out.shape + # append an axis to `out` if scalar + if appended_axis and not include_initial: + out = out[dpt.newaxis, ...] + orig_out = out + final_res_sh = res_sh[1:] + else: + final_res_sh = res_sh + if not out_sh == final_res_sh: + raise ValueError( + "The shape of input and output arrays are inconsistent. 
" + f"Expected output shape is {final_res_sh}, got {out_sh}" + ) + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, " f"got {out.dtype}" + ) + if dpctl.utils.get_execution_queue((q, out.sycl_queue)) is None: + raise ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + # permute out array dims if necessary + if a1 != nd: + out = dpt.permute_dims(out, perm) + orig_out = out + if ti._array_overlap(x, out) and implemented_types: + out = dpt.empty_like(out) + else: + out = dpt.empty( + res_sh, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + if a1 != nd: + out = dpt.permute_dims(out, perm) + + host_tasks_list = [] + if implemented_types: + if not include_initial: + ht_e, acc_ev = _accumulate_fn( + src=arr, + trailing_dims_to_accumulate=1, + dst=out, + sycl_queue=q, + ) + else: + ht_e, acc_ev = _accumulate_include_initial_fn( + src=arr, + dst=out, + sycl_queue=q, + ) + host_tasks_list.append(ht_e) + if not (orig_out is None or out is orig_out): + # Copy the out data from temporary buffer to original memory + ht_e_cpy, _ = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, dst=orig_out, sycl_queue=q, depends=[acc_ev] + ) + host_tasks_list.append(ht_e_cpy) + out = orig_out + else: + if _dtype_supported(res_dt, res_dt): + tmp = dpt.empty( + arr.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=q + ) + host_tasks_list.append(ht_e_cpy) + if not include_initial: + ht_e, acc_ev = _accumulate_fn( + src=tmp, + trailing_dims_to_accumulate=1, + dst=out, + sycl_queue=q, + depends=[cpy_e], + ) + else: + ht_e, acc_ev = _accumulate_include_initial_fn( + src=tmp, + dst=out, + sycl_queue=q, + depends=[cpy_e], + ) + else: + buf_dt = _default_accumulation_type_fn(inp_dt, q) + tmp = dpt.empty( + arr.shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=q + ) + tmp_res = dpt.empty( + res_sh, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q + ) + if a1 != nd: + tmp_res = dpt.permute_dims(tmp_res, perm) + host_tasks_list.append(ht_e_cpy) + if not include_initial: + ht_e, a_e = _accumulate_fn( + src=tmp, + trailing_dims_to_accumulate=1, + dst=tmp_res, + sycl_queue=q, + depends=[cpy_e], + ) + else: + ht_e, a_e = _accumulate_include_initial_fn( + src=tmp, + dst=tmp_res, + sycl_queue=q, + depends=[cpy_e], + ) + host_tasks_list.append(ht_e) + ht_e_cpy2, _ = ti._copy_usm_ndarray_into_usm_ndarray( + src=tmp_res, dst=out, sycl_queue=q, depends=[a_e] + ) + host_tasks_list.append(ht_e_cpy2) + + if appended_axis: + out = dpt.squeeze(out) + if a1 != nd: + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + out = dpt.permute_dims(out, inv_perm) + dpctl.SyclEvent.wait_for(host_tasks_list) + + return out + + +def cumulative_sum( + x, /, *, axis=None, dtype=None, include_initial=False, out=None +): + """ + cumulative_sum(x, /, *, axis=None, dtype=None, include_initial=False, + out=None) + + Calculates the cumulative sum of elements in the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis along which cumulative sum must be computed. + If `None`, the sum is computed over the entire array. + Default: `None`. + dtype (Optional[dtype]): + data type of the returned array. If `None`, the default data + type is inferred from the "kind" of the input array data type. 
+ + * If `x` has a real- or complex-valued floating-point data + type, the returned array will have the same data type as + `x`. + * If `x` has signed integral data type, the returned array + will have the default signed integral type for the device + where input array `x` is allocated. + * If `x` has unsigned integral data type, the returned array + will have the default unsigned integral type for the device + where input array `x` is allocated. + * If `x` has a boolean data type, the returned array will + have the default signed integral type for the device + where input array `x` is allocated. + + If the data type (either specified or resolved) differs from the + data type of `x`, the input array elements are cast to the + specified data type before computing the cumulative sum. + Default: `None`. + include_initial (bool): + boolean indicating whether to include the initial value (i.e., the + additive identity, zero) as the first value along the provided axis + in the output. Default: `False`. + + Returns: + usm_ndarray: + an array containing cumulative sums. The returned array has the data + type as described in the `dtype` parameter description above. + + The returned array shape is determined as follows: + + * If `include_initial` is `False`, the returned array will + have the same shape as `x` + * If `include_initial` is `True`, the returned array will + have the same shape as `x` except the axis along which the + cumulative sum is calculated, which will have size `N+1` + + where `N` is the size of the axis the cumulative sums are computed + along. + """ + return _accumulate_common( + x, + axis, + dtype, + include_initial, + out, + tai._cumsum_over_axis, + tai._cumsum_final_axis_include_initial, + tai._cumsum_dtype_supported, + _default_accumulation_dtype, + ) + + +def cumulative_prod( + x, /, *, axis=None, dtype=None, include_initial=False, out=None +): + """ + cumulative_prod(x, /, *, axis=None, dtype=None, include_initial=False, + out=None) + + Calculates the cumulative product of elements in the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis along which cumulative product must be computed. + If `None`, the product is computed over the entire array. + Default: `None`. + dtype (Optional[dtype]): + data type of the returned array. If `None`, the default data + type is inferred from the "kind" of the input array data type. + + * If `x` has a real- or complex-valued floating-point data + type, the returned array will have the same data type as + `x`. + * If `x` has signed integral data type, the returned array + will have the default signed integral type for the device + where input array `x` is allocated. + * If `x` has unsigned integral data type, the returned array + will have the default unsigned integral type for the device + where input array `x` is allocated. + * If `x` has a boolean data type, the returned array will + have the default signed integral type for the device + where input array `x` is allocated. + + If the data type (either specified or resolved) differs from the + data type of `x`, the input array elements are cast to the + specified data type before computing the cumulative product. + Default: `None`. + include_initial (bool): + boolean indicating whether to include the initial value (i.e., the + multiplicative identity, one) as the first value along the provided + axis in the output. Default: `False`. + + Returns: + usm_ndarray: + an array containing cumulative products.
The returned array has + the data type as described in the `dtype` parameter description + above. + + The returned array shape is determined as follows: + + * If `include_initial` is `False`, the returned array will + have the same shape as `x` + * If `include_initial` is `True`, the returned array will + have the same shape as `x` except the axis along which the + cumulative product is calculated, which will have size `N+1` + + where `N` is the size of the axis the cumulative products are + computed along. + """ + return _accumulate_common( + x, + axis, + dtype, + include_initial, + out, + tai._cumprod_over_axis, + tai._cumprod_final_axis_include_initial, + tai._cumprod_dtype_supported, + _default_accumulation_dtype, + ) + + +def cumulative_logsumexp( + x, /, *, axis=None, dtype=None, include_initial=False, out=None +): + """ + cumulative_logsumexp(x, /, *, axis=None, dtype=None, include_initial=False, + out=None) + + Calculates the cumulative logsumexp of elements in the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis along which cumulative logsumexp must be computed. + If `None`, the logsumexp is computed over the entire array. + Default: `None`. + dtype (Optional[dtype]): + data type of the returned array. If `None`, the default data + type is inferred from the "kind" of the input array data type. + + * If `x` has a real-valued floating-point data type, the + returned array will have the same data type as `x`. + * If `x` has a boolean, signed integral, or unsigned integral + data type, the returned array will have the default + real-valued floating-point data type for the device where + input array `x` is allocated. + * If `x` has a complex-valued floating-point data type, an + error is raised, as this function is not defined for + complex-valued data types. + + If the data type (either specified or resolved) differs from the + data type of `x`, the input array elements are cast to the + specified data type before computing the cumulative logsumexp. + Default: `None`. + include_initial (bool): + boolean indicating whether to include the initial value (i.e., + negative infinity) as the first value along the provided axis + in the output. Default: `False`. + + Returns: + usm_ndarray: + an array containing cumulative logsumexp results. The returned + array has the data type as described in the `dtype` parameter + description above.
+ + The returned array shape is determined as follows: + + * If `include_initial` is `False`, the returned array will + have the same shape as `x` + * If `include_initial` is `True`, the returned array will + have the same shape as `x` except the axis along which the + cumulative logsumexp is calculated, which will have size + `N+1` + """ + return _accumulate_common( + x, + axis, + dtype, + include_initial, + out, + tai._cumlogsumexp_over_axis, + tai._cumlogsumexp_final_axis_include_initial, + tai._cumlogsumexp_dtype_supported, + _default_accumulation_dtype_fp_types, + ) diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py index 8586ed935c..2494e130ab 100644 --- a/dpctl/tensor/_reduction.py +++ b/dpctl/tensor/_reduction.py @@ -164,6 +164,7 @@ def _reduction_over_axis( sycl_queue=q, depends=[cpy_e], ) + host_tasks_list.append(ht_e_red) ht_e_cpy2, _ = ti._copy_usm_ndarray_into_usm_ndarray( src=tmp_res, dst=res, sycl_queue=q, depends=[r_e] ) diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp index dbd105ac95..18e0e1bc8a 100644 --- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -32,7 +32,9 @@ #include "dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" +#include "utils/sycl_utils.hpp" #include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" namespace dpctl { @@ -52,7 +54,7 @@ template T ceiling_quotient(T n, T m) template struct NonZeroIndicator { - NonZeroIndicator() {} + constexpr NonZeroIndicator() {} outputT operator()(const inputT &val) const { @@ -66,7 +68,7 @@ template struct NonZeroIndicator template struct NoOpTransformer { - NoOpTransformer() {} + constexpr NoOpTransformer() {} T operator()(const T &val) const { @@ -74,104 +76,238 @@ template struct NoOpTransformer } }; +template struct CastTransformer +{ + constexpr CastTransformer() {} + + dstTy operator()(const srcTy &val) const + { + using dpctl::tensor::type_utils::convert_impl; + return convert_impl(val); + } +}; + +template struct can_use_inclusive_scan_over_group +{ + static constexpr bool value = sycl::has_known_identity::value; +}; + +namespace +{ +template class stack_t +{ + T *src_; + size_t size_; + T *local_scans_; + +public: + stack_t() : src_{}, size_{}, local_scans_{} {} + stack_t(T *src, size_t sz, T *local_scans) + : src_(src), size_(sz), local_scans_(local_scans) + { + } + ~stack_t(){}; + + T *get_src_ptr() const + { + return src_; + } + + size_t get_size() const + { + return size_; + } + + T *get_local_scans_ptr() const + { + return local_scans_; + } +}; + +template class stack_strided_t +{ + T *src_; + size_t size_; + T *local_scans_; + size_t local_stride_; + +public: + stack_strided_t() : src_{}, size_{}, local_scans_{}, local_stride_{} {} + stack_strided_t(T *src, size_t sz, T *local_scans, size_t local_stride) + : src_(src), size_(sz), local_scans_(local_scans), + local_stride_(local_stride) + { + } + ~stack_strided_t(){}; + + T *get_src_ptr() const + { + return src_; + } + + size_t get_size() const + { + return size_; + } + + T *get_local_scans_ptr() const + { + return local_scans_; + } + + size_t get_local_stride() const + { + return local_stride_; + } +}; + +} // end of anonymous namespace + // Iterative cumulative summation +namespace su_ns = dpctl::tensor::sycl_utils; + using nwiT = std::uint16_t; template + typename IterIndexerT, + typename InpIndexerT, + typename OutIndexerT, + typename TransformerT, + 
typename ScanOpT, + bool include_initial> class inclusive_scan_iter_local_scan_krn; template + typename OtherTransformerT, + typename ScanOpT, + bool include_initial> class inclusive_scan_iter_chunk_update_krn; template + typename IterIndexerT, + typename InpIndexerT, + typename OutIndexerT, + typename TransformerT, + typename ScanOpT, + bool include_initial = false> sycl::event inclusive_scan_base_step(sycl::queue &exec_q, const size_t wg_size, - const size_t n_elems, + const size_t iter_nelems, + const size_t acc_nelems, const inputT *input, outputT *output, const size_t s0, const size_t s1, - const IndexerT &indexer, - const TransformerT &transformer, - size_t &n_groups, + const IterIndexerT &iter_indexer, + const InpIndexerT &inp_indexer, + const OutIndexerT &out_indexer, + TransformerT transformer, + const ScanOpT &scan_op, + outputT identity, + size_t &acc_groups, const std::vector &depends = {}) { - n_groups = ceiling_quotient(n_elems, n_wi * wg_size); + acc_groups = ceiling_quotient(acc_nelems, n_wi * wg_size); sycl::event inc_scan_phase1_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); using slmT = sycl::local_accessor; + auto gws = sycl::range<1>(iter_nelems * acc_groups * wg_size); auto lws = sycl::range<1>(wg_size); - auto gws = sycl::range<1>(n_groups * wg_size); auto ndRange = sycl::nd_range<1>(gws, lws); slmT slm_iscan_tmp(lws, cgh); - using KernelName = - inclusive_scan_iter_local_scan_krn; + using KernelName = inclusive_scan_iter_local_scan_krn< + inputT, outputT, n_wi, IterIndexerT, InpIndexerT, OutIndexerT, + TransformerT, ScanOpT, include_initial>; cgh.parallel_for(ndRange, [=, slm_iscan_tmp = std::move(slm_iscan_tmp)]( sycl::nd_item<1> it) { - auto chunk_gid = it.get_global_id(0); - auto lid = it.get_local_id(0); + const size_t gid = it.get_global_id(0); + const size_t lid = it.get_local_id(0); + + const size_t iter_gid = gid / (acc_groups * wg_size); + const size_t chunk_gid = gid - (iter_gid * acc_groups * wg_size); - std::array local_isum; + std::array local_iscan; size_t i = chunk_gid * n_wi; + const auto &iter_offsets = iter_indexer(iter_gid); + const auto &inp_iter_offset = iter_offsets.get_first_offset(); + const auto &out_iter_offset = iter_offsets.get_second_offset(); #pragma unroll for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { - constexpr outputT out_zero(0); - - local_isum[m_wi] = - (i + m_wi < n_elems) - ? transformer(input[indexer(s0 + s1 * (i + m_wi))]) - : out_zero; + if constexpr (!include_initial) { + local_iscan[m_wi] = + (i + m_wi < acc_nelems) + ? transformer( + input[inp_iter_offset + + inp_indexer(s0 + s1 * (i + m_wi))]) + : identity; + } + else { + // shift input to the left by a single element relative to + // output + local_iscan[m_wi] = + (i + m_wi < acc_nelems && i + m_wi > 0) + ? 
transformer( + input[inp_iter_offset + + inp_indexer((s0 + s1 * (i + m_wi)) - + 1)]) + : identity; + } } #pragma unroll for (nwiT m_wi = 1; m_wi < n_wi; ++m_wi) { - local_isum[m_wi] += local_isum[m_wi - 1]; + local_iscan[m_wi] = + scan_op(local_iscan[m_wi], local_iscan[m_wi - 1]); } - // local_isum is now result of + // local_iscan is now result of // inclusive scan of locally stored inputs - outputT wg_iscan_val = sycl::inclusive_scan_over_group( - it.get_group(), local_isum.back(), sycl::plus(), - outputT(0)); + outputT wg_iscan_val; + if constexpr (can_use_inclusive_scan_over_group::value) { + wg_iscan_val = sycl::inclusive_scan_over_group( + it.get_group(), local_iscan.back(), scan_op, identity); + } + else { + wg_iscan_val = su_ns::custom_inclusive_scan_over_group( + it.get_group(), slm_iscan_tmp, local_iscan.back(), scan_op); + // ensure all finished reading from SLM, to avoid race condition + // with subsequent writes into SLM + it.barrier(sycl::access::fence_space::local_space); + } slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val; it.barrier(sycl::access::fence_space::local_space); - outputT addand = (lid == 0) ? outputT(0) : slm_iscan_tmp[lid]; + outputT addand = (lid == 0) ? identity : slm_iscan_tmp[lid]; #pragma unroll for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { - local_isum[m_wi] += addand; + local_iscan[m_wi] = scan_op(local_iscan[m_wi], addand); } - for (nwiT m_wi = 0; (m_wi < n_wi) && (i + m_wi < n_elems); ++m_wi) { - output[i + m_wi] = local_isum[m_wi]; + for (nwiT m_wi = 0; (m_wi < n_wi) && (i + m_wi < acc_nelems); + ++m_wi) { + output[out_iter_offset + out_indexer(i + m_wi)] = + local_iscan[m_wi]; } }); }); @@ -179,75 +315,62 @@ inclusive_scan_base_step(sycl::queue &exec_q, return inc_scan_phase1_ev; } -namespace -{ -template class stack_t -{ - T *src_; - size_t size_; - T *local_scans_; - -public: - stack_t() : src_{}, size_{}, local_scans_{} {} - stack_t(T *src, size_t sz, T *local_scans) - : src_(src), size_(sz), local_scans_(local_scans) - { - } - ~stack_t(){}; - - T *get_src_ptr() const - { - return src_; - } - - const T *get_src_const_ptr() const - { - return src_; - } - - size_t get_size() const - { - return size_; - } - - T *get_local_scans_ptr() const - { - return local_scans_; - } -}; -} // end of anonymous namespace +template +class inclusive_scan_1d_iter_chunk_update_krn; /* - * for integer type maskT, - * output[j] = sum( input[s0 + i * s1], 0 <= i <= j) + * output[j] = sum( input[s0 + i * s1], 0 <= i <= j) * for 0 <= j < n_elems */ template -sycl::event inclusive_scan_iter(sycl::queue &exec_q, - const size_t wg_size, - const size_t n_elems, - const inputT *input, - outputT *output, - const size_t s0, - const size_t s1, - const IndexerT &indexer, - const TransformerT &transformer, - std::vector &host_tasks, - const std::vector &depends = {}) + typename TransformerT, + typename ScanOpT, + bool include_initial> +sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q, + const size_t wg_size, + const size_t n_elems, + const inputT *input, + outputT *output, + const size_t s0, + const size_t s1, + const IndexerT &indexer, + const TransformerT &transformer, + std::vector &host_tasks, + const std::vector &depends = {}) { + ScanOpT scan_op = ScanOpT(); + constexpr outputT identity = su_ns::Identity::value; + + constexpr size_t _iter_nelems = 1; + + using IterIndexerT = dpctl::tensor::offset_utils::TwoZeroOffsets_Indexer; + constexpr IterIndexerT _no_op_iter_indexer{}; + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + constexpr NoOpIndexerT 
_no_op_indexer{}; + size_t n_groups; sycl::event inc_scan_phase1_ev = - inclusive_scan_base_step( - exec_q, wg_size, n_elems, input, output, s0, s1, indexer, - transformer, n_groups, depends); + inclusive_scan_base_step( + exec_q, wg_size, _iter_nelems, n_elems, input, output, s0, s1, + _no_op_iter_indexer, indexer, _no_op_indexer, transformer, scan_op, + identity, n_groups, depends); sycl::event dependent_event = inc_scan_phase1_ev; if (n_groups > 1) { - auto chunk_size = wg_size * n_wi; + const size_t chunk_size = wg_size * n_wi; // how much of temporary allocation do we need size_t n_groups_ = n_groups; @@ -272,19 +395,20 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q, outputT *src = output; outputT *local_scans = temp; - NoOpIndexer _no_op_indexer{}; using NoOpTransformerT = NoOpTransformer; - NoOpTransformerT _no_op_transformer{}; + constexpr NoOpTransformerT _no_op_transformer{}; size_t size_to_update = n_elems; while (n_groups_ > 1) { - size_t src_size = n_groups_ - 1; + const size_t src_size = n_groups_ - 1; dependent_event = - inclusive_scan_base_step( - exec_q, wg_size, src_size, src, local_scans, chunk_size - 1, - chunk_size, _no_op_indexer, _no_op_transformer, - n_groups_, // n_groups_ is modified in place + inclusive_scan_base_step( + exec_q, wg_size, _iter_nelems, src_size, src, local_scans, + chunk_size - 1, chunk_size, _no_op_iter_indexer, + _no_op_indexer, _no_op_indexer, _no_op_transformer, scan_op, + identity, n_groups_, // n_groups_ is modified in place {dependent_event}); stack.push_back({src, size_to_update, local_scans}); src = local_scans; @@ -295,43 +419,469 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q, for (size_t reverse_stack_id = 0; reverse_stack_id < stack.size(); ++reverse_stack_id) { - auto stack_id = stack.size() - 1 - reverse_stack_id; + const size_t stack_id = stack.size() - 1 - reverse_stack_id; - auto stack_elem = stack[stack_id]; + const auto &stack_elem = stack[stack_id]; outputT *src = stack_elem.get_src_ptr(); - size_t src_size = stack_elem.get_size(); + const size_t src_size = stack_elem.get_size(); outputT *local_scans = stack_elem.get_local_scans_ptr(); // output[ chunk_size * (i + 1) + j] += temp[i] dependent_event = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(dependent_event); + constexpr nwiT updates_per_wi = n_wi; + const size_t n_items = ceiling_quotient(src_size, n_wi); + using UpdateKernelName = - class inclusive_scan_iter_chunk_update_krn< + class inclusive_scan_1d_iter_chunk_update_krn< inputT, outputT, n_wi, IndexerT, TransformerT, - NoOpIndexer, NoOpTransformerT>; + NoOpTransformerT, ScanOpT, include_initial>; + + cgh.parallel_for( + {n_items}, [chunk_size, src, src_size, local_scans, scan_op, + identity](auto wiid) { + const size_t gid = n_wi * wiid[0]; +#pragma unroll + for (size_t i = 0; i < updates_per_wi; ++i) { + const size_t src_id = gid + i; + if (src_id < src_size) { + const size_t scan_id = (src_id / chunk_size); + src[src_id] = + (scan_id > 0) + ? 
scan_op(src[src_id], + local_scans[scan_id - 1]) + : scan_op(src[src_id], identity); + } + } + }); + }); + } + + sycl::event free_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_event); + const auto &ctx = exec_q.get_context(); + cgh.host_task([ctx, temp]() { sycl::free(temp, ctx); }); + }); + host_tasks.push_back(free_ev); + } + + return dependent_event; +} + +typedef sycl::event (*accumulate_1d_contig_impl_fn_ptr_t)( + sycl::queue &, + size_t, + const char *, + char *, + std::vector &, + const std::vector &); + +template +sycl::event +accumulate_1d_contig_impl(sycl::queue &q, + size_t n_elems, + const char *src, + char *dst, + std::vector &host_tasks, + const std::vector &depends = {}) +{ + const srcT *src_data_ptr = reinterpret_cast(src); + dstT *dst_data_ptr = reinterpret_cast(dst); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + constexpr NoOpIndexerT flat_indexer{}; + constexpr transformerT transformer{}; + + constexpr size_t s0 = 0; + constexpr size_t s1 = 1; + + sycl::event comp_ev; + const sycl::device &dev = q.get_device(); + if (dev.has(sycl::aspect::cpu)) { + constexpr nwiT n_wi_for_cpu = 8; + const size_t wg_size = 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, src_data_ptr, dst_data_ptr, s0, s1, + flat_indexer, transformer, host_tasks, depends); + } + else { + constexpr nwiT n_wi_for_gpu = 4; + const size_t wg_size = 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, src_data_ptr, dst_data_ptr, s0, s1, + flat_indexer, transformer, host_tasks, depends); + } + return comp_ev; +} + +template +class inclusive_scan_final_chunk_update_krn; + +template +sycl::event inclusive_scan_iter(sycl::queue &exec_q, + const size_t wg_size, + const size_t iter_nelems, + const size_t acc_nelems, + const inputT *input, + outputT *output, + const size_t s0, + const size_t s1, + const InpIterIndexerT &inp_iter_indexer, + const OutIterIndexerT &out_iter_indexer, + const InpIndexerT &inp_indexer, + const OutIndexerT &out_indexer, + const TransformerT &transformer, + std::vector &host_tasks, + const std::vector &depends = {}) +{ + ScanOpT scan_op = ScanOpT(); + constexpr outputT identity = su_ns::Identity::value; + + using IterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InpIterIndexerT, OutIterIndexerT>; + const IterIndexerT iter_indexer{inp_iter_indexer, out_iter_indexer}; + + size_t acc_groups; + sycl::event inc_scan_phase1_ev = + inclusive_scan_base_step( + exec_q, wg_size, iter_nelems, acc_nelems, input, output, s0, s1, + iter_indexer, inp_indexer, out_indexer, transformer, scan_op, + identity, acc_groups, depends); + + sycl::event dependent_event = inc_scan_phase1_ev; + if (acc_groups > 1) { + const size_t chunk_size = wg_size * n_wi; + + // how much of temporary allocation do we need + size_t acc_groups_ = acc_groups; + size_t temp_size = 0; + while (acc_groups_ > 1) { + const size_t this_size = (acc_groups_ - 1); + temp_size += this_size; + acc_groups_ = ceiling_quotient(this_size, chunk_size); + } + + // allocate + outputT *temp = + sycl::malloc_device(iter_nelems * temp_size, exec_q); + + if (!temp) { + throw std::bad_alloc(); + } + + std::vector> stack{}; + + // inclusive scans over blocks + acc_groups_ = acc_groups; + outputT *src = output; + outputT *local_scans = temp; + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + constexpr NoOpIndexerT _no_op_indexer{}; + using NoOpTransformerT = NoOpTransformer; + constexpr NoOpTransformerT _no_op_transformer{}; + size_t 
size_to_update = acc_nelems; + + { + size_t src_size = acc_groups - 1; + using LocalScanIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + const LocalScanIndexerT scan_iter_indexer{ + 0, static_cast(iter_nelems), + static_cast(src_size)}; + + using IterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + OutIterIndexerT, LocalScanIndexerT>; + const IterIndexerT iter_indexer_{out_iter_indexer, + scan_iter_indexer}; + + dependent_event = + inclusive_scan_base_step( + exec_q, wg_size, iter_nelems, src_size, src, local_scans, + chunk_size - 1, chunk_size, iter_indexer_, out_indexer, + _no_op_indexer, _no_op_transformer, scan_op, identity, + acc_groups_, // acc_groups_ is modified in place + {dependent_event}); + stack.push_back({src, size_to_update, local_scans, src_size}); + src = local_scans; + local_scans += src_size * iter_nelems; + size_to_update = src_size; + } + + while (acc_groups_ > 1) { + size_t src_size = acc_groups_ - 1; + + using LocalScanIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + const LocalScanIndexerT scan1_iter_indexer{ + 0, static_cast(iter_nelems), + static_cast(size_to_update)}; + const LocalScanIndexerT scan2_iter_indexer{ + 0, static_cast(iter_nelems), + static_cast(src_size)}; + + using IterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + LocalScanIndexerT, LocalScanIndexerT>; + const IterIndexerT iter_indexer_{scan1_iter_indexer, + scan2_iter_indexer}; + + dependent_event = + inclusive_scan_base_step( + exec_q, wg_size, iter_nelems, src_size, src, local_scans, + chunk_size - 1, chunk_size, iter_indexer_, _no_op_indexer, + _no_op_indexer, _no_op_transformer, scan_op, identity, + acc_groups_, // acc_groups_ is modified in place + {dependent_event}); + stack.push_back({src, size_to_update, local_scans, src_size}); + src = local_scans; + local_scans += src_size * iter_nelems; + size_to_update = src_size; + } + + for (size_t reverse_stack_id = 0; reverse_stack_id < stack.size() - 1; + ++reverse_stack_id) + { + const size_t stack_id = stack.size() - 1 - reverse_stack_id; + + const auto &stack_elem = stack[stack_id]; + outputT *src = stack_elem.get_src_ptr(); + size_t src_size = stack_elem.get_size(); + outputT *local_scans = stack_elem.get_local_scans_ptr(); + size_t local_stride = stack_elem.get_local_stride(); + + constexpr nwiT updates_per_wi = n_wi; + const size_t update_nelems = + ceiling_quotient(src_size, updates_per_wi); + + dependent_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_event); + + using UpdateKernelName = + class inclusive_scan_iter_chunk_update_krn< + inputT, outputT, n_wi, TransformerT, NoOpTransformerT, + ScanOpT, include_initial>; + + cgh.parallel_for( + {iter_nelems * update_nelems}, + [chunk_size, update_nelems, src_size, local_stride, src, + local_scans, scan_op, identity](auto wiid) { + const size_t gid = wiid[0]; + + const size_t iter_gid = gid / update_nelems; + const size_t axis_gid = + gid - (iter_gid * update_nelems); + + const size_t src_axis_id0 = axis_gid * updates_per_wi; + const size_t src_iter_id = iter_gid * src_size; +#pragma unroll + for (nwiT i = 0; i < updates_per_wi; ++i) { + const size_t src_axis_id = src_axis_id0 + i; + const size_t src_id = src_axis_id + src_iter_id; + + if (src_axis_id < src_size) { + const size_t scan_axis_id = + src_axis_id / chunk_size; + const size_t scan_id = + scan_axis_id + iter_gid * local_stride; + + src[src_id] = + (scan_axis_id > 0) + ? 
scan_op(src[src_id], + local_scans[scan_id - 1]) + : scan_op(src[src_id], identity); + } + } + }); + }); + } + + // last stack element is always directly to output + { + const auto &stack_elem = stack[0]; + outputT *src = stack_elem.get_src_ptr(); + const size_t src_size = stack_elem.get_size(); + outputT *local_scans = stack_elem.get_local_scans_ptr(); + const size_t local_stride = stack_elem.get_local_stride(); + + constexpr nwiT updates_per_wi = n_wi; + const size_t update_nelems = + ceiling_quotient(src_size, updates_per_wi); + + dependent_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_event); + + using UpdateKernelName = + class inclusive_scan_final_chunk_update_krn< + inputT, outputT, n_wi, OutIterIndexerT, OutIndexerT, + TransformerT, NoOpTransformerT, ScanOpT, + include_initial>; cgh.parallel_for( - {src_size}, [chunk_size, src, local_scans](auto wiid) { - auto gid = wiid[0]; - auto i = (gid / chunk_size); - src[gid] += (i > 0) ? local_scans[i - 1] : outputT(0); + {iter_nelems * update_nelems}, + [chunk_size, update_nelems, src_size, local_stride, src, + local_scans, scan_op, identity, out_iter_indexer, + out_indexer](auto wiid) { + const size_t gid = wiid[0]; + + const size_t iter_gid = gid / update_nelems; + const size_t axis_gid = + gid - (iter_gid * update_nelems); + + const size_t src_axis_id0 = axis_gid * updates_per_wi; + const size_t src_iter_id = out_iter_indexer(iter_gid); +#pragma unroll + for (nwiT i = 0; i < updates_per_wi; ++i) { + const size_t src_axis_id = src_axis_id0 + i; + const size_t src_id = + out_indexer(src_axis_id) + src_iter_id; + + if (src_axis_id < src_size) { + const size_t scan_axis_id = + src_axis_id / chunk_size; + const size_t scan_id = + scan_axis_id + iter_gid * local_stride; + + src[src_id] = + (scan_axis_id > 0) + ? 
scan_op(src[src_id], + local_scans[scan_id - 1]) + : scan_op(src[src_id], identity); + } + } }); }); } - sycl::event e4 = exec_q.submit([&](sycl::handler &cgh) { + sycl::event free_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(dependent_event); const auto &ctx = exec_q.get_context(); cgh.host_task([ctx, temp]() { sycl::free(temp, ctx); }); }); - host_tasks.push_back(e4); + host_tasks.push_back(free_ev); } return dependent_event; } -typedef size_t (*accumulate_contig_impl_fn_ptr_t)( +typedef sycl::event (*accumulate_strided_impl_fn_ptr_t)( + sycl::queue &, + size_t, + size_t, + const char *, + int, + const ssize_t *, + ssize_t, + ssize_t, + int, + const ssize_t *, + char *, + std::vector &, + const std::vector &); + +template +sycl::event +accumulate_strided_impl(sycl::queue &q, + size_t iter_nelems, + size_t acc_nelems, + const char *src, + int iter_nd, + const ssize_t *iter_shape_strides, + ssize_t inp_iter_offset, + ssize_t out_iter_offset, + int acc_nd, + const ssize_t *acc_shape_strides, + char *dst, + std::vector &host_tasks, + const std::vector &depends = {}) +{ + const srcT *src_data_ptr = reinterpret_cast(src); + dstT *dst_data_ptr = reinterpret_cast(dst); + + using InpIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const InpIndexerT inp_axis_indexer{acc_nd, 0, acc_shape_strides}; + const InpIndexerT inp_iter_indexer{iter_nd, inp_iter_offset, + iter_shape_strides}; + + using OutIndexerT = dpctl::tensor::offset_utils::UnpackedStridedIndexer; + const OutIndexerT out_axis_indexer{acc_nd, 0, acc_shape_strides, + acc_shape_strides + 2 * acc_nd}; + const OutIndexerT out_iter_indexer{iter_nd, out_iter_offset, + iter_shape_strides, + iter_shape_strides + 2 * iter_nd}; + + constexpr transformerT transformer{}; + + constexpr size_t s0 = 0; + constexpr size_t s1 = 1; + + const sycl::device &dev = q.get_device(); + sycl::event comp_ev; + if (dev.has(sycl::aspect::cpu)) { + constexpr nwiT n_wi_for_cpu = 8; + const size_t wg_size = 256; + comp_ev = + inclusive_scan_iter( + q, wg_size, iter_nelems, acc_nelems, src_data_ptr, dst_data_ptr, + s0, s1, inp_iter_indexer, out_iter_indexer, inp_axis_indexer, + out_axis_indexer, transformer, host_tasks, depends); + } + else { + constexpr nwiT n_wi_for_gpu = 4; + const size_t wg_size = 256; + comp_ev = + inclusive_scan_iter( + q, wg_size, iter_nelems, acc_nelems, src_data_ptr, dst_data_ptr, + s0, s1, inp_iter_indexer, out_iter_indexer, inp_axis_indexer, + out_axis_indexer, transformer, host_tasks, depends); + } + + return comp_ev; +} + +typedef size_t (*cumsum_val_contig_impl_fn_ptr_t)( sycl::queue &, size_t, const char *, @@ -340,7 +890,7 @@ typedef size_t (*accumulate_contig_impl_fn_ptr_t)( const std::vector &); template -size_t accumulate_contig_impl(sycl::queue &q, +size_t cumsum_val_contig_impl(sycl::queue &q, size_t n_elems, const char *mask, char *cumsum, @@ -350,29 +900,34 @@ size_t accumulate_contig_impl(sycl::queue &q, const maskT *mask_data_ptr = reinterpret_cast(mask); cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); - NoOpIndexer flat_indexer{}; - transformerT non_zero_indicator{}; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + constexpr NoOpIndexerT flat_indexer{}; + constexpr transformerT transformer{}; - sycl::event comp_ev; + constexpr size_t s0 = 0; + constexpr size_t s1 = 1; + constexpr bool include_initial = false; + using AccumulateOpT = sycl::plus; + sycl::event comp_ev; const sycl::device &dev = q.get_device(); if (dev.has(sycl::aspect::cpu)) { constexpr nwiT n_wi_for_cpu = 8; - size_t 
wg_size = 256; - comp_ev = inclusive_scan_iter( - q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, 0, 1, - flat_indexer, non_zero_indicator, host_tasks, depends); + const size_t wg_size = 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, s0, s1, + flat_indexer, transformer, host_tasks, depends); } else { constexpr nwiT n_wi_for_gpu = 4; - size_t wg_size = 256; - comp_ev = inclusive_scan_iter( - q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, 0, 1, - flat_indexer, non_zero_indicator, host_tasks, depends); + const size_t wg_size = 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, s0, s1, + flat_indexer, transformer, host_tasks, depends); } cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); @@ -398,7 +953,7 @@ template struct MaskPositionsContigFactoryForInt32 { using cumsumT = std::int32_t; fnT fn = - accumulate_contig_impl>; + cumsum_val_contig_impl>; return fn; } }; @@ -409,7 +964,7 @@ template struct MaskPositionsContigFactoryForInt64 { using cumsumT = std::int64_t; fnT fn = - accumulate_contig_impl>; + cumsum_val_contig_impl>; return fn; } }; @@ -421,7 +976,7 @@ template struct Cumsum1DContigFactory if constexpr (std::is_integral_v) { using cumsumT = std::int64_t; fnT fn = - accumulate_contig_impl>; + cumsum_val_contig_impl>; return fn; } else { @@ -430,7 +985,7 @@ template struct Cumsum1DContigFactory } }; -typedef size_t (*accumulate_strided_impl_fn_ptr_t)( +typedef size_t (*cumsum_val_strided_impl_fn_ptr_t)( sycl::queue &, size_t, const char *, @@ -441,7 +996,7 @@ typedef size_t (*accumulate_strided_impl_fn_ptr_t)( const std::vector &); template -size_t accumulate_strided_impl(sycl::queue &q, +size_t cumsum_val_strided_impl(sycl::queue &q, size_t n_elems, const char *mask, int nd, @@ -453,28 +1008,34 @@ size_t accumulate_strided_impl(sycl::queue &q, const maskT *mask_data_ptr = reinterpret_cast(mask); cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); - StridedIndexer strided_indexer{nd, 0, shape_strides}; - transformerT non_zero_indicator{}; + using StridedIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const StridedIndexerT strided_indexer{nd, 0, shape_strides}; + constexpr transformerT transformer{}; + + constexpr size_t s0 = 0; + constexpr size_t s1 = 1; + constexpr bool include_initial = false; + using AccumulateOpT = sycl::plus; const sycl::device &dev = q.get_device(); sycl::event comp_ev; if (dev.has(sycl::aspect::cpu)) { constexpr nwiT n_wi_for_cpu = 8; - size_t wg_size = 256; - comp_ev = inclusive_scan_iter( - q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, 0, 1, - strided_indexer, non_zero_indicator, host_tasks, depends); + const size_t wg_size = 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, s0, s1, + strided_indexer, transformer, host_tasks, depends); } else { constexpr nwiT n_wi_for_gpu = 4; - size_t wg_size = 256; - comp_ev = inclusive_scan_iter( - q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, 0, 1, - strided_indexer, non_zero_indicator, host_tasks, depends); + const size_t wg_size = 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, s0, s1, + strided_indexer, transformer, host_tasks, depends); } cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); @@ -501,7 +1062,7 @@ template struct MaskPositionsStridedFactoryForInt32 { using cumsumT = std::int32_t; fnT fn = - accumulate_strided_impl>; + cumsum_val_strided_impl>; return fn; } }; @@ -512,7 +1073,7 @@ 
template struct MaskPositionsStridedFactoryForInt64 { using cumsumT = std::int64_t; fnT fn = - accumulate_strided_impl>; + cumsum_val_strided_impl>; return fn; } }; @@ -524,7 +1085,7 @@ template struct Cumsum1DStridedFactory if constexpr (std::is_integral_v) { using cumsumT = std::int64_t; fnT fn = - accumulate_strided_impl>; + cumsum_val_strided_impl>; return fn; } else { diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp index 6301b3b9bc..75ab1d9341 100644 --- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp @@ -25,6 +25,7 @@ #pragma once #include #include +#include #include #include #include @@ -154,6 +155,31 @@ T custom_reduce_over_group(const GroupT &wg, return sycl::group_broadcast(wg, red_val_over_wg); } +template +T custom_inclusive_scan_over_group(const GroupT &wg, + LocAccT local_mem_acc, + const T local_val, + const OpT &op) +{ + const std::uint32_t local_id = wg.get_local_id(0); + const std::uint32_t wgs = wg.get_local_range(0); + local_mem_acc[local_id] = local_val; + + sycl::group_barrier(wg, sycl::memory_scope::work_group); + + if (wg.leader()) { + T scan_val = local_mem_acc[0]; + for (std::uint32_t i = 1; i < wgs; ++i) { + scan_val = op(local_mem_acc[i], scan_val); + local_mem_acc[i] = scan_val; + } + } + + // ensure all work-items see the same SLM that leader updated + sycl::group_barrier(wg, sycl::memory_scope::work_group); + return local_mem_acc[local_id]; +} + // Reduction functors // Maximum diff --git a/dpctl/tensor/libtensor/source/accumulators.cpp b/dpctl/tensor/libtensor/source/accumulators.cpp index b82154558d..3a5a93620e 100644 --- a/dpctl/tensor/libtensor/source/accumulators.cpp +++ b/dpctl/tensor/libtensor/source/accumulators.cpp @@ -49,23 +49,23 @@ namespace py_internal namespace td_ns = dpctl::tensor::type_dispatch; -using dpctl::tensor::kernels::accumulators::accumulate_contig_impl_fn_ptr_t; -static accumulate_contig_impl_fn_ptr_t +using dpctl::tensor::kernels::accumulators::cumsum_val_contig_impl_fn_ptr_t; +static cumsum_val_contig_impl_fn_ptr_t mask_positions_contig_i64_dispatch_vector[td_ns::num_types]; -static accumulate_contig_impl_fn_ptr_t +static cumsum_val_contig_impl_fn_ptr_t mask_positions_contig_i32_dispatch_vector[td_ns::num_types]; -using dpctl::tensor::kernels::accumulators::accumulate_strided_impl_fn_ptr_t; -static accumulate_strided_impl_fn_ptr_t +using dpctl::tensor::kernels::accumulators::cumsum_val_strided_impl_fn_ptr_t; +static cumsum_val_strided_impl_fn_ptr_t mask_positions_strided_i64_dispatch_vector[td_ns::num_types]; -static accumulate_strided_impl_fn_ptr_t +static cumsum_val_strided_impl_fn_ptr_t mask_positions_strided_i32_dispatch_vector[td_ns::num_types]; void populate_mask_positions_dispatch_vectors(void) { using dpctl::tensor::kernels::accumulators:: MaskPositionsContigFactoryForInt64; - td_ns::DispatchVectorBuilder dvb1; @@ -73,7 +73,7 @@ void populate_mask_positions_dispatch_vectors(void) using dpctl::tensor::kernels::accumulators:: MaskPositionsContigFactoryForInt32; - td_ns::DispatchVectorBuilder dvb2; @@ -81,7 +81,7 @@ void populate_mask_positions_dispatch_vectors(void) using dpctl::tensor::kernels::accumulators:: MaskPositionsStridedFactoryForInt64; - td_ns::DispatchVectorBuilder dvb3; @@ -89,7 +89,7 @@ void populate_mask_positions_dispatch_vectors(void) using dpctl::tensor::kernels::accumulators:: MaskPositionsStridedFactoryForInt32; - td_ns::DispatchVectorBuilder dvb4; @@ -226,23 +226,23 @@ 
size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask, return total_set; } -using dpctl::tensor::kernels::accumulators::accumulate_strided_impl_fn_ptr_t; -static accumulate_strided_impl_fn_ptr_t +using dpctl::tensor::kernels::accumulators::cumsum_val_strided_impl_fn_ptr_t; +static cumsum_val_strided_impl_fn_ptr_t cumsum_1d_strided_dispatch_vector[td_ns::num_types]; -using dpctl::tensor::kernels::accumulators::accumulate_contig_impl_fn_ptr_t; -static accumulate_contig_impl_fn_ptr_t +using dpctl::tensor::kernels::accumulators::cumsum_val_contig_impl_fn_ptr_t; +static cumsum_val_contig_impl_fn_ptr_t cumsum_1d_contig_dispatch_vector[td_ns::num_types]; void populate_cumsum_1d_dispatch_vectors(void) { using dpctl::tensor::kernels::accumulators::Cumsum1DContigFactory; - td_ns::DispatchVectorBuilder dvb1; dvb1.populate_dispatch_vector(cumsum_1d_contig_dispatch_vector); using dpctl::tensor::kernels::accumulators::Cumsum1DStridedFactory; - td_ns::DispatchVectorBuilder dvb2; dvb2.populate_dispatch_vector(cumsum_1d_strided_dispatch_vector); diff --git a/dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp b/dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp new file mode 100644 index 0000000000..b051c1703d --- /dev/null +++ b/dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp @@ -0,0 +1,457 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kernels/accumulators.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +template +std::pair +py_accumulate_over_axis(const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_accumulate, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + std::vector const &depends, + const strided_fnT &strided_dispatch_table, + const contig_fnT &contig_dispatch_table) +{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + if (src_nd != dst_nd) { + throw py::value_error("The input and output arrays must have " + "the same array ranks"); + } + int iter_nd = src_nd - trailing_dims_to_accumulate; + if (trailing_dims_to_accumulate <= 0 || iter_nd < 0) { + throw py::value_error( + "trailing_dims_to_accumulate must be positive, but no " + "greater than rank of the input array"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + size_t iter_nelems(1); + for (int i = 0; same_shapes && (i < iter_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); + iter_nelems *= static_cast(src_shape_i); + } + + size_t acc_nelems(1); + for (int i = iter_nd; same_shapes && (i < src_nd); ++i) { + auto dst_shape_i = dst_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_i); + acc_nelems *= static_cast(dst_shape_i); + } + + if (!same_shapes) { + throw py::value_error( + "Destination shape does not match the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + if ((iter_nelems == 0) || (acc_nelems == 0)) { + return std::make_pair(sycl::event(), sycl::event()); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample( + dst, acc_nelems * iter_nelems); + + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + + std::vector host_task_events; + + if ((is_src_c_contig && is_dst_c_contig) && iter_nd == 0) { + auto fn = contig_dispatch_table[src_typeid][dst_typeid]; + if (fn == nullptr) { + throw std::runtime_error("Datatypes are not supported"); + } + + sycl::event acc_ev = fn(exec_q, acc_nelems, src_data, dst_data, + 
host_task_events, depends); + + return std::make_pair( + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {acc_ev}), + acc_ev); + } + + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + int acc_nd = trailing_dims_to_accumulate; + + using shT = std::vector; + shT acc_shape(std::begin(src_shape_vec) + iter_nd, std::end(src_shape_vec)); + + shT acc_src_strides(std::begin(src_strides_vec) + iter_nd, + std::end(src_strides_vec)); + + shT acc_dst_strides(std::begin(dst_strides_vec) + iter_nd, + std::end(dst_strides_vec)); + + shT iter_shape(std::begin(src_shape_vec), + std::begin(src_shape_vec) + iter_nd); + + shT iter_src_strides(std::begin(src_strides_vec), + std::begin(src_strides_vec) + iter_nd); + + shT iter_dst_strides(std::begin(dst_strides_vec), + std::begin(dst_strides_vec) + iter_nd); + + shT simplified_iter_shape; + shT simplified_iter_src_strides; + shT simplified_iter_dst_strides; + py::ssize_t iter_src_offset(0); + py::ssize_t iter_dst_offset(0); + + if (iter_nd == 0) { + iter_nd = 1; + simplified_iter_shape.push_back(1); + simplified_iter_src_strides.push_back(0); + simplified_iter_dst_strides.push_back(0); + } + else { + simplify_iteration_space( + iter_nd, src_shape_ptr, iter_src_strides, iter_dst_strides, + // output + simplified_iter_shape, simplified_iter_src_strides, + simplified_iter_dst_strides, iter_src_offset, iter_dst_offset); + } + + // Strided implementation + auto strided_fn = strided_dispatch_table[src_typeid][dst_typeid]; + if (strided_fn == nullptr) { + throw std::runtime_error("Datatypes are not supported"); + } + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_iter_shape, + simplified_iter_src_strides, simplified_iter_dst_strides, acc_shape, + acc_src_strides, acc_dst_strides); + py::ssize_t *packed_shapes_and_strides = std::get<0>(ptr_size_event_tuple); + if (packed_shapes_and_strides == nullptr) { + throw std::runtime_error("Unexpected error"); + } + const auto ©_shapes_strides_ev = std::get<2>(ptr_size_event_tuple); + + py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides; + py::ssize_t *acc_shapes_and_strides = + packed_shapes_and_strides + 3 * simplified_iter_shape.size(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), copy_shapes_strides_ev); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + + sycl::event acc_ev = strided_fn( + exec_q, iter_nelems, acc_nelems, src_data, iter_nd, + iter_shape_and_strides, iter_src_offset, iter_dst_offset, acc_nd, + acc_shapes_and_strides, dst_data, host_task_events, all_deps); + + sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(acc_ev); + const auto &ctx = exec_q.get_context(); + cgh.host_task([ctx, packed_shapes_and_strides] { + sycl::free(packed_shapes_and_strides, ctx); + }); + }); + host_task_events.push_back(temp_cleanup_ev); + + return std::make_pair( + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events), + acc_ev); +} + +template +std::pair py_accumulate_final_axis_include_initial( + const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + std::vector const &depends, + const strided_fnT &strided_dispatch_table, + const contig_fnT &contig_dispatch_table) +{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + 
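// For this include_initial entry point the accumulation always runs over the
// final axis: dst must match src in every iteration dimension, and its last
// axis is expected to be one element longer than the last axis of src (both
// properties are validated below).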
+ if (src_nd != dst_nd) { + throw py::value_error("The input and output arrays must have " + "the same array ranks"); + } + + constexpr int acc_nd = 1; + + int iter_nd = src_nd - acc_nd; + if (iter_nd < 0) { + throw py::value_error("accumulation axis must not be greater than rank " + "of the input array"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + size_t iter_nelems(1); + for (int i = 0; same_shapes && (i < iter_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); + iter_nelems *= static_cast(src_shape_i); + } + + size_t acc_nelems(1); + for (int i = iter_nd; same_shapes && (i < src_nd); ++i) { + auto dst_shape_i = dst_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_ptr[i] + 1 == dst_shape_i); + acc_nelems *= static_cast(dst_shape_i); + } + + if (!same_shapes) { + throw py::value_error( + "Destination shape does not match the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + if ((iter_nelems == 0) || (acc_nelems == 0)) { + return std::make_pair(sycl::event(), sycl::event()); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample( + dst, acc_nelems * iter_nelems); + + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + + std::vector host_task_events; + + if ((is_src_c_contig && is_dst_c_contig) && iter_nd == 0) { + auto fn = contig_dispatch_table[src_typeid][dst_typeid]; + if (fn == nullptr) { + throw std::runtime_error("Datatypes are not supported"); + } + + sycl::event acc_ev = fn(exec_q, acc_nelems, src_data, dst_data, + host_task_events, depends); + + return std::make_pair( + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {acc_ev}), + acc_ev); + } + + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + using shT = std::vector; + shT acc_shape(std::begin(src_shape_vec) + iter_nd, std::end(src_shape_vec)); + + shT acc_src_strides(std::begin(src_strides_vec) + iter_nd, + std::end(src_strides_vec)); + + shT acc_dst_strides(std::begin(dst_strides_vec) + iter_nd, + std::end(dst_strides_vec)); + + shT iter_shape(std::begin(src_shape_vec), + std::begin(src_shape_vec) + iter_nd); + + shT iter_src_strides(std::begin(src_strides_vec), + std::begin(src_strides_vec) + iter_nd); + + shT iter_dst_strides(std::begin(dst_strides_vec), + std::begin(dst_strides_vec) + iter_nd); + + shT simplified_iter_shape; + shT simplified_iter_src_strides; + shT simplified_iter_dst_strides; + py::ssize_t iter_src_offset(0); + py::ssize_t iter_dst_offset(0); + + if (iter_nd == 0) { + iter_nd = 1; + simplified_iter_shape.push_back(1); + 
        simplified_iter_src_strides.push_back(0);
+        simplified_iter_dst_strides.push_back(0);
+    }
+    else {
+        simplify_iteration_space(
+            iter_nd, src_shape_ptr, iter_src_strides, iter_dst_strides,
+            // output
+            simplified_iter_shape, simplified_iter_src_strides,
+            simplified_iter_dst_strides, iter_src_offset, iter_dst_offset);
+    }
+
+    // Strided implementation
+    auto strided_fn = strided_dispatch_table[src_typeid][dst_typeid];
+    if (strided_fn == nullptr) {
+        throw std::runtime_error("Datatypes are not supported");
+    }
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    const auto &ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events, simplified_iter_shape,
+        simplified_iter_src_strides, simplified_iter_dst_strides, acc_shape,
+        acc_src_strides, acc_dst_strides);
+    py::ssize_t *packed_shapes_and_strides = std::get<0>(ptr_size_event_tuple);
+    if (packed_shapes_and_strides == nullptr) {
+        throw std::runtime_error("Unexpected error");
+    }
+    const auto &copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple);
+
+    py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides;
+    py::ssize_t *acc_shapes_and_strides =
+        packed_shapes_and_strides + 3 * simplified_iter_shape.size();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.insert(all_deps.end(), copy_shapes_strides_ev);
+    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+
+    sycl::event acc_ev = strided_fn(
+        exec_q, iter_nelems, acc_nelems, src_data, iter_nd,
+        iter_shape_and_strides, iter_src_offset, iter_dst_offset, acc_nd,
+        acc_shapes_and_strides, dst_data, host_task_events, all_deps);
+
+    sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(acc_ev);
+        const auto &ctx = exec_q.get_context();
+        cgh.host_task([ctx, packed_shapes_and_strides] {
+            sycl::free(packed_shapes_and_strides, ctx);
+        });
+    });
+    host_task_events.push_back(temp_cleanup_ev);
+
+    return std::make_pair(
+        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events),
+        acc_ev);
+}
+
+/*!
@brief Template implementing Python API for querying accumulation + * type support */ +template +bool py_accumulate_dtype_supported(const py::dtype &input_dtype, + const py::dtype &output_dtype, + const fnT &dispatch_table) +{ + int arg_tn = + input_dtype.num(); // NumPy type numbers are the same as in dpctl + int out_tn = + output_dtype.num(); // NumPy type numbers are the same as in dpctl + int arg_typeid = -1; + int out_typeid = -1; + + auto array_types = td_ns::usm_ndarray_types(); + + try { + arg_typeid = array_types.typenum_to_lookup_id(arg_tn); + out_typeid = array_types.typenum_to_lookup_id(out_tn); + } catch (const std::exception &e) { + throw py::value_error(e.what()); + } + + if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 || + out_typeid >= td_ns::num_types) + { + throw std::runtime_error("Reduction type support check: lookup failed"); + } + + // remove_all_extents gets underlying type of table + using fn_ptrT = typename std::remove_all_extents::type; + fn_ptrT fn = nullptr; + + fn = dispatch_table[arg_typeid][out_typeid]; + + return (fn != nullptr); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/accumulators/accumulators_common.cpp b/dpctl/tensor/libtensor/source/accumulators/accumulators_common.cpp new file mode 100644 index 0000000000..4b7de25df7 --- /dev/null +++ b/dpctl/tensor/libtensor/source/accumulators/accumulators_common.cpp @@ -0,0 +1,50 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include + +#include "cumulative_logsumexp.hpp" +#include "cumulative_prod.hpp" +#include "cumulative_sum.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +/*! @brief Add accumulators to Python module */ +void init_accumulator_functions(py::module_ m) +{ + init_cumulative_logsumexp(m); + init_cumulative_prod(m); + init_cumulative_sum(m); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/accumulators/accumulators_common.hpp b/dpctl/tensor/libtensor/source/accumulators/accumulators_common.hpp new file mode 100644 index 0000000000..a61eb3a63a --- /dev/null +++ b/dpctl/tensor/libtensor/source/accumulators/accumulators_common.hpp @@ -0,0 +1,41 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_accumulator_functions(py::module_); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp b/dpctl/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp new file mode 100644 index 0000000000..4cf2ec6fe9 --- /dev/null +++ b/dpctl/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp @@ -0,0 +1,344 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include +#include + +#include "accumulate_over_axis.hpp" +#include "kernels/accumulators.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_dispatch_building.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace su_ns = dpctl::tensor::sycl_utils; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::accumulators::accumulate_1d_contig_impl_fn_ptr_t; +static accumulate_1d_contig_impl_fn_ptr_t + cumlogsumexp_1d_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; + +using dpctl::tensor::kernels::accumulators::accumulate_strided_impl_fn_ptr_t; +static accumulate_strided_impl_fn_ptr_t + cumlogsumexp_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static accumulate_1d_contig_impl_fn_ptr_t + cumlogsumexp_1d_include_initial_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +static accumulate_strided_impl_fn_ptr_t + cumlogsumexp_include_initial_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct TypePairSupportDataForLogSumExpAccumulation +{ + static constexpr bool is_defined = std::disjunction< + // disjunction is C++17 feature, supported by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct CumLogSumExp1DContigFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForLogSumExpAccumulation< + srcTy, dstTy>::is_defined) + { + using ScanOpT = su_ns::LogSumExp; + constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumLogSumExp1DIncludeInitialContigFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForLogSumExpAccumulation< + 
srcTy, dstTy>::is_defined) + { + using ScanOpT = su_ns::LogSumExp; + constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumLogSumExpStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForLogSumExpAccumulation< + srcTy, dstTy>::is_defined) + { + using ScanOpT = su_ns::LogSumExp; + constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumLogSumExpIncludeInitialStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForLogSumExpAccumulation< + srcTy, dstTy>::is_defined) + { + using ScanOpT = su_ns::LogSumExp; + constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +void populate_cumlogsumexp_dispatch_tables(void) +{ + td_ns::DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(cumlogsumexp_1d_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(cumlogsumexp_strided_dispatch_table); + + td_ns::DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table( + cumlogsumexp_1d_include_initial_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table( + cumlogsumexp_include_initial_strided_dispatch_table); + + return; +} + +} // namespace impl + +void init_cumulative_logsumexp(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + + using impl::populate_cumlogsumexp_dispatch_tables; + populate_cumlogsumexp_dispatch_tables(); + + using impl::cumlogsumexp_1d_contig_dispatch_table; + using impl::cumlogsumexp_strided_dispatch_table; + auto cumlogsumexp_pyapi = [&](const arrayT &src, + int trailing_dims_to_accumulate, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_accumulate_over_axis; + return py_accumulate_over_axis(src, trailing_dims_to_accumulate, dst, + exec_q, depends, + cumlogsumexp_strided_dispatch_table, + cumlogsumexp_1d_contig_dispatch_table); + }; + m.def("_cumlogsumexp_over_axis", cumlogsumexp_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_accumulate"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + using impl::cumlogsumexp_1d_include_initial_contig_dispatch_table; + using 
impl::cumlogsumexp_include_initial_strided_dispatch_table; + auto cumlogsumexp_include_initial_pyapi = + [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal:: + py_accumulate_final_axis_include_initial; + return py_accumulate_final_axis_include_initial( + src, dst, exec_q, depends, + cumlogsumexp_include_initial_strided_dispatch_table, + cumlogsumexp_1d_include_initial_contig_dispatch_table); + }; + m.def("_cumlogsumexp_final_axis_include_initial", + cumlogsumexp_include_initial_pyapi, "", py::arg("src"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + auto cumlogsumexp_dtype_supported = [&](const py::dtype &input_dtype, + const py::dtype &output_dtype) { + using dpctl::tensor::py_internal::py_accumulate_dtype_supported; + return py_accumulate_dtype_supported( + input_dtype, output_dtype, cumlogsumexp_strided_dispatch_table); + }; + m.def("_cumlogsumexp_dtype_supported", cumlogsumexp_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype")); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/accumulators/cumulative_logsumexp.hpp b/dpctl/tensor/libtensor/source/accumulators/cumulative_logsumexp.hpp new file mode 100644 index 0000000000..991508d5d7 --- /dev/null +++ b/dpctl/tensor/libtensor/source/accumulators/cumulative_logsumexp.hpp @@ -0,0 +1,41 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_cumulative_logsumexp(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/accumulators/cumulative_prod.cpp b/dpctl/tensor/libtensor/source/accumulators/cumulative_prod.cpp new file mode 100644 index 0000000000..7982e0ff41 --- /dev/null +++ b/dpctl/tensor/libtensor/source/accumulators/cumulative_prod.cpp @@ -0,0 +1,348 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include +#include + +#include "accumulate_over_axis.hpp" +#include "kernels/accumulators.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_dispatch_building.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace su_ns = dpctl::tensor::sycl_utils; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::accumulators::accumulate_1d_contig_impl_fn_ptr_t; +static accumulate_1d_contig_impl_fn_ptr_t + cumprod_1d_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; + +using dpctl::tensor::kernels::accumulators::accumulate_strided_impl_fn_ptr_t; +static accumulate_strided_impl_fn_ptr_t + cumprod_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static accumulate_1d_contig_impl_fn_ptr_t + cumprod_1d_include_initial_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +static accumulate_strided_impl_fn_ptr_t + cumprod_include_initial_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct TypePairSupportDataForProdAccumulation +{ + + // disjunction is C++17 feature, supported by DPC++ input bool + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct CumProd1DContigFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForProdAccumulation::is_defined) + { + using ScanOpT = sycl::multiplies; + constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using 
dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumProd1DIncludeInitialContigFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForProdAccumulation::is_defined) + { + using ScanOpT = sycl::multiplies; + constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumProdStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForProdAccumulation::is_defined) + { + using ScanOpT = sycl::multiplies; + constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumProdIncludeInitialStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForProdAccumulation::is_defined) + { + using ScanOpT = sycl::multiplies; + constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +void populate_cumprod_dispatch_tables(void) +{ + td_ns::DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(cumprod_1d_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(cumprod_strided_dispatch_table); + + td_ns::DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table( + cumprod_1d_include_initial_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table( + cumprod_include_initial_strided_dispatch_table); + + return; +} + +} // namespace impl + +void init_cumulative_prod(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + + using impl::populate_cumprod_dispatch_tables; + populate_cumprod_dispatch_tables(); + + using impl::cumprod_1d_contig_dispatch_table; + using impl::cumprod_strided_dispatch_table; + auto cumprod_pyapi = [&](const arrayT &src, int trailing_dims_to_accumulate, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_accumulate_over_axis; + return py_accumulate_over_axis( + src, trailing_dims_to_accumulate, dst, exec_q, depends, + cumprod_strided_dispatch_table, cumprod_1d_contig_dispatch_table); + }; + m.def("_cumprod_over_axis", cumprod_pyapi, "", 
py::arg("src"), + py::arg("trailing_dims_to_accumulate"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + using impl::cumprod_1d_include_initial_contig_dispatch_table; + using impl::cumprod_include_initial_strided_dispatch_table; + auto cumprod_include_initial_pyapi = + [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal:: + py_accumulate_final_axis_include_initial; + return py_accumulate_final_axis_include_initial( + src, dst, exec_q, depends, + cumprod_include_initial_strided_dispatch_table, + cumprod_1d_include_initial_contig_dispatch_table); + }; + m.def("_cumprod_final_axis_include_initial", cumprod_include_initial_pyapi, + "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + auto cumprod_dtype_supported = [&](const py::dtype &input_dtype, + const py::dtype &output_dtype) { + using dpctl::tensor::py_internal::py_accumulate_dtype_supported; + return py_accumulate_dtype_supported(input_dtype, output_dtype, + cumprod_strided_dispatch_table); + }; + m.def("_cumprod_dtype_supported", cumprod_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype")); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/accumulators/cumulative_prod.hpp b/dpctl/tensor/libtensor/source/accumulators/cumulative_prod.hpp new file mode 100644 index 0000000000..b9cced2879 --- /dev/null +++ b/dpctl/tensor/libtensor/source/accumulators/cumulative_prod.hpp @@ -0,0 +1,41 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_cumulative_prod(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/accumulators/cumulative_sum.cpp b/dpctl/tensor/libtensor/source/accumulators/cumulative_sum.cpp new file mode 100644 index 0000000000..59e5830fe6 --- /dev/null +++ b/dpctl/tensor/libtensor/source/accumulators/cumulative_sum.cpp @@ -0,0 +1,347 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include +#include + +#include "accumulate_over_axis.hpp" +#include "kernels/accumulators.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_dispatch_building.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace su_ns = dpctl::tensor::sycl_utils; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::accumulators::accumulate_1d_contig_impl_fn_ptr_t; +static accumulate_1d_contig_impl_fn_ptr_t + cumsum_1d_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; + +using dpctl::tensor::kernels::accumulators::accumulate_strided_impl_fn_ptr_t; +static accumulate_strided_impl_fn_ptr_t + cumsum_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static accumulate_1d_contig_impl_fn_ptr_t + cumsum_1d_include_initial_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +static accumulate_strided_impl_fn_ptr_t + cumsum_include_initial_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct TypePairSupportDataForSumAccumulation +{ + + // disjunction is C++17 feature, supported by DPC++ input bool + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct CumSum1DContigFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForSumAccumulation::is_defined) + { + using ScanOpT = sycl::plus; + constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using 
dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumSum1DIncludeInitialContigFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForSumAccumulation::is_defined) + { + using ScanOpT = sycl::plus; + constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumSumStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForSumAccumulation::is_defined) + { + using ScanOpT = sycl::plus; + constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumSumIncludeInitialStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForSumAccumulation::is_defined) + { + using ScanOpT = sycl::plus; + constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +void populate_cumsum_dispatch_tables(void) +{ + td_ns::DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(cumsum_1d_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(cumsum_strided_dispatch_table); + + td_ns::DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table( + cumsum_1d_include_initial_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(cumsum_include_initial_strided_dispatch_table); + + return; +} + +} // namespace impl + +void init_cumulative_sum(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + + using impl::populate_cumsum_dispatch_tables; + populate_cumsum_dispatch_tables(); + + using impl::cumsum_1d_contig_dispatch_table; + using impl::cumsum_strided_dispatch_table; + auto cumsum_pyapi = [&](const arrayT &src, int trailing_dims_to_accumulate, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_accumulate_over_axis; + return py_accumulate_over_axis( + src, trailing_dims_to_accumulate, dst, exec_q, depends, + cumsum_strided_dispatch_table, cumsum_1d_contig_dispatch_table); + }; + m.def("_cumsum_over_axis", cumsum_pyapi, "", py::arg("src"), + 
py::arg("trailing_dims_to_accumulate"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + using impl::cumsum_1d_include_initial_contig_dispatch_table; + using impl::cumsum_include_initial_strided_dispatch_table; + auto cumsum_include_initial_pyapi = + [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal:: + py_accumulate_final_axis_include_initial; + return py_accumulate_final_axis_include_initial( + src, dst, exec_q, depends, + cumsum_include_initial_strided_dispatch_table, + cumsum_1d_include_initial_contig_dispatch_table); + }; + m.def("_cumsum_final_axis_include_initial", cumsum_include_initial_pyapi, + "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + auto cumsum_dtype_supported = [&](const py::dtype &input_dtype, + const py::dtype &output_dtype) { + using dpctl::tensor::py_internal::py_accumulate_dtype_supported; + return py_accumulate_dtype_supported(input_dtype, output_dtype, + cumsum_strided_dispatch_table); + }; + m.def("_cumsum_dtype_supported", cumsum_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype")); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/accumulators/cumulative_sum.hpp b/dpctl/tensor/libtensor/source/accumulators/cumulative_sum.hpp new file mode 100644 index 0000000000..68a4d5d282 --- /dev/null +++ b/dpctl/tensor/libtensor/source/accumulators/cumulative_sum.hpp @@ -0,0 +1,41 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_cumulative_sum(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/repeat.cpp b/dpctl/tensor/libtensor/source/repeat.cpp index f3f8f0b9f1..1e0be8adb3 100644 --- a/dpctl/tensor/libtensor/source/repeat.cpp +++ b/dpctl/tensor/libtensor/source/repeat.cpp @@ -296,9 +296,9 @@ py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, assert(std::equal(orthog_src_shape.begin(), orthog_src_shape.end(), orthog_dst_shape.begin())); - std::vector simplified_orthog_shape; - std::vector simplified_orthog_src_strides; - std::vector simplified_orthog_dst_strides; + shT simplified_orthog_shape; + shT simplified_orthog_src_strides; + shT simplified_orthog_dst_strides; const py::ssize_t *_shape = orthog_src_shape.data(); @@ -671,9 +671,9 @@ py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, assert(std::equal(orthog_src_shape.begin(), orthog_src_shape.end(), orthog_dst_shape.begin())); - std::vector simplified_orthog_shape; - std::vector simplified_orthog_src_strides; - std::vector simplified_orthog_dst_strides; + shT simplified_orthog_shape; + shT simplified_orthog_src_strides; + shT simplified_orthog_dst_strides; const py::ssize_t *_shape = orthog_src_shape.data(); diff --git a/dpctl/tensor/libtensor/source/tensor_accumulation.cpp b/dpctl/tensor/libtensor/source/tensor_accumulation.cpp new file mode 100644 index 0000000000..9aaa9bbc38 --- /dev/null +++ b/dpctl/tensor/libtensor/source/tensor_accumulation.cpp @@ -0,0 +1,35 @@ +//===-- tensor_accumulation.cpp - --*-C++-*-/===// +// Implementation of _tensor_accumulation_impl module +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include + +#include "accumulators/accumulators_common.hpp" + +namespace py = pybind11; + +PYBIND11_MODULE(_tensor_accumulation_impl, m) +{ + dpctl::tensor::py_internal::init_accumulator_functions(m); +} diff --git a/dpctl/tests/test_tensor_accumulation.py b/dpctl/tests/test_tensor_accumulation.py new file mode 100644 index 0000000000..97194ed5eb --- /dev/null +++ b/dpctl/tests/test_tensor_accumulation.py @@ -0,0 +1,411 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from random import randrange + +import pytest +from helper import get_queue_or_skip, skip_if_dtype_not_supported + +import dpctl.tensor as dpt +from dpctl.utils import ExecutionPlacementError + +sint_types = [ + dpt.int8, + dpt.int16, + dpt.int32, + dpt.int64, +] +uint_types = [ + dpt.uint8, + dpt.uint16, + dpt.uint32, + dpt.uint64, +] +rfp_types = [ + dpt.float16, + dpt.float32, + dpt.float64, +] +cfp_types = [ + dpt.complex64, + dpt.complex128, +] + +no_complex_types = [dpt.bool] + sint_types + uint_types + rfp_types + +all_types = [dpt.bool] + sint_types + uint_types + rfp_types + cfp_types + + +@pytest.mark.parametrize("dt", sint_types) +def test_contig_cumsum_sint(dt): + get_queue_or_skip() + n = 10000 + x = dpt.repeat(dpt.asarray([1, -1], dtype=dt), n) + + res = dpt.cumulative_sum(x, dtype=dt) + + ar = dpt.arange(n, dtype=dt) + expected = dpt.concat((1 + ar, dpt.flip(ar))) + assert dpt.all(res == expected) + + +@pytest.mark.parametrize("dt", sint_types) +def test_strided_cumsum_sint(dt): + get_queue_or_skip() + n = 10000 + x = dpt.repeat(dpt.asarray([1, -1], dtype=dt), 2 * n)[1::2] + + res = dpt.cumulative_sum(x, dtype=dt) + + ar = dpt.arange(n, dtype=dt) + expected = dpt.concat((1 + ar, dpt.flip(ar))) + assert dpt.all(res == expected) + + x2 = dpt.repeat(dpt.asarray([-1, 1], dtype=dt), 2 * n)[-1::-2] + + res = dpt.cumulative_sum(x2, dtype=dt) + + ar = dpt.arange(n, dtype=dt) + expected = dpt.concat((1 + ar, dpt.flip(ar))) + assert dpt.all(res == expected) + + +@pytest.mark.parametrize("dt", sint_types) +def test_contig_cumsum_axis_sint(dt): + get_queue_or_skip() + n0, n1 = 1000, 173 + x = dpt.repeat(dpt.asarray([1, -1], dtype=dt), n0) + m = dpt.tile(dpt.expand_dims(x, axis=1), (1, n1)) + + res = dpt.cumulative_sum(m, dtype=dt, axis=0) + + ar = dpt.arange(n0, dtype=dt) + expected = dpt.concat((1 + ar, dpt.flip(ar))) + assert dpt.all(res == dpt.expand_dims(expected, axis=1)) + + +@pytest.mark.parametrize("dt", sint_types) +def test_strided_cumsum_axis_sint(dt): + get_queue_or_skip() + n0, n1 = 1000, 173 + x = dpt.repeat(dpt.asarray([1, -1], dtype=dt), 2 * n0) + m = dpt.tile(dpt.expand_dims(x, axis=1), (1, n1))[1::2, ::-1] + + res = dpt.cumulative_sum(m, dtype=dt, axis=0) + + ar = 
dpt.arange(n0, dtype=dt) + expected = dpt.concat((1 + ar, dpt.flip(ar))) + assert dpt.all(res == dpt.expand_dims(expected, axis=1)) + + +def test_accumulate_scalar(): + get_queue_or_skip() + + s = dpt.asarray(1, dtype="i8") + r = dpt.cumulative_sum(s) + assert r == s + assert r.ndim == s.ndim + + r = dpt.cumulative_sum(s, include_initial=True) + r_expected = dpt.asarray([0, 1], dtype="i8") + assert dpt.all(r == r_expected) + + +def test_cumulative_sum_include_initial(): + get_queue_or_skip() + + n0, n1 = 3, 5 + x = dpt.ones((n0, n1), dtype="i4") + r = dpt.cumulative_sum(x, axis=0, include_initial=True) + assert dpt.all(r[0, :] == 0) + + r = dpt.cumulative_sum(x, axis=1, include_initial=True) + assert dpt.all(r[:, 0] == 0) + + x = dpt.ones(n1, dtype="i4") + r = dpt.cumulative_sum(x, include_initial=True) + assert r.shape == (n1 + 1,) + assert r[0] == 0 + + +def test_cumulative_prod_identity(): + get_queue_or_skip() + + x = dpt.zeros(5, dtype="i4") + r = dpt.cumulative_prod(x, include_initial=True) + assert r[0] == 1 + + +def test_cumulative_logsumexp_identity(): + get_queue_or_skip() + + x = dpt.ones(5, dtype="f4") + r = dpt.cumulative_logsumexp(x, include_initial=True) + assert r[0] == -dpt.inf + + +def test_accumulate_zero_size_dims(): + get_queue_or_skip() + + n0, n1, n2 = 3, 0, 5 + x = dpt.ones((n0, n1, n2), dtype="i8") + r = dpt.cumulative_sum(x, axis=1) + assert r.shape == x.shape + assert r.size == 0 + + r = dpt.cumulative_sum(x, axis=0) + assert r.shape == x.shape + assert r.size == 0 + + r = dpt.cumulative_sum(x, axis=1, include_initial=True) + assert r.shape == (n0, n1 + 1, n2) + assert r.size == (n0 * n2) + + r = dpt.cumulative_sum(x, axis=0, include_initial=True) + assert r.shape == (n0 + 1, n1, n2) + assert r.size == 0 + + +@pytest.mark.parametrize("arg_dtype", all_types) +def test_cumsum_arg_dtype_default_output_dtype_matrix(arg_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + n = 100 + x = dpt.ones(n, dtype=arg_dtype) + r = dpt.cumulative_sum(x) + + assert isinstance(r, dpt.usm_ndarray) + if x.dtype.kind == "i": + assert r.dtype.kind == "i" + elif x.dtype.kind == "u": + assert r.dtype.kind == "u" + elif x.dtype.kind == "fc": + assert r.dtype == arg_dtype + + r_expected = dpt.arange(1, n + 1, dtype=r.dtype) + + assert dpt.all(r == r_expected) + + +@pytest.mark.parametrize("arg_dtype", all_types) +@pytest.mark.parametrize("out_dtype", all_types) +def test_cumsum_arg_out_dtype_matrix(arg_dtype, out_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + skip_if_dtype_not_supported(out_dtype, q) + + n = 100 + x = dpt.ones(n, dtype=arg_dtype) + r = dpt.cumulative_sum(x, dtype=out_dtype) + + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == dpt.dtype(out_dtype) + if out_dtype == dpt.bool: + assert dpt.all(r) + else: + r_expected = dpt.arange(1, n + 1, dtype=out_dtype) + assert dpt.all(r == r_expected) + + +def test_accumulator_out_kwarg(): + q = get_queue_or_skip() + + n = 100 + + expected = dpt.arange(1, n + 1, dtype="i4", sycl_queue=q) + x = dpt.ones(n, dtype="i4", sycl_queue=q) + out = dpt.empty_like(x, dtype="i4") + dpt.cumulative_sum(x, dtype="i4", out=out) + assert dpt.all(expected == out) + + # overlap + x = dpt.ones(n, dtype="i4", sycl_queue=q) + dpt.cumulative_sum(x, dtype="i4", out=x) + assert dpt.all(x == expected) + + # axis before final axis + expected = dpt.broadcast_to( + dpt.arange(1, n + 1, dtype="i4", sycl_queue=q), (n, n) + ).mT + x = dpt.ones((n, n), dtype="i4", sycl_queue=q) + out = 
dpt.empty_like(x, dtype="i4") + dpt.cumulative_sum(x, axis=0, dtype="i4", out=out) + assert dpt.all(expected == out) + + # scalar + x = dpt.asarray(3, dtype="i4") + out = dpt.empty((), dtype="i4") + expected = 3 + dpt.cumulative_sum(x, dtype="i4", out=out) + assert expected == out + + +def test_accumulator_arg_validation(): + q1 = get_queue_or_skip() + q2 = get_queue_or_skip() + + n = 5 + x1 = dpt.ones((n, n), dtype="f4", sycl_queue=q1) + x2 = dpt.ones(n, dtype="f4", sycl_queue=q1) + + # must be usm_ndarray + with pytest.raises(TypeError): + dpt.cumulative_sum(dict()) + + # axis must be specified when input not 1D + with pytest.raises(ValueError): + dpt.cumulative_sum(x1) + + # out must be usm_ndarray + with pytest.raises(TypeError): + dpt.cumulative_sum(x2, out=dict()) + + # out must be writable + out_not_writable = dpt.empty_like(x2) + out_not_writable.flags.writable = False + with pytest.raises(ValueError): + dpt.cumulative_sum(x2, out=out_not_writable) + + # out must be expected shape + out_wrong_shape = dpt.ones(n + 1, dtype=x2.dtype, sycl_queue=q1) + with pytest.raises(ValueError): + dpt.cumulative_sum(x2, out=out_wrong_shape) + + # out must be expected dtype + out_wrong_dtype = dpt.empty_like(x2, dtype="i4") + with pytest.raises(ValueError): + dpt.cumulative_sum(x2, out=out_wrong_dtype) + + # compute follows data + out_wrong_queue = dpt.empty_like(x2, sycl_queue=q2) + with pytest.raises(ExecutionPlacementError): + dpt.cumulative_sum(x2, out=out_wrong_queue) + + +def test_cumsum_nan_propagation(): + get_queue_or_skip() + + n = 100 + x = dpt.ones(n, dtype="f4") + i = randrange(n) + x[i] = dpt.nan + + r = dpt.cumulative_sum(x) + assert dpt.all(dpt.isnan(r[i:])) + + +def test_cumprod_nan_propagation(): + get_queue_or_skip() + + n = 100 + x = dpt.ones(n, dtype="f4") + i = randrange(n) + x[i] = dpt.nan + + r = dpt.cumulative_prod(x) + assert dpt.all(dpt.isnan(r[i:])) + + +def test_logcumsumexp_nan_propagation(): + get_queue_or_skip() + + n = 100 + x = dpt.ones(n, dtype="f4") + i = randrange(n) + x[i] = dpt.nan + + r = dpt.cumulative_logsumexp(x) + assert dpt.all(dpt.isnan(r[i:])) + + +@pytest.mark.parametrize("arg_dtype", no_complex_types) +def test_logcumsumexp_arg_dtype_default_output_dtype_matrix(arg_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + x = dpt.ones(10, dtype=arg_dtype, sycl_queue=q) + r = dpt.cumulative_logsumexp(x) + + if arg_dtype.kind in "biu": + assert r.dtype.kind == "f" + else: + assert r.dtype == arg_dtype + + +def test_logcumsumexp_complex_error(): + get_queue_or_skip() + + x = dpt.ones(10, dtype="c8") + with pytest.raises(ValueError): + dpt.cumulative_logsumexp(x) + + +def test_cumprod_basic(): + get_queue_or_skip() + + n = 50 + val = 2 + x = dpt.full(n, val, dtype="i8") + r = dpt.cumulative_prod(x) + expected = dpt.pow(val, dpt.arange(1, n + 1, dtype="i8")) + + assert dpt.all(r == expected) + + x = dpt.tile(dpt.asarray([2, 0.5], dtype="f4"), 10000) + expected = dpt.tile(dpt.asarray([2, 1], dtype="f4"), 10000) + r = dpt.cumulative_prod(x) + assert dpt.all(r == expected) + + +def test_logcumsumexp_basic(): + get_queue_or_skip() + + dt = dpt.float32 + x = dpt.ones(1000, dtype=dt) + r = dpt.cumulative_logsumexp(x) + + expected = 1 + dpt.log(dpt.arange(1, 1001, dtype=dt)) + + tol = 4 * dpt.finfo(dt).resolution + assert dpt.allclose(r, expected, atol=tol, rtol=tol) + + +def geometric_series_closed_form(n, dtype=None, device=None): + """Closed form for cumulative_logsumexp(dpt.arange(-n, 0)) + + :math:`r[k] == -n + k + log(1 - exp(-k-1)) 
- log(1-exp(-1))` + """ + x = dpt.arange(-n, 0, dtype=dtype, device=device) + y = dpt.arange(-1, -n - 1, step=-1, dtype=dtype, device=device) + y = dpt.exp(y, out=y) + y = dpt.negative(y, out=y) + y = dpt.log1p(y, out=y) + y -= y[0] + return x + y + + +@pytest.mark.parametrize("fpdt", rfp_types) +def test_cumulative_logsumexp_closed_form(fpdt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(fpdt, q) + + n = 128 + r = dpt.cumulative_logsumexp(dpt.arange(-n, 0, dtype=fpdt, device=q)) + expected = geometric_series_closed_form(n, dtype=fpdt, device=q) + + tol = 4 * dpt.finfo(fpdt).eps + assert dpt.allclose(r, expected, atol=tol, rtol=tol)
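
For quick reference, a minimal usage sketch of the Python API added by this patch, mirroring the behaviour exercised by the tests above. The default accumulation dtype is device-dependent (it follows `_default_accumulation_dtype`), so the dtypes shown here are illustrative, not prescriptive:

import dpctl.tensor as dpt

x = dpt.ones(5, dtype="i4")

# cumulative sum along the only axis: [1, 2, 3, 4, 5]
s = dpt.cumulative_sum(x)

# include_initial=True prepends the identity of the scan (0 for a sum),
# so the accumulated axis grows by one element: [0, 1, 2, 3, 4, 5]
s0 = dpt.cumulative_sum(x, include_initial=True)

# an explicit accumulation dtype may be requested
p = dpt.cumulative_prod(dpt.full(4, 2, dtype="i4"), dtype="i8")  # [2, 4, 8, 16]

# cumulative_logsumexp is defined for boolean, integer and real
# floating-point inputs; complex inputs raise ValueError
r = dpt.cumulative_logsumexp(dpt.ones(4, dtype="f4"))

# arrays with more than one dimension require an explicit axis, and a
# preallocated `out` array of the expected shape and dtype may be supplied
m = dpt.ones((3, 4), dtype="i4")
out = dpt.empty_like(m)
dpt.cumulative_sum(m, axis=0, dtype="i4", out=out)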