diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py
index 8638fc6d29..4355fea442 100644
--- a/dpctl/tensor/__init__.py
+++ b/dpctl/tensor/__init__.py
@@ -90,6 +90,7 @@
 )
 from dpctl.tensor._reshape import reshape
 from dpctl.tensor._search_functions import where
+from dpctl.tensor._statistical_functions import mean, std, var
 from dpctl.tensor._usmarray import usm_ndarray
 from dpctl.tensor._utility_functions import all, any
@@ -336,6 +337,9 @@
     "clip",
     "logsumexp",
     "reduce_hypot",
+    "mean",
+    "std",
+    "var",
     "__array_api_version__",
     "__array_namespace_info__",
 ]
diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py
index f797d24b0b..059ba61030 100644
--- a/dpctl/tensor/_reduction.py
+++ b/dpctl/tensor/_reduction.py
@@ -83,7 +83,6 @@ def _reduction_over_axis(
     _reduction_fn,
     _dtype_supported,
     _default_reduction_type_fn,
-    _identity=None,
 ):
     if not isinstance(x, dpt.usm_ndarray):
         raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
@@ -106,23 +105,8 @@ def _reduction_over_axis(
         res_dt = _to_device_supported_dtype(res_dt, q.sycl_device)
 
     res_usm_type = x.usm_type
-    if x.size == 0:
-        if _identity is None:
-            raise ValueError("reduction does not support zero-size arrays")
-        else:
-            if keepdims:
-                res_shape = res_shape + (1,) * red_nd
-                inv_perm = sorted(range(nd), key=lambda d: perm[d])
-                res_shape = tuple(res_shape[i] for i in inv_perm)
-            return dpt.full(
-                res_shape,
-                _identity,
-                dtype=res_dt,
-                usm_type=res_usm_type,
-                sycl_queue=q,
-            )
     if red_nd == 0:
-        return dpt.astype(x, res_dt, copy=False)
+        return dpt.astype(x, res_dt, copy=True)
 
     host_tasks_list = []
     if _dtype_supported(inp_dt, res_dt, res_usm_type, q):
@@ -251,7 +235,6 @@ def sum(x, axis=None, dtype=None, keepdims=False):
         tri._sum_over_axis,
         tri._sum_over_axis_dtype_supported,
         _default_reduction_dtype,
-        _identity=0,
     )
@@ -312,7 +295,6 @@ def prod(x, axis=None, dtype=None, keepdims=False):
         tri._prod_over_axis,
         tri._prod_over_axis_dtype_supported,
         _default_reduction_dtype,
-        _identity=1,
     )
@@ -368,7 +350,6 @@ def logsumexp(x, axis=None, dtype=None, keepdims=False):
             inp_dt, res_dt
         ),
         _default_reduction_dtype_fp_types,
-        _identity=-dpt.inf,
     )
@@ -424,7 +405,6 @@ def reduce_hypot(x, axis=None, dtype=None, keepdims=False):
             inp_dt, res_dt
         ),
         _default_reduction_dtype_fp_types,
-        _identity=0,
     )
@@ -446,9 +426,19 @@ def _comparison_over_axis(x, axis, keepdims, _reduction_fn):
     res_dt = x.dtype
     res_usm_type = x.usm_type
     if x.size == 0:
-        raise ValueError("reduction does not support zero-size arrays")
+        if any([x.shape[i] == 0 for i in axis]):
+            raise ValueError(
+                "reduction cannot be performed over zero-size axes"
+            )
+        else:
+            return dpt.empty(
+                res_shape,
+                dtype=res_dt,
+                usm_type=res_usm_type,
+                sycl_queue=exec_q,
+            )
     if red_nd == 0:
-        return x
+        return dpt.copy(x)
 
     res = dpt.empty(
         res_shape,
@@ -549,7 +539,17 @@ def _search_over_axis(x, axis, keepdims, _reduction_fn):
     res_dt = ti.default_device_index_type(exec_q.sycl_device)
     res_usm_type = x.usm_type
     if x.size == 0:
-        raise ValueError("reduction does not support zero-size arrays")
+        if any([x.shape[i] == 0 for i in axis]):
+            raise ValueError(
+                "reduction cannot be performed over zero-size axes"
+            )
+        else:
+            return dpt.empty(
+                res_shape,
+                dtype=res_dt,
+                usm_type=res_usm_type,
+                sycl_queue=exec_q,
+            )
     if red_nd == 0:
         return dpt.zeros(
             res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q
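The `_reduction.py` changes above move zero-size handling out of Python for sum/prod-style reductions (the kernels now fill the identity themselves) while comparison- and search-style reductions, which have no identity, keep an explicit error for zero-size reduced axes. A minimal sketch of the resulting semantics (illustrative, not part of the patch):

    import dpctl.tensor as dpt

    x = dpt.empty((10, 0, 2), dtype="i4")

    # sum over the zero-size axis returns the identity, filled by the kernel
    print(dpt.sum(x, axis=1).shape)   # (10, 2), all zeros

    # max over a zero-size axis has no identity and still raises
    # dpt.max(x, axis=1)  -> ValueError

    # reducing a non-zero axis of an empty array yields an empty result
    print(dpt.max(x, axis=0).shape)   # (0, 2)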
diff --git a/dpctl/tensor/_statistical_functions.py b/dpctl/tensor/_statistical_functions.py
new file mode 100644
index 0000000000..54d748d2d2
--- /dev/null
+++ b/dpctl/tensor/_statistical_functions.py
@@ -0,0 +1,381 @@
+# Data Parallel Control (dpctl)
+#
+# Copyright 2020-2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from numpy.core.numeric import normalize_axis_tuple
+
+import dpctl
+import dpctl.tensor as dpt
+import dpctl.tensor._tensor_elementwise_impl as tei
+import dpctl.tensor._tensor_impl as ti
+import dpctl.tensor._tensor_reductions_impl as tri
+
+
+def _var_impl(x, axis, correction, keepdims):
+    nd = x.ndim
+    if axis is None:
+        axis = tuple(range(nd))
+    if not isinstance(axis, (tuple, list)):
+        axis = (axis,)
+    axis = normalize_axis_tuple(axis, nd, "axis")
+    perm = []
+    nelems = 1
+    for i in range(nd):
+        if i not in axis:
+            perm.append(i)
+        else:
+            nelems *= x.shape[i]
+    red_nd = len(axis)
+    perm = perm + list(axis)
+    q = x.sycl_queue
+    inp_dt = x.dtype
+    res_dt = (
+        inp_dt
+        if inp_dt.kind == "f"
+        else dpt.dtype(ti.default_device_fp_type(q))
+    )
+    res_usm_type = x.usm_type
+
+    deps = []
+    host_tasks_list = []
+    if inp_dt != res_dt:
+        buf = dpt.empty_like(x, dtype=res_dt)
+        ht_e_buf, c_e1 = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=x, dst=buf, sycl_queue=q
+        )
+        deps.append(c_e1)
+        host_tasks_list.append(ht_e_buf)
+    else:
+        buf = x
+    # calculate mean
+    buf2 = dpt.permute_dims(buf, perm)
+    res_shape = buf2.shape[: nd - red_nd]
+    # use keepdims=True path for later broadcasting
+    if red_nd == 0:
+        mean_ary = dpt.empty_like(buf)
+        ht_e1, c_e2 = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=buf, dst=mean_ary, sycl_queue=q
+        )
+        deps.append(c_e2)
+        host_tasks_list.append(ht_e1)
+    else:
+        mean_ary = dpt.empty(
+            res_shape,
+            dtype=res_dt,
+            usm_type=res_usm_type,
+            sycl_queue=q,
+        )
+        ht_e1, r_e1 = tri._sum_over_axis(
+            src=buf2,
+            trailing_dims_to_reduce=red_nd,
+            dst=mean_ary,
+            sycl_queue=q,
+            depends=deps,
+        )
+        host_tasks_list.append(ht_e1)
+        deps.append(r_e1)
+
+        mean_ary_shape = res_shape + (1,) * red_nd
+        inv_perm = sorted(range(nd), key=lambda d: perm[d])
+        mean_ary = dpt.permute_dims(
+            dpt.reshape(mean_ary, mean_ary_shape), inv_perm
+        )
+    # divide in-place to get mean
+    mean_ary_shape = mean_ary.shape
+    nelems_ary = dpt.asarray(
+        nelems, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
+    )
+    if nelems_ary.shape != mean_ary_shape:
+        nelems_ary = dpt.broadcast_to(nelems_ary, mean_ary_shape)
+    ht_e2, d_e1 = tei._divide_inplace(
+        lhs=mean_ary, rhs=nelems_ary, sycl_queue=q, depends=deps
+    )
+    host_tasks_list.append(ht_e2)
+    # subtract mean from original array to get deviations
+    dev_ary = dpt.empty_like(buf)
+    if mean_ary_shape != buf.shape:
+        mean_ary = dpt.broadcast_to(mean_ary, buf.shape)
+    ht_e4, su_e = tei._subtract(
+        src1=buf, src2=mean_ary, dst=dev_ary, sycl_queue=q, depends=[d_e1]
+    )
+    host_tasks_list.append(ht_e4)
+    # square deviations
+    ht_e5, sq_e = tei._square(
+        src=dev_ary, dst=dev_ary, sycl_queue=q, depends=[su_e]
+    )
+    host_tasks_list.append(ht_e5)
+    deps2 = []
+    # take sum of squared deviations
+    dev_ary2 = dpt.permute_dims(dev_ary, perm)
+    if red_nd == 0:
+        res = dev_ary
+        deps2.append(sq_e)
+    else:
+        res = dpt.empty(
+            res_shape,
+            dtype=res_dt,
+            usm_type=res_usm_type,
+            sycl_queue=q,
+        )
+        ht_e6, r_e2 = tri._sum_over_axis(
+            src=dev_ary2,
+            trailing_dims_to_reduce=red_nd,
+            dst=res,
+            sycl_queue=q,
+            depends=[sq_e],
+        )
+        host_tasks_list.append(ht_e6)
+        deps2.append(r_e2)
+
+        if keepdims:
+            res_shape = res_shape + (1,) * red_nd
+            inv_perm = sorted(range(nd), key=lambda d: perm[d])
+            res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm)
+    res_shape = res.shape
+    # when nelems - correction <= 0, yield nans
+    div = max(nelems - correction, 0)
+    if not div:
+        div = dpt.nan
+    div_ary = dpt.asarray(div, res_dt, usm_type=res_usm_type, sycl_queue=q)
+    # divide in-place again
+    if div_ary.shape != res_shape:
+        div_ary = dpt.broadcast_to(div_ary, res.shape)
+    ht_e7, d_e2 = tei._divide_inplace(
+        lhs=res, rhs=div_ary, sycl_queue=q, depends=deps2
+    )
+    host_tasks_list.append(ht_e7)
+    return res, [d_e2], host_tasks_list
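`_var_impl` schedules the textbook two-pass computation, Var(x) = sum((x - mean(x))**2) / max(N - correction, 0), entirely on the device, chaining each stage through SYCL events. An eager equivalent using public `dpctl.tensor` calls (a sketch for intuition only, without the event plumbing or dtype promotion):

    import dpctl.tensor as dpt

    x = dpt.asarray([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]], dtype="f4")
    n = x.shape[1]
    m = dpt.sum(x, axis=1, keepdims=True) / n   # mean, kept for broadcasting
    sq = dpt.square(x - m)                      # squared deviations
    v = dpt.sum(sq, axis=1) / max(n - 1, 0)     # divisor N - correction, here correction=1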
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + nd = x.ndim + if axis is None: + axis = tuple(range(nd)) + if not isinstance(axis, (tuple, list)): + axis = (axis,) + axis = normalize_axis_tuple(axis, nd, "axis") + perm = [] + nelems = 1 + for i in range(nd): + if i not in axis: + perm.append(i) + else: + nelems *= x.shape[i] + sum_nd = len(axis) + perm = perm + list(axis) + arr2 = dpt.permute_dims(x, perm) + res_shape = arr2.shape[: nd - sum_nd] + q = x.sycl_queue + inp_dt = x.dtype + res_dt = ( + x.dtype + if x.dtype.kind in "fc" + else dpt.dtype(ti.default_device_fp_type(q)) + ) + res_usm_type = x.usm_type + if sum_nd == 0: + return dpt.astype(x, res_dt, copy=True) + + s_e = [] + host_tasks_list = [] + if tri._sum_over_axis_dtype_supported(inp_dt, res_dt, res_usm_type, q): + res = dpt.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e1, r_e = tri._sum_over_axis( + src=arr2, trailing_dims_to_reduce=sum_nd, dst=res, sycl_queue=q + ) + host_tasks_list.append(ht_e1) + s_e.append(r_e) + else: + tmp = dpt.empty( + arr2.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr2, dst=tmp, sycl_queue=q + ) + host_tasks_list.append(ht_e_cpy) + res = dpt.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_red, r_e = tri._sum_over_axis( + src=tmp, + trailing_dims_to_reduce=sum_nd, + dst=res, + sycl_queue=q, + depends=[cpy_e], + ) + host_tasks_list.append(ht_e_red) + s_e.append(r_e) + + if keepdims: + res_shape = res_shape + (1,) * sum_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm) + + res_shape = res.shape + # in-place divide + den_dt = dpt.finfo(res_dt).dtype if res_dt.kind == "c" else res_dt + nelems_arr = dpt.asarray( + nelems, dtype=den_dt, usm_type=res_usm_type, sycl_queue=q + ) + if nelems_arr.shape != res_shape: + nelems_arr = dpt.broadcast_to(nelems_arr, res_shape) + ht_e2, _ = tei._divide_inplace( + lhs=res, rhs=nelems_arr, sycl_queue=q, depends=s_e + ) + host_tasks_list.append(ht_e2) + dpctl.SyclEvent.wait_for(host_tasks_list) + return res + + +def var(x, axis=None, correction=0.0, keepdims=False): + """var(x, axis=None, correction=0.0, keepdims=False) + + Calculates the variance of elements in the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which the variances must be computed. If a tuple + of unique integers, the variances are computed over multiple axes. + If `None`, the variance is computed over the entire array. + Default: `None`. + correction (Optional[float, int]): + degrees of freedom adjustment. The divisor used in calculating the + variance is `N-correction`, where `N` corresponds to the total + number of elements over which the variance is calculated. + Default: `0.0`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input array according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the variances. If the variance was computed + over the entire array, a zero-dimensional array is returned. 
+def var(x, axis=None, correction=0.0, keepdims=False):
+    """var(x, axis=None, correction=0.0, keepdims=False)
+
+    Calculates the variance of elements in the input array `x`.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int, Tuple[int, ...]]):
+            axis or axes along which the variances must be computed. If a tuple
+            of unique integers, the variances are computed over multiple axes.
+            If `None`, the variance is computed over the entire array.
+            Default: `None`.
+        correction (Optional[float, int]):
+            degrees of freedom adjustment. The divisor used in calculating the
+            variance is `N-correction`, where `N` corresponds to the total
+            number of elements over which the variance is calculated.
+            Default: `0.0`.
+        keepdims (Optional[bool]):
+            if `True`, the reduced axes (dimensions) are included in the result
+            as singleton dimensions, so that the returned array remains
+            compatible with the input array according to Array Broadcasting
+            rules. Otherwise, if `False`, the reduced axes are not included in
+            the returned array. Default: `False`.
+    Returns:
+        usm_ndarray:
+            an array containing the variances. If the variance was computed
+            over the entire array, a zero-dimensional array is returned.
+
+            If `x` has a real-valued floating-point data type, the returned
+            array will have the same data type as `x`.
+            If `x` has a boolean or integral data type, the returned array
+            will have the default floating point data type for the device
+            where input array `x` is allocated.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+
+    if not isinstance(correction, (int, float)):
+        raise TypeError(
+            "Expected a Python integer or float for `correction`, got "
+            f"{type(correction)}"
+        )
+
+    if x.dtype.kind == "c":
+        raise ValueError("`var` does not support complex types")
+
+    res, _, host_tasks_list = _var_impl(x, axis, correction, keepdims)
+    dpctl.SyclEvent.wait_for(host_tasks_list)
+    return res
+
+
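How `correction` changes the divisor, and the NaN behavior when `N - correction <= 0` (illustrative):

    import dpctl.tensor as dpt

    x = dpt.asarray([1.0, 2.0, 3.0, 4.0], dtype="f4")
    print(float(dpt.var(x)))                 # 1.25    (divisor N = 4)
    print(float(dpt.var(x, correction=1)))   # ~1.6667 (divisor N - 1 = 3)
    print(float(dpt.std(x, correction=1)))   # ~1.2910
    print(float(dpt.var(x, correction=4)))   # nan     (divisor clamps to 0)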
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + + if not isinstance(correction, (int, float)): + raise TypeError( + "Expected a Python integer or float for `correction`," + f"got {type(x)}" + ) + + if x.dtype.kind == "c": + raise ValueError("`std` does not support complex types") + + res, deps, host_tasks_list = _var_impl(x, axis, correction, keepdims) + ht_ev, _ = tei._sqrt( + src=res, dst=res, sycl_queue=res.sycl_queue, depends=deps + ) + host_tasks_list.append(ht_ev) + dpctl.SyclEvent.wait_for(host_tasks_list) + return res diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index 6651483c6c..adbf96be10 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -1009,6 +1009,9 @@ template class custom_reduction_over_group_temps_strided_krn; +template +class reduction_over_group_temps_empty_krn; + template class single_reduction_axis0_temps_contig_krn; @@ -1120,6 +1123,31 @@ sycl::event reduction_over_group_temps_strided_impl( constexpr resTy identity_val = su_ns::Identity::value; + if (reduction_nelems == 0) { + sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) { + using IndexerT = + dpctl::tensor::offset_utils::UnpackedStridedIndexer; + + const py::ssize_t *const &res_shape = iter_shape_and_strides; + const py::ssize_t *const &res_strides = + iter_shape_and_strides + 2 * iter_nd; + IndexerT res_indexer(iter_nd, iter_res_offset, res_shape, + res_strides); + using InitKernelName = + class reduction_over_group_temps_empty_krn; + cgh.depends_on(depends); + + cgh.parallel_for( + sycl::range<1>(iter_nelems), [=](sycl::id<1> id) { + auto res_offset = res_indexer(id[0]); + res_tp[res_offset] = identity_val; + }); + }); + + return res_init_ev; + } + const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); @@ -1244,7 +1272,7 @@ sycl::event reduction_over_group_temps_strided_impl( resTy *partially_reduced_tmp2 = nullptr; if (partially_reduced_tmp == nullptr) { - throw std::runtime_error("Unabled to allocate device_memory"); + throw std::runtime_error("Unable to allocate device_memory"); } else { partially_reduced_tmp2 = @@ -1501,6 +1529,13 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( constexpr resTy identity_val = su_ns::Identity::value; + if (reduction_nelems == 0) { + sycl::event res_init_ev = exec_q.fill( + res_tp, resTy(identity_val), iter_nelems, depends); + + return res_init_ev; + } + const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); @@ -1632,7 +1667,7 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( resTy *partially_reduced_tmp2 = nullptr; if (partially_reduced_tmp == nullptr) { - throw std::runtime_error("Unabled to allocate device_memory"); + throw std::runtime_error("Unable to allocate device_memory"); } else { partially_reduced_tmp2 = @@ -1879,6 +1914,13 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( constexpr resTy identity_val = su_ns::Identity::value; + if (reduction_nelems == 0) { + sycl::event res_init_ev = exec_q.fill( + res_tp, resTy(identity_val), iter_nelems, depends); + + return res_init_ev; + } + const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); size_t wg = 
diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp
index 6651483c6c..adbf96be10 100644
--- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp
@@ -1009,6 +1009,9 @@
 template 
 class custom_reduction_over_group_temps_strided_krn;
 
+template 
+class reduction_over_group_temps_empty_krn;
+
 template 
 class single_reduction_axis0_temps_contig_krn;
@@ -1120,6 +1123,31 @@ sycl::event reduction_over_group_temps_strided_impl(
     constexpr resTy identity_val = su_ns::Identity::value;
 
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) {
+            using IndexerT =
+                dpctl::tensor::offset_utils::UnpackedStridedIndexer;
+
+            const py::ssize_t *const &res_shape = iter_shape_and_strides;
+            const py::ssize_t *const &res_strides =
+                iter_shape_and_strides + 2 * iter_nd;
+            IndexerT res_indexer(iter_nd, iter_res_offset, res_shape,
+                                 res_strides);
+            using InitKernelName =
+                class reduction_over_group_temps_empty_krn;
+            cgh.depends_on(depends);
+
+            cgh.parallel_for(
+                sycl::range<1>(iter_nelems), [=](sycl::id<1> id) {
+                    auto res_offset = res_indexer(id[0]);
+                    res_tp[res_offset] = identity_val;
+                });
+        });
+
+        return res_init_ev;
+    }
+
     const sycl::device &d = exec_q.get_device();
     const auto &sg_sizes = d.get_info();
     size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
@@ -1244,7 +1272,7 @@ sycl::event reduction_over_group_temps_strided_impl(
         resTy *partially_reduced_tmp2 = nullptr;
 
         if (partially_reduced_tmp == nullptr) {
-            throw std::runtime_error("Unabled to allocate device_memory");
+            throw std::runtime_error("Unable to allocate device_memory");
         }
         else {
             partially_reduced_tmp2 =
@@ -1501,6 +1529,13 @@ sycl::event reduction_axis1_over_group_temps_contig_impl(
     constexpr resTy identity_val = su_ns::Identity::value;
 
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.fill(
+            res_tp, resTy(identity_val), iter_nelems, depends);
+
+        return res_init_ev;
+    }
+
     const sycl::device &d = exec_q.get_device();
     const auto &sg_sizes = d.get_info();
     size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
@@ -1632,7 +1667,7 @@
         resTy *partially_reduced_tmp2 = nullptr;
 
         if (partially_reduced_tmp == nullptr) {
-            throw std::runtime_error("Unabled to allocate device_memory");
+            throw std::runtime_error("Unable to allocate device_memory");
         }
         else {
             partially_reduced_tmp2 =
@@ -1879,6 +1914,13 @@ sycl::event reduction_axis0_over_group_temps_contig_impl(
     constexpr resTy identity_val = su_ns::Identity::value;
 
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.fill(
+            res_tp, resTy(identity_val), iter_nelems, depends);
+
+        return res_init_ev;
+    }
+
     const sycl::device &d = exec_q.get_device();
     const auto &sg_sizes = d.get_info();
     size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
@@ -2015,7 +2057,7 @@
         resTy *partially_reduced_tmp2 = nullptr;
 
         if (partially_reduced_tmp == nullptr) {
-            throw std::runtime_error("Unabled to allocate device_memory");
+            throw std::runtime_error("Unable to allocate device_memory");
        }
         else {
             partially_reduced_tmp2 =
@@ -2712,12 +2754,16 @@ struct TypePairSupportDataForSumReductionTemps
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
         // input int8_t
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
         // input uint8_t
         td_ns::TypePairDefinedEntry,
@@ -2727,11 +2773,15 @@ struct TypePairSupportDataForSumReductionTemps
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
         // input int16_t
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
         // input uint16_t
         td_ns::TypePairDefinedEntry,
@@ -2739,20 +2789,30 @@ struct TypePairSupportDataForSumReductionTemps
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
         // input int32_t
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
         // input uint32_t
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
         // input int64_t
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
-        // input uint32_t
+        // input uint64_t
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
         // input half
         td_ns::TypePairDefinedEntry,
@@ -2967,12 +3027,16 @@ struct TypePairSupportDataForProductReductionTemps
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
         // input int8_t
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
         // input uint8_t
         td_ns::TypePairDefinedEntry,
@@ -2982,11 +3046,15 @@ struct TypePairSupportDataForProductReductionTemps
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
         // input int16_t
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
         // input uint16_t
         td_ns::TypePairDefinedEntry,
@@ -2994,20 +3062,30 @@ struct TypePairSupportDataForProductReductionTemps
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
         // input int32_t
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
         // input uint32_t
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
         // input int64_t
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
         // input uint32_t
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
         // input half
         td_ns::TypePairDefinedEntry,
@@ -3957,6 +4035,8 @@
 template 
 class custom_search_over_group_temps_strided_krn;
 
+template  class search_empty_krn;
+
 template 
     constexpr argTy identity_val = su_ns::Identity::value;
     constexpr resTy idx_identity_val = su_ns::Identity::value;
 
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) {
+            using IndexerT =
+                dpctl::tensor::offset_utils::UnpackedStridedIndexer;
+
+            const py::ssize_t *const &res_shape = iter_shape_and_strides;
+            const py::ssize_t *const &res_strides =
+                iter_shape_and_strides + 2 * iter_nd;
+            IndexerT res_indexer(iter_nd, iter_res_offset, res_shape,
+                                 res_strides);
+            using InitKernelName = class search_empty_krn;
+            cgh.depends_on(depends);
+
+            cgh.parallel_for(
+                sycl::range<1>(iter_nelems), [=](sycl::id<1> id) {
+                    auto res_offset = res_indexer(id[0]);
+                    res_tp[res_offset] = idx_identity_val;
+                });
+        });
+
+        return res_init_ev;
+    }
+
     const sycl::device &d = exec_q.get_device();
     const auto &sg_sizes = d.get_info();
     size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
@@ -4590,6 +4694,13 @@ sycl::event search_axis1_over_group_temps_contig_impl(
     constexpr argTy identity_val = su_ns::Identity::value;
     constexpr resTy idx_identity_val = su_ns::Identity::value;
 
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.fill(
+            res_tp, resTy(idx_identity_val), iter_nelems, depends);
+
+        return res_init_ev;
+    }
+
     const sycl::device &d = exec_q.get_device();
     const auto &sg_sizes = d.get_info();
     size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
@@ -5005,6 +5116,13 @@ sycl::event search_axis0_over_group_temps_contig_impl(
     constexpr argTy identity_val = su_ns::Identity::value;
     constexpr resTy idx_identity_val = su_ns::Identity::value;
 
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.fill(
+            res_tp, resTy(idx_identity_val), iter_nelems, depends);
+
+        return res_init_ev;
+    }
+
     const sycl::device &d = exec_q.get_device();
     const auto &sg_sizes = d.get_info();
     size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
diff --git a/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp b/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp
index f1b924dd47..5aafe38a40 100644
--- a/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp
+++ b/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp
@@ -205,6 +205,10 @@ std::pair py_reduction_over_axis(
     size_t dst_nelems = dst.get_size();
 
+    if (dst_nelems == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
     size_t reduction_nelems(1);
     for (int i = dst_nd; i < src_nd; ++i) {
         reduction_nelems *= static_cast(src_shape_ptr[i]);
     }
@@ -551,6 +555,10 @@ std::pair py_tree_reduction_over_axis(
     size_t dst_nelems = dst.get_size();
 
+    if (dst_nelems == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
     size_t reduction_nelems(1);
     for (int i = dst_nd; i < src_nd; ++i) {
         reduction_nelems *= static_cast(src_shape_ptr[i]);
     }
@@ -842,6 +850,10 @@ std::pair py_search_over_axis(
     size_t dst_nelems = dst.get_size();
 
+    if (dst_nelems == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
     size_t reduction_nelems(1);
     for (int i = dst_nd; i < src_nd; ++i) {
         reduction_nelems *= static_cast(src_shape_ptr[i]);
     }
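The `py_*_over_axis` early returns above make reductions into zero-size outputs a no-op at the binding layer, mirroring the Python-level guards, so no kernel is ever launched for an empty iteration space. The net effect (illustrative):

    import dpctl.tensor as dpt

    x = dpt.empty((0, 5), dtype="f4")
    r = dpt.argmax(x, axis=1)   # output has zero elements; nothing to compute
    print(r.shape)              # (0,)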
diff --git a/dpctl/tests/test_tensor_statistical_functions.py b/dpctl/tests/test_tensor_statistical_functions.py
new file mode 100644
index 0000000000..8916833f86
--- /dev/null
+++ b/dpctl/tests/test_tensor_statistical_functions.py
@@ -0,0 +1,254 @@
+# Data Parallel Control (dpctl)
+#
+# Copyright 2020-2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+import dpctl.tensor as dpt
+from dpctl.tensor._tensor_impl import default_device_fp_type
+from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
+
+_no_complex_dtypes = [
+    "?",
+    "i1",
+    "u1",
+    "i2",
+    "u2",
+    "i4",
+    "u4",
+    "i8",
+    "u8",
+    "f2",
+    "f4",
+    "f8",
+]
+
+
+@pytest.mark.parametrize("dt", _no_complex_dtypes)
+def test_mean_dtypes(dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x = dpt.ones(10, dtype=dt)
+    res = dpt.mean(x)
+    assert res == 1
+    if x.dtype.kind in "biu":
+        assert res.dtype == dpt.dtype(default_device_fp_type(q))
+    else:
+        assert res.dtype == x.dtype
+
+
+@pytest.mark.parametrize("dt", _no_complex_dtypes)
+@pytest.mark.parametrize("py_zero", [float(0), int(0)])
+def test_std_var_dtypes(dt, py_zero):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x = dpt.ones(10, dtype=dt)
+    res = dpt.std(x, correction=py_zero)
+    assert res == 0
+    if x.dtype.kind in "biu":
+        assert res.dtype == dpt.dtype(default_device_fp_type(q))
+    else:
+        assert res.dtype == x.dtype
+
+    res = dpt.var(x, correction=py_zero)
+    assert res == 0
+    if x.dtype.kind in "biu":
+        assert res.dtype == dpt.dtype(default_device_fp_type(q))
+    else:
+        assert res.dtype == x.dtype
+
+
+def test_stat_fns_axis():
+    get_queue_or_skip()
+
+    x = dpt.ones((3, 4, 5, 6, 7), dtype="f4")
+    m = dpt.mean(x, axis=(1, 2, -1))
+
+    assert isinstance(m, dpt.usm_ndarray)
+    assert m.shape == (3, 6)
+    assert dpt.allclose(m, dpt.asarray(1, dtype=m.dtype))
+
+    s = dpt.var(x, axis=(1, 2, -1))
+    assert isinstance(s, dpt.usm_ndarray)
+    assert s.shape == (3, 6)
+    assert dpt.allclose(s, dpt.asarray(0, dtype=s.dtype))
+
+
+@pytest.mark.parametrize("fn", [dpt.mean, dpt.var])
+def test_stat_fns_empty(fn):
+    get_queue_or_skip()
+    x = dpt.empty((0,), dtype="f4")
+    r = fn(x)
+    assert r.shape == tuple()
+    assert dpt.isnan(r)
+
+    x = dpt.empty((10, 0, 2), dtype="f4")
+    r = fn(x, axis=1)
+    assert r.shape == (10, 2)
+    assert dpt.all(dpt.isnan(r))
+
+    r = fn(x, axis=0)
+    assert r.shape == (0, 2)
+    assert r.size == 0
+
+
+def test_stat_fns_keepdims():
+    get_queue_or_skip()
+
+    x = dpt.ones((3, 4, 5, 6, 7), dtype="f4")
+    m = dpt.mean(x, axis=(1, 2, -1), keepdims=True)
+
+    assert isinstance(m, dpt.usm_ndarray)
+    assert m.shape == (3, 1, 1, 6, 1)
+    assert dpt.allclose(m, dpt.asarray(1, dtype=m.dtype))
+
+    s = dpt.var(x, axis=(1, 2, -1), keepdims=True)
+    assert isinstance(s, dpt.usm_ndarray)
+    assert s.shape == (3, 1, 1, 6, 1)
+    assert dpt.allclose(s, dpt.asarray(0, dtype=s.dtype))
+
+
+def test_stat_fns_empty_axis():
+    get_queue_or_skip()
+
+    x = dpt.reshape(dpt.arange(3 * 4 * 5, dtype="f4"), (3, 4, 5))
+    m = dpt.mean(x, axis=())
+
+    assert x.shape == m.shape
+    assert dpt.all(x == m)
+
+    s = dpt.var(x, axis=())
+    assert x.shape == s.shape
+    assert dpt.all(s == 0)
+
+    d = dpt.std(x, axis=())
+    assert x.shape == d.shape
+    assert dpt.all(d == 0)
+
+
+def test_mean():
+    get_queue_or_skip()
+
+    x = dpt.reshape(dpt.arange(9, dtype="f4"), (3, 3))
+    m = dpt.mean(x)
+    expected = dpt.asarray(4, dtype="f4")
+    assert dpt.allclose(m, expected)
+
+    m = dpt.mean(x, axis=0)
+    expected = dpt.arange(3, 6, dtype="f4")
+    assert dpt.allclose(m, expected)
+
+    m = dpt.mean(x, axis=1)
+    expected = dpt.asarray([1, 4, 7], dtype="f4")
+    assert dpt.allclose(m, expected)
+
+
+def test_var_std():
+    get_queue_or_skip()
+
+    x = dpt.reshape(dpt.arange(9, dtype="f4"), (3, 3))
+    r = dpt.var(x)
+    expected = dpt.asarray(6.666666507720947, dtype="f4")
+    assert dpt.allclose(r, expected)
+
+    r1 = dpt.var(x, correction=3)
+    expected1 = dpt.asarray(10.0, dtype="f4")
+    assert dpt.allclose(r1, expected1)
+
+    r = dpt.std(x)
+    expected = dpt.sqrt(expected)
+    assert dpt.allclose(r, expected)
+
+    r1 = dpt.std(x, correction=3)
+    expected1 = dpt.sqrt(expected1)
+    assert dpt.allclose(r1, expected1)
+
+    r = dpt.var(x, axis=0)
+    expected = dpt.full(x.shape[1], 6, dtype="f4")
+    assert dpt.allclose(r, expected)
+
+    r1 = dpt.var(x, axis=0, correction=1)
+    expected1 = dpt.full(x.shape[1], 9, dtype="f4")
+    assert dpt.allclose(r1, expected1)
+
+    r = dpt.std(x, axis=0)
+    expected = dpt.sqrt(expected)
+    assert dpt.allclose(r, expected)
+
+    r1 = dpt.std(x, axis=0, correction=1)
+    expected1 = dpt.sqrt(expected1)
+    assert dpt.allclose(r1, expected1)
+
+    r = dpt.var(x, axis=1)
+    expected = dpt.full(x.shape[0], 0.6666666865348816, dtype="f4")
+    assert dpt.allclose(r, expected)
+
+    r1 = dpt.var(x, axis=1, correction=1)
+    expected1 = dpt.ones(x.shape[0], dtype="f4")
+    assert dpt.allclose(r1, expected1)
+
+    r = dpt.std(x, axis=1)
+    expected = dpt.sqrt(expected)
+    assert dpt.allclose(r, expected)
+
+    r1 = dpt.std(x, axis=1, correction=1)
+    expected1 = dpt.sqrt(expected1)
+    assert dpt.allclose(r1, expected1)
+
+
+def test_var_axis_length_correction():
+    get_queue_or_skip()
+
+    x = dpt.reshape(dpt.arange(9, dtype="f4"), (3, 3))
+
+    r = dpt.var(x, correction=x.size)
+    assert dpt.isnan(r)
+
+    r = dpt.var(x, axis=0, correction=x.shape[0])
+    assert dpt.all(dpt.isnan(r))
+
+    r = dpt.var(x, axis=1, correction=x.shape[1])
+    assert dpt.all(dpt.isnan(r))
+
+
+def test_stat_function_errors():
+    d = dict()
+    with pytest.raises(TypeError):
+        dpt.var(d)
+    with pytest.raises(TypeError):
+        dpt.std(d)
+    with pytest.raises(TypeError):
+        dpt.mean(d)
+
+    x = dpt.empty(1, dtype="f4")
+    with pytest.raises(TypeError):
+        dpt.var(x, axis=d)
+    with pytest.raises(TypeError):
+        dpt.std(x, axis=d)
+    with pytest.raises(TypeError):
+        dpt.mean(x, axis=d)
+
+    with pytest.raises(TypeError):
+        dpt.var(x, correction=d)
+    with pytest.raises(TypeError):
+        dpt.std(x, correction=d)
+
+    x = dpt.empty(1, dtype="c8")
+    with pytest.raises(ValueError):
+        dpt.var(x)
+    with pytest.raises(ValueError):
+        dpt.std(x)
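Beyond the bundled tests, a quick hand check against NumPy (not part of the patch; assumes NumPy is installed, with NumPy's `ddof` playing the role of `correction`):

    import numpy as np

    import dpctl.tensor as dpt

    x_np = np.random.default_rng(0).random((3, 4), dtype=np.float32)
    x = dpt.asarray(x_np)
    assert np.allclose(dpt.asnumpy(dpt.mean(x, axis=0)), x_np.mean(axis=0))
    assert np.allclose(dpt.asnumpy(dpt.var(x, axis=1)), x_np.var(axis=1, ddof=0))
    assert np.allclose(dpt.asnumpy(dpt.std(x, correction=1)), x_np.std(ddof=1))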