diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py
index 8638fc6d29..4355fea442 100644
--- a/dpctl/tensor/__init__.py
+++ b/dpctl/tensor/__init__.py
@@ -90,6 +90,7 @@
 )
 from dpctl.tensor._reshape import reshape
 from dpctl.tensor._search_functions import where
+from dpctl.tensor._statistical_functions import mean, std, var
 from dpctl.tensor._usmarray import usm_ndarray
 from dpctl.tensor._utility_functions import all, any
@@ -336,6 +337,9 @@
     "clip",
     "logsumexp",
     "reduce_hypot",
+    "mean",
+    "std",
+    "var",
     "__array_api_version__",
     "__array_namespace_info__",
 ]
diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py
index f797d24b0b..059ba61030 100644
--- a/dpctl/tensor/_reduction.py
+++ b/dpctl/tensor/_reduction.py
@@ -83,7 +83,6 @@ def _reduction_over_axis(
     _reduction_fn,
     _dtype_supported,
     _default_reduction_type_fn,
-    _identity=None,
 ):
     if not isinstance(x, dpt.usm_ndarray):
         raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
@@ -106,23 +105,8 @@ def _reduction_over_axis(
         res_dt = _to_device_supported_dtype(res_dt, q.sycl_device)
 
     res_usm_type = x.usm_type
-    if x.size == 0:
-        if _identity is None:
-            raise ValueError("reduction does not support zero-size arrays")
-        else:
-            if keepdims:
-                res_shape = res_shape + (1,) * red_nd
-                inv_perm = sorted(range(nd), key=lambda d: perm[d])
-                res_shape = tuple(res_shape[i] for i in inv_perm)
-            return dpt.full(
-                res_shape,
-                _identity,
-                dtype=res_dt,
-                usm_type=res_usm_type,
-                sycl_queue=q,
-            )
     if red_nd == 0:
-        return dpt.astype(x, res_dt, copy=False)
+        return dpt.astype(x, res_dt, copy=True)
 
     host_tasks_list = []
     if _dtype_supported(inp_dt, res_dt, res_usm_type, q):
@@ -251,7 +235,6 @@ def sum(x, axis=None, dtype=None, keepdims=False):
         tri._sum_over_axis,
         tri._sum_over_axis_dtype_supported,
         _default_reduction_dtype,
-        _identity=0,
     )
@@ -312,7 +295,6 @@ def prod(x, axis=None, dtype=None, keepdims=False):
         tri._prod_over_axis,
         tri._prod_over_axis_dtype_supported,
         _default_reduction_dtype,
-        _identity=1,
     )
@@ -368,7 +350,6 @@ def logsumexp(x, axis=None, dtype=None, keepdims=False):
             inp_dt, res_dt
         ),
         _default_reduction_dtype_fp_types,
-        _identity=-dpt.inf,
     )
@@ -424,7 +405,6 @@ def reduce_hypot(x, axis=None, dtype=None, keepdims=False):
             inp_dt, res_dt
         ),
         _default_reduction_dtype_fp_types,
-        _identity=0,
     )
@@ -446,9 +426,19 @@ def _comparison_over_axis(x, axis, keepdims, _reduction_fn):
     res_dt = x.dtype
     res_usm_type = x.usm_type
     if x.size == 0:
-        raise ValueError("reduction does not support zero-size arrays")
+        if any([x.shape[i] == 0 for i in axis]):
+            raise ValueError(
+                "reduction cannot be performed over zero-size axes"
+            )
+        else:
+            return dpt.empty(
+                res_shape,
+                dtype=res_dt,
+                usm_type=res_usm_type,
+                sycl_queue=exec_q,
+            )
     if red_nd == 0:
-        return x
+        return dpt.copy(x)
 
     res = dpt.empty(
         res_shape,
@@ -549,7 +539,17 @@ def _search_over_axis(x, axis, keepdims, _reduction_fn):
     res_dt = ti.default_device_index_type(exec_q.sycl_device)
     res_usm_type = x.usm_type
     if x.size == 0:
-        raise ValueError("reduction does not support zero-size arrays")
+        if any([x.shape[i] == 0 for i in axis]):
+            raise ValueError(
+                "reduction cannot be performed over zero-size axes"
+            )
+        else:
+            return dpt.empty(
+                res_shape,
+                dtype=res_dt,
+                usm_type=res_usm_type,
+                sycl_queue=exec_q,
+            )
     if red_nd == 0:
         return dpt.zeros(
             res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q
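The `_reduction.py` changes above move zero-size handling out of Python for sum/prod-style reductions (the kernels now fill the identity themselves) while comparison- and search-style reductions, which have no identity, keep an explicit error for zero-size reduced axes. A minimal sketch of the resulting semantics (illustrative, not part of the patch):

    import dpctl.tensor as dpt

    x = dpt.empty((10, 0, 2), dtype="i4")

    # sum over the zero-size axis returns the identity, filled by the kernel
    print(dpt.sum(x, axis=1).shape)   # (10, 2), all zeros

    # max over a zero-size axis has no identity and still raises
    # dpt.max(x, axis=1)  -> ValueError

    # reducing a non-zero axis of an empty array yields an empty result
    print(dpt.max(x, axis=0).shape)   # (0, 2)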
diff --git a/dpctl/tensor/_statistical_functions.py b/dpctl/tensor/_statistical_functions.py
new file mode 100644
index 0000000000..54d748d2d2
--- /dev/null
+++ b/dpctl/tensor/_statistical_functions.py
@@ -0,0 +1,381 @@
+# Data Parallel Control (dpctl)
+#
+# Copyright 2020-2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from numpy.core.numeric import normalize_axis_tuple
+
+import dpctl
+import dpctl.tensor as dpt
+import dpctl.tensor._tensor_elementwise_impl as tei
+import dpctl.tensor._tensor_impl as ti
+import dpctl.tensor._tensor_reductions_impl as tri
+
+
+def _var_impl(x, axis, correction, keepdims):
+    nd = x.ndim
+    if axis is None:
+        axis = tuple(range(nd))
+    if not isinstance(axis, (tuple, list)):
+        axis = (axis,)
+    axis = normalize_axis_tuple(axis, nd, "axis")
+    perm = []
+    nelems = 1
+    for i in range(nd):
+        if i not in axis:
+            perm.append(i)
+        else:
+            nelems *= x.shape[i]
+    red_nd = len(axis)
+    perm = perm + list(axis)
+    q = x.sycl_queue
+    inp_dt = x.dtype
+    res_dt = (
+        inp_dt
+        if inp_dt.kind == "f"
+        else dpt.dtype(ti.default_device_fp_type(q))
+    )
+    res_usm_type = x.usm_type
+
+    deps = []
+    host_tasks_list = []
+    if inp_dt != res_dt:
+        buf = dpt.empty_like(x, dtype=res_dt)
+        ht_e_buf, c_e1 = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=x, dst=buf, sycl_queue=q
+        )
+        deps.append(c_e1)
+        host_tasks_list.append(ht_e_buf)
+    else:
+        buf = x
+    # calculate mean
+    buf2 = dpt.permute_dims(buf, perm)
+    res_shape = buf2.shape[: nd - red_nd]
+    # use keepdims=True path for later broadcasting
+    if red_nd == 0:
+        mean_ary = dpt.empty_like(buf)
+        ht_e1, c_e2 = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=buf, dst=mean_ary, sycl_queue=q
+        )
+        deps.append(c_e2)
+        host_tasks_list.append(ht_e1)
+    else:
+        mean_ary = dpt.empty(
+            res_shape,
+            dtype=res_dt,
+            usm_type=res_usm_type,
+            sycl_queue=q,
+        )
+        ht_e1, r_e1 = tri._sum_over_axis(
+            src=buf2,
+            trailing_dims_to_reduce=red_nd,
+            dst=mean_ary,
+            sycl_queue=q,
+            depends=deps,
+        )
+        host_tasks_list.append(ht_e1)
+        deps.append(r_e1)
+
+        mean_ary_shape = res_shape + (1,) * red_nd
+        inv_perm = sorted(range(nd), key=lambda d: perm[d])
+        mean_ary = dpt.permute_dims(
+            dpt.reshape(mean_ary, mean_ary_shape), inv_perm
+        )
+    # divide in-place to get mean
+    mean_ary_shape = mean_ary.shape
+    nelems_ary = dpt.asarray(
+        nelems, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
+    )
+    if nelems_ary.shape != mean_ary_shape:
+        nelems_ary = dpt.broadcast_to(nelems_ary, mean_ary_shape)
+    ht_e2, d_e1 = tei._divide_inplace(
+        lhs=mean_ary, rhs=nelems_ary, sycl_queue=q, depends=deps
+    )
+    host_tasks_list.append(ht_e2)
+    # subtract mean from original array to get deviations
+    dev_ary = dpt.empty_like(buf)
+    if mean_ary_shape != buf.shape:
+        mean_ary = dpt.broadcast_to(mean_ary, buf.shape)
+    ht_e4, su_e = tei._subtract(
+        src1=buf, src2=mean_ary, dst=dev_ary, sycl_queue=q, depends=[d_e1]
+    )
+    host_tasks_list.append(ht_e4)
+    # square deviations
+    ht_e5, sq_e = tei._square(
+        src=dev_ary, dst=dev_ary, sycl_queue=q, depends=[su_e]
+    )
+    host_tasks_list.append(ht_e5)
+    deps2 = []
+    # take sum of squared deviations
+    dev_ary2 = dpt.permute_dims(dev_ary, perm)
+    if red_nd == 0:
+        res = dev_ary
+        deps2.append(sq_e)
+    else:
+        res = dpt.empty(
+            res_shape,
+            dtype=res_dt,
+            usm_type=res_usm_type,
+            sycl_queue=q,
+        )
+        ht_e6, r_e2 = tri._sum_over_axis(
+            src=dev_ary2,
+            trailing_dims_to_reduce=red_nd,
+            dst=res,
+            sycl_queue=q,
+            depends=[sq_e],
+        )
+        host_tasks_list.append(ht_e6)
+        deps2.append(r_e2)
+
+        if keepdims:
+            res_shape = res_shape + (1,) * red_nd
+            inv_perm = sorted(range(nd), key=lambda d: perm[d])
+            res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm)
+    res_shape = res.shape
+    # when nelems - correction <= 0, yield nans
+    div = max(nelems - correction, 0)
+    if not div:
+        div = dpt.nan
+    div_ary = dpt.asarray(div, res_dt, usm_type=res_usm_type, sycl_queue=q)
+    # divide in-place again
+    if div_ary.shape != res_shape:
+        div_ary = dpt.broadcast_to(div_ary, res.shape)
+    ht_e7, d_e2 = tei._divide_inplace(
+        lhs=res, rhs=div_ary, sycl_queue=q, depends=deps2
+    )
+    host_tasks_list.append(ht_e7)
+    return res, [d_e2], host_tasks_list
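`_var_impl` schedules the textbook two-pass computation, Var(x) = sum((x - mean(x))**2) / max(N - correction, 0), entirely on the device, chaining each stage through SYCL events. An eager equivalent using public `dpctl.tensor` calls (a sketch for intuition only, without the event plumbing or dtype promotion):

    import dpctl.tensor as dpt

    x = dpt.asarray([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]], dtype="f4")
    n = x.shape[1]
    m = dpt.sum(x, axis=1, keepdims=True) / n   # mean, kept for broadcasting
    sq = dpt.square(x - m)                      # squared deviations
    v = dpt.sum(sq, axis=1) / max(n - 1, 0)     # divisor N - correction, here correction=1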
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + nd = x.ndim + if axis is None: + axis = tuple(range(nd)) + if not isinstance(axis, (tuple, list)): + axis = (axis,) + axis = normalize_axis_tuple(axis, nd, "axis") + perm = [] + nelems = 1 + for i in range(nd): + if i not in axis: + perm.append(i) + else: + nelems *= x.shape[i] + sum_nd = len(axis) + perm = perm + list(axis) + arr2 = dpt.permute_dims(x, perm) + res_shape = arr2.shape[: nd - sum_nd] + q = x.sycl_queue + inp_dt = x.dtype + res_dt = ( + x.dtype + if x.dtype.kind in "fc" + else dpt.dtype(ti.default_device_fp_type(q)) + ) + res_usm_type = x.usm_type + if sum_nd == 0: + return dpt.astype(x, res_dt, copy=True) + + s_e = [] + host_tasks_list = [] + if tri._sum_over_axis_dtype_supported(inp_dt, res_dt, res_usm_type, q): + res = dpt.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e1, r_e = tri._sum_over_axis( + src=arr2, trailing_dims_to_reduce=sum_nd, dst=res, sycl_queue=q + ) + host_tasks_list.append(ht_e1) + s_e.append(r_e) + else: + tmp = dpt.empty( + arr2.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr2, dst=tmp, sycl_queue=q + ) + host_tasks_list.append(ht_e_cpy) + res = dpt.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_red, r_e = tri._sum_over_axis( + src=tmp, + trailing_dims_to_reduce=sum_nd, + dst=res, + sycl_queue=q, + depends=[cpy_e], + ) + host_tasks_list.append(ht_e_red) + s_e.append(r_e) + + if keepdims: + res_shape = res_shape + (1,) * sum_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm) + + res_shape = res.shape + # in-place divide + den_dt = dpt.finfo(res_dt).dtype if res_dt.kind == "c" else res_dt + nelems_arr = dpt.asarray( + nelems, dtype=den_dt, usm_type=res_usm_type, sycl_queue=q + ) + if nelems_arr.shape != res_shape: + nelems_arr = dpt.broadcast_to(nelems_arr, res_shape) + ht_e2, _ = tei._divide_inplace( + lhs=res, rhs=nelems_arr, sycl_queue=q, depends=s_e + ) + host_tasks_list.append(ht_e2) + dpctl.SyclEvent.wait_for(host_tasks_list) + return res + + +def var(x, axis=None, correction=0.0, keepdims=False): + """var(x, axis=None, correction=0.0, keepdims=False) + + Calculates the variance of elements in the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which the variances must be computed. If a tuple + of unique integers, the variances are computed over multiple axes. + If `None`, the variance is computed over the entire array. + Default: `None`. + correction (Optional[float, int]): + degrees of freedom adjustment. The divisor used in calculating the + variance is `N-correction`, where `N` corresponds to the total + number of elements over which the variance is calculated. + Default: `0.0`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input array according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the variances. If the variance was computed + over the entire array, a zero-dimensional array is returned. 
+def var(x, axis=None, correction=0.0, keepdims=False):
+    """var(x, axis=None, correction=0.0, keepdims=False)
+
+    Calculates the variance of elements in the input array `x`.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int, Tuple[int, ...]]):
+            axis or axes along which the variances must be computed. If a tuple
+            of unique integers, the variances are computed over multiple axes.
+            If `None`, the variance is computed over the entire array.
+            Default: `None`.
+        correction (Optional[float, int]):
+            degrees of freedom adjustment. The divisor used in calculating the
+            variance is `N-correction`, where `N` corresponds to the total
+            number of elements over which the variance is calculated.
+            Default: `0.0`.
+        keepdims (Optional[bool]):
+            if `True`, the reduced axes (dimensions) are included in the result
+            as singleton dimensions, so that the returned array remains
+            compatible with the input array according to Array Broadcasting
+            rules. Otherwise, if `False`, the reduced axes are not included in
+            the returned array. Default: `False`.
+    Returns:
+        usm_ndarray:
+            an array containing the variances. If the variance was computed
+            over the entire array, a zero-dimensional array is returned.
+
+            If `x` has a real-valued floating-point data type, the returned
+            array will have the same data type as `x`.
+            If `x` has a boolean or integral data type, the returned array
+            will have the default floating point data type for the device
+            where input array `x` is allocated.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+
+    if not isinstance(correction, (int, float)):
+        raise TypeError(
+            "Expected a Python integer or float for `correction`, got "
+            f"{type(correction)}"
+        )
+
+    if x.dtype.kind == "c":
+        raise ValueError("`var` does not support complex types")
+
+    res, _, host_tasks_list = _var_impl(x, axis, correction, keepdims)
+    dpctl.SyclEvent.wait_for(host_tasks_list)
+    return res
+
+
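How `correction` changes the divisor, and the NaN behavior when `N - correction <= 0` (illustrative):

    import dpctl.tensor as dpt

    x = dpt.asarray([1.0, 2.0, 3.0, 4.0], dtype="f4")
    print(float(dpt.var(x)))                 # 1.25    (divisor N = 4)
    print(float(dpt.var(x, correction=1)))   # ~1.6667 (divisor N - 1 = 3)
    print(float(dpt.std(x, correction=1)))   # ~1.2910
    print(float(dpt.var(x, correction=4)))   # nan     (divisor clamps to 0)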
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + + if not isinstance(correction, (int, float)): + raise TypeError( + "Expected a Python integer or float for `correction`," + f"got {type(x)}" + ) + + if x.dtype.kind == "c": + raise ValueError("`std` does not support complex types") + + res, deps, host_tasks_list = _var_impl(x, axis, correction, keepdims) + ht_ev, _ = tei._sqrt( + src=res, dst=res, sycl_queue=res.sycl_queue, depends=deps + ) + host_tasks_list.append(ht_ev) + dpctl.SyclEvent.wait_for(host_tasks_list) + return res diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index 6651483c6c..adbf96be10 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -1009,6 +1009,9 @@ template class custom_reduction_over_group_temps_strided_krn; +template +class reduction_over_group_temps_empty_krn; + template class single_reduction_axis0_temps_contig_krn; @@ -1120,6 +1123,31 @@ sycl::event reduction_over_group_temps_strided_impl( constexpr resTy identity_val = su_ns::Identity::value; + if (reduction_nelems == 0) { + sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) { + using IndexerT = + dpctl::tensor::offset_utils::UnpackedStridedIndexer; + + const py::ssize_t *const &res_shape = iter_shape_and_strides; + const py::ssize_t *const &res_strides = + iter_shape_and_strides + 2 * iter_nd; + IndexerT res_indexer(iter_nd, iter_res_offset, res_shape, + res_strides); + using InitKernelName = + class reduction_over_group_temps_empty_krn; + cgh.depends_on(depends); + + cgh.parallel_for( + sycl::range<1>(iter_nelems), [=](sycl::id<1> id) { + auto res_offset = res_indexer(id[0]); + res_tp[res_offset] = identity_val; + }); + }); + + return res_init_ev; + } + const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); @@ -1244,7 +1272,7 @@ sycl::event reduction_over_group_temps_strided_impl( resTy *partially_reduced_tmp2 = nullptr; if (partially_reduced_tmp == nullptr) { - throw std::runtime_error("Unabled to allocate device_memory"); + throw std::runtime_error("Unable to allocate device_memory"); } else { partially_reduced_tmp2 = @@ -1501,6 +1529,13 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( constexpr resTy identity_val = su_ns::Identity::value; + if (reduction_nelems == 0) { + sycl::event res_init_ev = exec_q.fill( + res_tp, resTy(identity_val), iter_nelems, depends); + + return res_init_ev; + } + const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); @@ -1632,7 +1667,7 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( resTy *partially_reduced_tmp2 = nullptr; if (partially_reduced_tmp == nullptr) { - throw std::runtime_error("Unabled to allocate device_memory"); + throw std::runtime_error("Unable to allocate device_memory"); } else { partially_reduced_tmp2 = @@ -1879,6 +1914,13 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( constexpr resTy identity_val = su_ns::Identity::value; + if (reduction_nelems == 0) { + sycl::event res_init_ev = exec_q.fill( + res_tp, resTy(identity_val), iter_nelems, depends); + + return res_init_ev; + } + const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); size_t wg = 
diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp
index 6651483c6c..adbf96be10 100644
--- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp
@@ -1009,6 +1009,9 @@
 template 
 class custom_reduction_over_group_temps_strided_krn;
 
+template 
+class reduction_over_group_temps_empty_krn;
+
 template 
 class single_reduction_axis0_temps_contig_krn;
@@ -1120,6 +1123,31 @@ sycl::event reduction_over_group_temps_strided_impl(
     constexpr resTy identity_val = su_ns::Identity::value;
 
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) {
+            using IndexerT =
+                dpctl::tensor::offset_utils::UnpackedStridedIndexer;
+
+            const py::ssize_t *const &res_shape = iter_shape_and_strides;
+            const py::ssize_t *const &res_strides =
+                iter_shape_and_strides + 2 * iter_nd;
+            IndexerT res_indexer(iter_nd, iter_res_offset, res_shape,
+                                 res_strides);
+            using InitKernelName =
+                class reduction_over_group_temps_empty_krn;
+            cgh.depends_on(depends);
+
+            cgh.parallel_for(
+                sycl::range<1>(iter_nelems), [=](sycl::id<1> id) {
+                    auto res_offset = res_indexer(id[0]);
+                    res_tp[res_offset] = identity_val;
+                });
+        });
+
+        return res_init_ev;
+    }
+
     const sycl::device &d = exec_q.get_device();
     const auto &sg_sizes = d.get_info();
     size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
@@ -1244,7 +1272,7 @@ sycl::event reduction_over_group_temps_strided_impl(
         resTy *partially_reduced_tmp2 = nullptr;
 
         if (partially_reduced_tmp == nullptr) {
-            throw std::runtime_error("Unabled to allocate device_memory");
+            throw std::runtime_error("Unable to allocate device_memory");
         }
         else {
             partially_reduced_tmp2 =
@@ -1501,6 +1529,13 @@ sycl::event reduction_axis1_over_group_temps_contig_impl(
     constexpr resTy identity_val = su_ns::Identity::value;
 
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.fill(
+            res_tp, resTy(identity_val), iter_nelems, depends);
+
+        return res_init_ev;
+    }
+
     const sycl::device &d = exec_q.get_device();
     const auto &sg_sizes = d.get_info();
     size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
@@ -1632,7 +1667,7 @@
         resTy *partially_reduced_tmp2 = nullptr;
 
         if (partially_reduced_tmp == nullptr) {
-            throw std::runtime_error("Unabled to allocate device_memory");
+            throw std::runtime_error("Unable to allocate device_memory");
         }
         else {
             partially_reduced_tmp2 =
@@ -1879,6 +1914,13 @@ sycl::event reduction_axis0_over_group_temps_contig_impl(
     constexpr resTy identity_val = su_ns::Identity::value;
 
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.fill(
+            res_tp, resTy(identity_val), iter_nelems, depends);
+
+        return res_init_ev;
+    }
+
     const sycl::device &d = exec_q.get_device();
     const auto &sg_sizes = d.get_info();
     size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
@@ -2015,7 +2057,7 @@
         resTy *partially_reduced_tmp2 = nullptr;
 
         if (partially_reduced_tmp == nullptr) {
-            throw std::runtime_error("Unabled to allocate device_memory");
+            throw std::runtime_error("Unable to allocate device_memory");
        }
         else {
             partially_reduced_tmp2 =
@@ -2712,12 +2754,16 @@ struct TypePairSupportDataForSumReductionTemps
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
         // input int8_t
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
         // input uint8_t
         td_ns::TypePairDefinedEntry,
@@ -2727,11 +2773,15 @@ struct TypePairSupportDataForSumReductionTemps
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
         // input int16_t
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
         // input uint16_t
         td_ns::TypePairDefinedEntry,
@@ -2739,20 +2789,30 @@ struct TypePairSupportDataForSumReductionTemps
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
         // input int32_t
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
         // input uint32_t
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
         // input int64_t
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
-        // input uint32_t
+        // input uint64_t
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
         // input half
         td_ns::TypePairDefinedEntry,
@@ -2967,12 +3027,16 @@ struct TypePairSupportDataForProductReductionTemps
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
         // input int8_t
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
         // input uint8_t
         td_ns::TypePairDefinedEntry,
@@ -2982,11 +3046,15 @@ struct TypePairSupportDataForProductReductionTemps
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
         // input int16_t
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
         // input uint16_t
         td_ns::TypePairDefinedEntry,
@@ -2994,20 +3062,30 @@ struct TypePairSupportDataForProductReductionTemps
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
         // input int32_t
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
         // input uint32_t
         td_ns::TypePairDefinedEntry,
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
         // input int64_t
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
         // input uint32_t
         td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
+        td_ns::TypePairDefinedEntry,
 
         // input half
         td_ns::TypePairDefinedEntry,
@@ -3957,6 +4035,8 @@
 template 
 class custom_search_over_group_temps_strided_krn;
 
+template  class search_empty_krn;
+
 template 
     constexpr argTy identity_val = su_ns::Identity::value;
     constexpr resTy idx_identity_val = su_ns::Identity::value;
 
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) {
+            using IndexerT =
+                dpctl::tensor::offset_utils::UnpackedStridedIndexer;
+
+            const py::ssize_t *const &res_shape = iter_shape_and_strides;
+            const py::ssize_t *const &res_strides =
+                iter_shape_and_strides + 2 * iter_nd;
+            IndexerT res_indexer(iter_nd, iter_res_offset, res_shape,
+                                 res_strides);
+            using InitKernelName = class search_empty_krn;
+            cgh.depends_on(depends);
+
+            cgh.parallel_for(
+                sycl::range<1>(iter_nelems), [=](sycl::id<1> id) {
+                    auto res_offset = res_indexer(id[0]);
+                    res_tp[res_offset] = idx_identity_val;
+                });
+        });
+
+        return res_init_ev;
+    }
+
     const sycl::device &d = exec_q.get_device();
     const auto &sg_sizes = d.get_info();
     size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
@@ -4590,6 +4694,13 @@ sycl::event search_axis1_over_group_temps_contig_impl(
     constexpr argTy identity_val = su_ns::Identity::value;
     constexpr resTy idx_identity_val = su_ns::Identity::value;
 
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.fill(
+            res_tp, resTy(idx_identity_val), iter_nelems, depends);
+
+        return res_init_ev;
+    }
+
     const sycl::device &d = exec_q.get_device();
     const auto &sg_sizes = d.get_info();
     size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
@@ -5005,6 +5116,13 @@ sycl::event search_axis0_over_group_temps_contig_impl(
     constexpr argTy identity_val = su_ns::Identity::value;
     constexpr resTy idx_identity_val = su_ns::Identity::value;
 
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.fill(
+            res_tp, resTy(idx_identity_val), iter_nelems, depends);
+
+        return res_init_ev;
+    }
+
     const sycl::device &d = exec_q.get_device();
     const auto &sg_sizes = d.get_info();
     size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
diff --git a/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp b/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp
index f1b924dd47..5aafe38a40 100644
--- a/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp
+++ b/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp
@@ -205,6 +205,10 @@ std::pair py_reduction_over_axis(
     size_t dst_nelems = dst.get_size();
 
+    if (dst_nelems == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
     size_t reduction_nelems(1);
     for (int i = dst_nd; i < src_nd; ++i) {
         reduction_nelems *= static_cast(src_shape_ptr[i]);
     }
@@ -551,6 +555,10 @@ std::pair py_tree_reduction_over_axis(
     size_t dst_nelems = dst.get_size();
 
+    if (dst_nelems == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
     size_t reduction_nelems(1);
     for (int i = dst_nd; i < src_nd; ++i) {
         reduction_nelems *= static_cast(src_shape_ptr[i]);
     }
@@ -842,6 +850,10 @@ std::pair py_search_over_axis(
     size_t dst_nelems = dst.get_size();
 
+    if (dst_nelems == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
     size_t reduction_nelems(1);
     for (int i = dst_nd; i < src_nd; ++i) {
         reduction_nelems *= static_cast(src_shape_ptr[i]);
     }
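The `py_*_over_axis` early returns above make reductions into zero-size outputs a no-op at the binding layer, mirroring the Python-level guards, so no kernel is ever launched for an empty iteration space. The net effect (illustrative):

    import dpctl.tensor as dpt

    x = dpt.empty((0, 5), dtype="f4")
    r = dpt.argmax(x, axis=1)   # output has zero elements; nothing to compute
    print(r.shape)              # (0,)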
diff --git a/dpctl/tests/test_tensor_statistical_functions.py b/dpctl/tests/test_tensor_statistical_functions.py
new file mode 100644
index 0000000000..8916833f86
--- /dev/null
+++ b/dpctl/tests/test_tensor_statistical_functions.py
@@ -0,0 +1,254 @@
+# Data Parallel Control (dpctl)
+#
+# Copyright 2020-2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+import dpctl.tensor as dpt
+from dpctl.tensor._tensor_impl import default_device_fp_type
+from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
+
+_no_complex_dtypes = [
+    "?",
+    "i1",
+    "u1",
+    "i2",
+    "u2",
+    "i4",
+    "u4",
+    "i8",
+    "u8",
+    "f2",
+    "f4",
+    "f8",
+]
+
+
+@pytest.mark.parametrize("dt", _no_complex_dtypes)
+def test_mean_dtypes(dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x = dpt.ones(10, dtype=dt)
+    res = dpt.mean(x)
+    assert res == 1
+    if x.dtype.kind in "biu":
+        assert res.dtype == dpt.dtype(default_device_fp_type(q))
+    else:
+        assert res.dtype == x.dtype
+
+
+@pytest.mark.parametrize("dt", _no_complex_dtypes)
+@pytest.mark.parametrize("py_zero", [float(0), int(0)])
+def test_std_var_dtypes(dt, py_zero):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x = dpt.ones(10, dtype=dt)
+    res = dpt.std(x, correction=py_zero)
+    assert res == 0
+    if x.dtype.kind in "biu":
+        assert res.dtype == dpt.dtype(default_device_fp_type(q))
+    else:
+        assert res.dtype == x.dtype
+
+    res = dpt.var(x, correction=py_zero)
+    assert res == 0
+    if x.dtype.kind in "biu":
+        assert res.dtype == dpt.dtype(default_device_fp_type(q))
+    else:
+        assert res.dtype == x.dtype
+
+
+def test_stat_fns_axis():
+    get_queue_or_skip()
+
+    x = dpt.ones((3, 4, 5, 6, 7), dtype="f4")
+    m = dpt.mean(x, axis=(1, 2, -1))
+
+    assert isinstance(m, dpt.usm_ndarray)
+    assert m.shape == (3, 6)
+    assert dpt.allclose(m, dpt.asarray(1, dtype=m.dtype))
+
+    s = dpt.var(x, axis=(1, 2, -1))
+    assert isinstance(s, dpt.usm_ndarray)
+    assert s.shape == (3, 6)
+    assert dpt.allclose(s, dpt.asarray(0, dtype=s.dtype))
+
+
+@pytest.mark.parametrize("fn", [dpt.mean, dpt.var])
+def test_stat_fns_empty(fn):
+    get_queue_or_skip()
+    x = dpt.empty((0,), dtype="f4")
+    r = fn(x)
+    assert r.shape == tuple()
+    assert dpt.isnan(r)
+
+    x = dpt.empty((10, 0, 2), dtype="f4")
+    r = fn(x, axis=1)
+    assert r.shape == (10, 2)
+    assert dpt.all(dpt.isnan(r))
+
+    r = fn(x, axis=0)
+    assert r.shape == (0, 2)
+    assert r.size == 0
+
+
+def test_stat_fns_keepdims():
+    get_queue_or_skip()
+
+    x = dpt.ones((3, 4, 5, 6, 7), dtype="f4")
+    m = dpt.mean(x, axis=(1, 2, -1), keepdims=True)
+
+    assert isinstance(m, dpt.usm_ndarray)
+    assert m.shape == (3, 1, 1, 6, 1)
+    assert dpt.allclose(m, dpt.asarray(1, dtype=m.dtype))
+
+    s = dpt.var(x, axis=(1, 2, -1), keepdims=True)
+    assert isinstance(s, dpt.usm_ndarray)
+    assert s.shape == (3, 1, 1, 6, 1)
+    assert dpt.allclose(s, dpt.asarray(0, dtype=s.dtype))
+
+
+def test_stat_fns_empty_axis():
+    get_queue_or_skip()
+
+    x = dpt.reshape(dpt.arange(3 * 4 * 5, dtype="f4"), (3, 4, 5))
+    m = dpt.mean(x, axis=())
+
+    assert x.shape == m.shape
+    assert dpt.all(x == m)
+
+    s = dpt.var(x, axis=())
+    assert x.shape == s.shape
+    assert dpt.all(s == 0)
+
+    d = dpt.std(x, axis=())
+    assert x.shape == d.shape
+    assert dpt.all(d == 0)
+
+
+def test_mean():
+    get_queue_or_skip()
+
+    x = dpt.reshape(dpt.arange(9, dtype="f4"), (3, 3))
+    m = dpt.mean(x)
+    expected = dpt.asarray(4, dtype="f4")
+    assert dpt.allclose(m, expected)
+
+    m = dpt.mean(x, axis=0)
+    expected = dpt.arange(3, 6, dtype="f4")
+    assert dpt.allclose(m, expected)
+
+    m = dpt.mean(x, axis=1)
+    expected = dpt.asarray([1, 4, 7], dtype="f4")
+    assert dpt.allclose(m, expected)
+
+
+def test_var_std():
+    get_queue_or_skip()
+
+    x = dpt.reshape(dpt.arange(9, dtype="f4"), (3, 3))
+    r = dpt.var(x)
+    expected = dpt.asarray(6.666666507720947, dtype="f4")
+    assert dpt.allclose(r, expected)
+
+    r1 = dpt.var(x, correction=3)
+    expected1 = dpt.asarray(10.0, dtype="f4")
+    assert dpt.allclose(r1, expected1)
+
+    r = dpt.std(x)
+    expected = dpt.sqrt(expected)
+    assert dpt.allclose(r, expected)
+
+    r1 = dpt.std(x, correction=3)
+    expected1 = dpt.sqrt(expected1)
+    assert dpt.allclose(r1, expected1)
+
+    r = dpt.var(x, axis=0)
+    expected = dpt.full(x.shape[1], 6, dtype="f4")
+    assert dpt.allclose(r, expected)
+
+    r1 = dpt.var(x, axis=0, correction=1)
+    expected1 = dpt.full(x.shape[1], 9, dtype="f4")
+    assert dpt.allclose(r1, expected1)
+
+    r = dpt.std(x, axis=0)
+    expected = dpt.sqrt(expected)
+    assert dpt.allclose(r, expected)
+
+    r1 = dpt.std(x, axis=0, correction=1)
+    expected1 = dpt.sqrt(expected1)
+    assert dpt.allclose(r1, expected1)
+
+    r = dpt.var(x, axis=1)
+    expected = dpt.full(x.shape[0], 0.6666666865348816, dtype="f4")
+    assert dpt.allclose(r, expected)
+
+    r1 = dpt.var(x, axis=1, correction=1)
+    expected1 = dpt.ones(x.shape[0], dtype="f4")
+    assert dpt.allclose(r1, expected1)
+
+    r = dpt.std(x, axis=1)
+    expected = dpt.sqrt(expected)
+    assert dpt.allclose(r, expected)
+
+    r1 = dpt.std(x, axis=1, correction=1)
+    expected1 = dpt.sqrt(expected1)
+    assert dpt.allclose(r1, expected1)
+
+
+def test_var_axis_length_correction():
+    get_queue_or_skip()
+
+    x = dpt.reshape(dpt.arange(9, dtype="f4"), (3, 3))
+
+    r = dpt.var(x, correction=x.size)
+    assert dpt.isnan(r)
+
+    r = dpt.var(x, axis=0, correction=x.shape[0])
+    assert dpt.all(dpt.isnan(r))
+
+    r = dpt.var(x, axis=1, correction=x.shape[1])
+    assert dpt.all(dpt.isnan(r))
+
+
+def test_stat_function_errors():
+    d = dict()
+    with pytest.raises(TypeError):
+        dpt.var(d)
+    with pytest.raises(TypeError):
+        dpt.std(d)
+    with pytest.raises(TypeError):
+        dpt.mean(d)
+
+    x = dpt.empty(1, dtype="f4")
+    with pytest.raises(TypeError):
+        dpt.var(x, axis=d)
+    with pytest.raises(TypeError):
+        dpt.std(x, axis=d)
+    with pytest.raises(TypeError):
+        dpt.mean(x, axis=d)
+
+    with pytest.raises(TypeError):
+        dpt.var(x, correction=d)
+    with pytest.raises(TypeError):
+        dpt.std(x, correction=d)
+
+    x = dpt.empty(1, dtype="c8")
+    with pytest.raises(ValueError):
+        dpt.var(x)
+    with pytest.raises(ValueError):
+        dpt.std(x)
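Beyond the bundled tests, a quick hand check against NumPy (not part of the patch; assumes NumPy is installed, with NumPy's `ddof` playing the role of `correction`):

    import numpy as np

    import dpctl.tensor as dpt

    x_np = np.random.default_rng(0).random((3, 4), dtype=np.float32)
    x = dpt.asarray(x_np)
    assert np.allclose(dpt.asnumpy(dpt.mean(x, axis=0)), x_np.mean(axis=0))
    assert np.allclose(dpt.asnumpy(dpt.var(x, axis=1)), x_np.var(axis=1, ddof=0))
    assert np.allclose(dpt.asnumpy(dpt.std(x, correction=1)), x_np.std(ddof=1))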