diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index aba009411d..8e1c3b3b57 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -50,6 +50,7 @@ set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reduction_over_axis.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp ) set(python_module_name _tensor_impl) @@ -65,6 +66,7 @@ set(_no_fast_math_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reduction_over_axis.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp ) foreach(_src_fn ${_no_fast_math_sources}) get_source_file_property(_cmpl_options_prop ${_src_fn} COMPILE_OPTIONS) diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index bab31379b7..209a6d4e56 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -93,6 +93,7 @@ from dpctl.tensor._usmarray import usm_ndarray from dpctl.tensor._utility_functions import all, any +from ._clip import clip from ._constants import e, inf, nan, newaxis, pi from ._elementwise_funcs import ( abs, @@ -322,4 +323,5 @@ "exp2", "copysign", "rsqrt", + "clip", ] diff --git a/dpctl/tensor/_clip.py b/dpctl/tensor/_clip.py new file mode 100644 index 0000000000..5a3a96933f --- /dev/null +++ b/dpctl/tensor/_clip.py @@ -0,0 +1,837 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import dpctl
+import dpctl.tensor as dpt
+import dpctl.tensor._tensor_impl as ti
+from dpctl.tensor._copy_utils import (
+    _empty_like_orderK,
+    _empty_like_pair_orderK,
+    _empty_like_triple_orderK,
+)
+from dpctl.tensor._elementwise_common import (
+    WeakBooleanType,
+    WeakComplexType,
+    WeakFloatingType,
+    WeakIntegralType,
+    _get_dtype,
+    _get_queue_usm_type,
+    _get_shape,
+    _strong_dtype_num_kind,
+    _validate_dtype,
+    _weak_type_num_kind,
+)
+from dpctl.tensor._manipulation_functions import _broadcast_shape_impl
+from dpctl.tensor._type_utils import _can_cast, _to_device_supported_dtype
+from dpctl.utils import ExecutionPlacementError
+
+
+def _resolve_one_strong_two_weak_types(st_dtype, dtype1, dtype2, dev):
+    "Resolves weak data types per NEP-0050, "
+    "where the second and third arguments are "
+    "permitted to be weak types"
+    if isinstance(
+        st_dtype,
+        (
+            WeakBooleanType,
+            WeakIntegralType,
+            WeakFloatingType,
+            WeakComplexType,
+        ),
+    ):
+        raise ValueError
+    if isinstance(
+        dtype1,
+        (WeakBooleanType, WeakIntegralType, WeakFloatingType, WeakComplexType),
+    ):
+        if isinstance(
+            dtype2,
+            (
+                WeakBooleanType,
+                WeakIntegralType,
+                WeakFloatingType,
+                WeakComplexType,
+            ),
+        ):
+            kind_num1 = _weak_type_num_kind(dtype1)
+            kind_num2 = _weak_type_num_kind(dtype2)
+            st_kind_num = _strong_dtype_num_kind(st_dtype)
+
+            if kind_num1 > st_kind_num:
+                if isinstance(dtype1, WeakIntegralType):
+                    ret_dtype1 = dpt.dtype(ti.default_device_int_type(dev))
+                elif isinstance(dtype1, WeakComplexType):
+                    if st_dtype is dpt.float16 or st_dtype is dpt.float32:
+                        ret_dtype1 = dpt.complex64
+                    else:
+                        ret_dtype1 = _to_device_supported_dtype(
+                            dpt.complex128, dev
+                        )
+                else:
+                    ret_dtype1 = _to_device_supported_dtype(dpt.float64, dev)
+            else:
+                ret_dtype1 = st_dtype
+
+            if kind_num2 > st_kind_num:
+                if isinstance(dtype2, WeakIntegralType):
+                    ret_dtype2 = dpt.dtype(ti.default_device_int_type(dev))
+                elif isinstance(dtype2, WeakComplexType):
+                    if st_dtype is dpt.float16 or st_dtype is dpt.float32:
+                        ret_dtype2 = dpt.complex64
+                    else:
+                        ret_dtype2 = _to_device_supported_dtype(
+                            dpt.complex128, dev
+                        )
+                else:
+                    ret_dtype2 = _to_device_supported_dtype(dpt.float64, dev)
+            else:
+                ret_dtype2 = st_dtype
+
+            return ret_dtype1, ret_dtype2
+
+        max_dt_num_kind, max_dtype = max(
+            [
+                (_strong_dtype_num_kind(st_dtype), st_dtype),
+                (_strong_dtype_num_kind(dtype2), dtype2),
+            ]
+        )
+        dt1_kind_num = _weak_type_num_kind(dtype1)
+        if dt1_kind_num > max_dt_num_kind:
+            if isinstance(dtype1, WeakIntegralType):
+                return dpt.dtype(ti.default_device_int_type(dev)), dtype2
+            if isinstance(dtype1, WeakComplexType):
+                if max_dtype is dpt.float16 or max_dtype is dpt.float32:
+                    return dpt.complex64, dtype2
+                return (
+                    _to_device_supported_dtype(dpt.complex128, dev),
+                    dtype2,
+                )
+            return _to_device_supported_dtype(dpt.float64, dev), dtype2
+        else:
+            return max_dtype, dtype2
+    elif isinstance(
+        dtype2,
+        (WeakBooleanType, WeakIntegralType, WeakFloatingType, WeakComplexType),
+    ):
+        max_dt_num_kind, max_dtype = max(
+            [
+                (_strong_dtype_num_kind(st_dtype), st_dtype),
+                (_strong_dtype_num_kind(dtype1), dtype1),
+            ]
+        )
+        dt2_kind_num = _weak_type_num_kind(dtype2)
+        if dt2_kind_num > max_dt_num_kind:
+            if isinstance(dtype2, WeakIntegralType):
+                return dtype1, dpt.dtype(ti.default_device_int_type(dev))
+            if isinstance(dtype2, WeakComplexType):
+                if max_dtype is dpt.float16 or max_dtype is dpt.float32:
+                    return dtype1, dpt.complex64
+                return (
+                    dtype1,
+                    _to_device_supported_dtype(dpt.complex128, dev),
+                )
+            return dtype1, _to_device_supported_dtype(dpt.float64, dev)
+        else:
+            return dtype1, max_dtype
+    else:
+        # both are strong dtypes
+        # return unmodified
+        return dtype1, dtype2
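+
+# Editorial sketch (not part of the change): under the NEP-0050-style rules
+# above, a weak (Python scalar) type only widens the result when its kind
+# exceeds that of the strong dtype. For example, on a device with fp64:
+#
+#     import dpctl.tensor as dpt
+#     x = dpt.ones(5, dtype="i4")
+#     dpt.clip(x, 0, 2)      # weak ints resolve to int32; succeeds
+#     dpt.clip(x, 0.0, 2.5)  # weak float outranks int32 -> float64, which
+#                            # cannot cast safely to int32 -> ValueError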
+
+
+def _resolve_one_strong_one_weak_types(st_dtype, dtype, dev):
+    "Resolves one weak data type with one strong data type per NEP-0050"
+    if isinstance(
+        st_dtype,
+        (WeakBooleanType, WeakIntegralType, WeakFloatingType, WeakComplexType),
+    ):
+        raise ValueError
+    if isinstance(
+        dtype,
+        (WeakBooleanType, WeakIntegralType, WeakFloatingType, WeakComplexType),
+    ):
+        st_kind_num = _strong_dtype_num_kind(st_dtype)
+        kind_num = _weak_type_num_kind(dtype)
+        if kind_num > st_kind_num:
+            if isinstance(dtype, WeakIntegralType):
+                return dpt.dtype(ti.default_device_int_type(dev))
+            if isinstance(dtype, WeakComplexType):
+                if st_dtype is dpt.float16 or st_dtype is dpt.float32:
+                    return dpt.complex64
+                return _to_device_supported_dtype(dpt.complex128, dev)
+            return _to_device_supported_dtype(dpt.float64, dev)
+        else:
+            return st_dtype
+    else:
+        return dtype
+
+
+def _check_clip_dtypes(res_dtype, arg1_dtype, arg2_dtype, sycl_dev):
+    "Checks if both types `arg1_dtype` and `arg2_dtype` can be "
+    "cast to `res_dtype` according to the rule `safe`"
+    if arg1_dtype == res_dtype and arg2_dtype == res_dtype:
+        return None, None, res_dtype
+
+    _fp16 = sycl_dev.has_aspect_fp16
+    _fp64 = sycl_dev.has_aspect_fp64
+    if _can_cast(arg1_dtype, res_dtype, _fp16, _fp64) and _can_cast(
+        arg2_dtype, res_dtype, _fp16, _fp64
+    ):
+        # prevent unnecessary casting
+        ret_buf1_dt = None if res_dtype == arg1_dtype else res_dtype
+        ret_buf2_dt = None if res_dtype == arg2_dtype else res_dtype
+        return ret_buf1_dt, ret_buf2_dt, res_dtype
+    else:
+        return None, None, None
+
+
+def _clip_none(x, val, out, order, _binary_fn):
+    if order not in ["K", "C", "F", "A"]:
+        order = "K"
+    q1, x_usm_type = x.sycl_queue, x.usm_type
+    q2, val_usm_type = _get_queue_usm_type(val)
+    if q2 is None:
+        exec_q = q1
+        res_usm_type = x_usm_type
+    else:
+        exec_q = dpctl.utils.get_execution_queue((q1, q2))
+        if exec_q is None:
+            raise ExecutionPlacementError(
+                "Execution placement can not be unambiguously inferred "
+                "from input arguments."
+            )
+        res_usm_type = dpctl.utils.get_coerced_usm_type(
+            (
+                x_usm_type,
+                val_usm_type,
+            )
+        )
+    dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
+    x_shape = x.shape
+    val_shape = _get_shape(val)
+    if not isinstance(val_shape, (tuple, list)):
+        raise TypeError(
+            "Shape of arguments can not be inferred. "
+            "Arguments are expected to be "
+            "lists, tuples, or both"
+        )
+    try:
+        res_shape = _broadcast_shape_impl(
+            [
+                x_shape,
+                val_shape,
+            ]
+        )
+    except ValueError:
+        raise ValueError(
+            "operands could not be broadcast together with shapes "
+            f"{x_shape} and {val_shape}"
+        )
+    sycl_dev = exec_q.sycl_device
+    x_dtype = x.dtype
+    val_dtype = _get_dtype(val, sycl_dev)
+    if not _validate_dtype(val_dtype):
+        raise ValueError("Operands have unsupported data types")
+
+    val_dtype = _resolve_one_strong_one_weak_types(x_dtype, val_dtype, sycl_dev)
+
+    res_dt = x.dtype
+    _fp16 = sycl_dev.has_aspect_fp16
+    _fp64 = sycl_dev.has_aspect_fp64
+    if not _can_cast(val_dtype, res_dt, _fp16, _fp64):
+        raise ValueError(
+            "function 'clip' does not support input types "
+            f"({x_dtype}, {val_dtype}), "
+            "and the inputs could not be safely coerced to any "
+            "supported types according to the casting rule ''safe''."
+ ) + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + + if out.shape != res_shape: + raise ValueError( + "The shape of input and output arrays are inconsistent. " + f"Expected output shape is {res_shape}, got {out.shape}" + ) + + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, got {out.dtype}" + ) + + if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None: + raise ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + + if ti._array_overlap(x, out): + if not ti._same_logical_tensors(x, out): + out = dpt.empty_like(out) + + if isinstance(val, dpt.usm_ndarray): + if ( + ti._array_overlap(val, out) + and not ti._same_logical_tensors(val, out) + and val_dtype == res_dt + ): + out = dpt.empty_like(out) + + if isinstance(val, dpt.usm_ndarray): + val_ary = val + else: + val_ary = dpt.asarray(val, dtype=val_dtype, sycl_queue=exec_q) + + if val_dtype == res_dt: + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + x, val_ary, res_dt, res_shape, res_usm_type, exec_q + ) + else: + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + x, + val_ary, + ) + ) + else "C" + ) + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + if x_shape != res_shape: + x = dpt.broadcast_to(x, res_shape) + if val_ary.shape != res_shape: + val_ary = dpt.broadcast_to(val_ary, res_shape) + ht_binary_ev, binary_ev = _binary_fn( + src1=x, src2=val_ary, dst=out, sycl_queue=exec_q + ) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + ht_copy_out_ev.wait() + out = orig_out + ht_binary_ev.wait() + return out + else: + if order == "K": + buf = _empty_like_orderK(val_ary, res_dt) + else: + if order == "A": + order = "F" if x.flags.f_contiguous else "C" + buf = dpt.empty_like(val_ary, dtype=res_dt, order=order) + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=val_ary, dst=buf, sycl_queue=exec_q + ) + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + x, buf, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + if x_shape != res_shape: + x = dpt.broadcast_to(x, res_shape) + buf = dpt.broadcast_to(buf, res_shape) + ht_binary_ev, binary_ev = _binary_fn( + src1=x, + src2=buf, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + ht_copy_out_ev.wait() + out = orig_out + ht_copy_ev.wait() + ht_binary_ev.wait() + return out + + +# need to handle logic for min or max being None +def clip(x, min=None, max=None, out=None, order="K"): + """clip(x, min, max, out=None, order="K") + + Clips to the range [`min_i`, `max_i`] for each element `x_i` + in `x`. + + Args: + x (usm_ndarray): Array containing elements to clip. + Must be compatible with `min` and `max` according + to broadcasting rules. 
+        min ({None, usm_ndarray, bool, int, float, complex}, optional):
+            Array or scalar containing minimum values.
+            Must be compatible with `x` and `max` according
+            to broadcasting rules.
+            Only one of `min` and `max` can be `None`.
+        max ({None, usm_ndarray, bool, int, float, complex}, optional):
+            Array or scalar containing maximum values.
+            Must be compatible with `x` and `min` according
+            to broadcasting rules.
+            Only one of `min` and `max` can be `None`.
+        out ({None, usm_ndarray}, optional):
+            Output array to populate.
+            Array must have the correct shape and the expected data type.
+        order ("C","F","A","K", optional):
+            Memory layout of the new output array, if parameter `out` is
+            `None`.
+            Default: "K".
+
+    Returns:
+        usm_ndarray:
+            An array with elements clipped to the range [`min`, `max`].
+            The returned array has the same data type as `x`.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(
+            "Expected `x` to be of dpctl.tensor.usm_ndarray type, got "
+            f"{type(x)}"
+        )
+    if min is None and max is None:
+        raise ValueError(
+            "only one of `min` and `max` is permitted to be `None`"
+        )
+    elif max is None:
+        return _clip_none(x, min, out, order, ti._maximum)
+    elif min is None:
+        return _clip_none(x, max, out, order, ti._minimum)
+    else:
+        q1, x_usm_type = x.sycl_queue, x.usm_type
+        q2, min_usm_type = _get_queue_usm_type(min)
+        q3, max_usm_type = _get_queue_usm_type(max)
+        if q2 is None and q3 is None:
+            exec_q = q1
+            res_usm_type = x_usm_type
+        elif q3 is None:
+            exec_q = dpctl.utils.get_execution_queue((q1, q2))
+            if exec_q is None:
+                raise ExecutionPlacementError(
+                    "Execution placement can not be unambiguously inferred "
+                    "from input arguments."
+                )
+            res_usm_type = dpctl.utils.get_coerced_usm_type(
+                (
+                    x_usm_type,
+                    min_usm_type,
+                )
+            )
+        elif q2 is None:
+            exec_q = dpctl.utils.get_execution_queue((q1, q3))
+            if exec_q is None:
+                raise ExecutionPlacementError(
+                    "Execution placement can not be unambiguously inferred "
+                    "from input arguments."
+                )
+            res_usm_type = dpctl.utils.get_coerced_usm_type(
+                (
+                    x_usm_type,
+                    max_usm_type,
+                )
+            )
+        else:
+            exec_q = dpctl.utils.get_execution_queue((q1, q2, q3))
+            if exec_q is None:
+                raise ExecutionPlacementError(
+                    "Execution placement can not be unambiguously inferred "
+                    "from input arguments."
+                )
+            res_usm_type = dpctl.utils.get_coerced_usm_type(
+                (
+                    x_usm_type,
+                    min_usm_type,
+                    max_usm_type,
+                )
+            )
+        dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
+        x_shape = x.shape
+        min_shape = _get_shape(min)
+        max_shape = _get_shape(max)
+        if not all(
+            isinstance(s, (tuple, list))
+            for s in (
+                min_shape,
+                max_shape,
+            )
+        ):
+            raise TypeError(
+                "Shape of arguments can not be inferred. "
+                "Arguments are expected to be "
+                "lists, tuples, or both"
+            )
" + "Arguments are expected to be " + "lists, tuples, or both" + ) + try: + res_shape = _broadcast_shape_impl( + [ + x_shape, + min_shape, + max_shape, + ] + ) + except ValueError: + raise ValueError( + "operands could not be broadcast together with shapes " + f"{x_shape}, {min_shape}, and {max_shape}" + ) + sycl_dev = exec_q.sycl_device + x_dtype = x.dtype + min_dtype = _get_dtype(min, sycl_dev) + max_dtype = _get_dtype(max, sycl_dev) + if not all(_validate_dtype(o) for o in (min_dtype, max_dtype)): + raise ValueError("Operands have unsupported data types") + + min_dtype, max_dtype = _resolve_one_strong_two_weak_types( + x_dtype, min_dtype, max_dtype, sycl_dev + ) + + buf1_dt, buf2_dt, res_dt = _check_clip_dtypes( + x_dtype, + min_dtype, + max_dtype, + sycl_dev, + ) + + if res_dt is None: + raise ValueError( + f"function '{clip}' does not support input types " + f"({x_dtype}, {min_dtype}, {max_dtype}), " + "and the inputs could not be safely coerced to any " + "supported types according to the casting rule ''safe''." + ) + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + "output array must be of usm_ndarray type, got " + f"{type(out)}" + ) + + if out.shape != res_shape: + raise ValueError( + "The shape of input and output arrays are " + f"inconsistent. Expected output shape is {res_shape}, " + f"got {out.shape}" + ) + + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, " + f"got {out.dtype}" + ) + + if ( + dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) + is None + ): + raise ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + + if ti._array_overlap(x, out): + if not ti._same_logical_tensors(x, out): + out = dpt.empty_like(out) + + if isinstance(min, dpt.usm_ndarray): + if ( + ti._array_overlap(min, out) + and not ti._same_logical_tensors(min, out) + and buf1_dt is None + ): + out = dpt.empty_like(out) + + if isinstance(max, dpt.usm_ndarray): + if ( + ti._array_overlap(max, out) + and not ti._same_logical_tensors(max, out) + and buf2_dt is None + ): + out = dpt.empty_like(out) + + if isinstance(min, dpt.usm_ndarray): + a_min = min + else: + a_min = dpt.asarray(min, dtype=min_dtype, sycl_queue=exec_q) + if isinstance(max, dpt.usm_ndarray): + a_max = max + else: + a_max = dpt.asarray(max, dtype=max_dtype, sycl_queue=exec_q) + + if buf1_dt is None and buf2_dt is None: + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + x, + a_min, + a_max, + res_dt, + res_shape, + res_usm_type, + exec_q, + ) + else: + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + x, + a_min, + a_max, + ) + ) + else "C" + ) + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + if x_shape != res_shape: + x = dpt.broadcast_to(x, res_shape) + if a_min.shape != res_shape: + a_min = dpt.broadcast_to(a_min, res_shape) + if a_max.shape != res_shape: + a_max = dpt.broadcast_to(a_max, res_shape) + ht_binary_ev, binary_ev = ti._clip( + src=x, min=a_min, max=a_max, dst=out, sycl_queue=exec_q + ) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + ht_copy_out_ev.wait() + out = orig_out + ht_binary_ev.wait() + return out + + elif buf1_dt is None: + if order == "K": + buf2 = 
_empty_like_orderK(a_max, buf2_dt) + else: + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + x, + a_min, + ) + ) + else "C" + ) + buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order) + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_max, dst=buf2, sycl_queue=exec_q + ) + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + x, + a_min, + buf2, + res_dt, + res_shape, + res_usm_type, + exec_q, + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + x = dpt.broadcast_to(x, res_shape) + if a_min.shape != res_shape: + a_min = dpt.broadcast_to(a_min, res_shape) + buf2 = dpt.broadcast_to(buf2, res_shape) + ht_binary_ev, binary_ev = ti._clip( + src=x, + min=a_min, + max=buf2, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + ht_copy_out_ev.wait() + out = orig_out + ht_copy_ev.wait() + ht_binary_ev.wait() + return out + + elif buf2_dt is None: + if order == "K": + buf1 = _empty_like_orderK(a_min, buf1_dt) + else: + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + x, + a_max, + ) + ) + else "C" + ) + buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order) + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_min, dst=buf1, sycl_queue=exec_q + ) + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + x, + buf1, + a_max, + res_dt, + res_shape, + res_usm_type, + exec_q, + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + x = dpt.broadcast_to(x, res_shape) + buf1 = dpt.broadcast_to(buf1, res_shape) + if a_max.shape != res_shape: + a_max = dpt.broadcast_to(a_max, res_shape) + ht_binary_ev, binary_ev = ti._clip( + src=x, + min=buf1, + max=a_max, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + ht_copy_out_ev.wait() + out = orig_out + ht_copy_ev.wait() + ht_binary_ev.wait() + return out + + if order in ["K", "A"]: + if ( + x.flags.f_contiguous + and a_min.flags.f_contiguous + and a_max.flags.f_contiguous + ): + order = "F" + elif ( + x.flags.c_contiguous + and a_min.flags.c_contiguous + and a_max.flags.c_contiguous + ): + order = "C" + else: + order = "C" if order == "A" else "K" + if order == "K": + buf1 = _empty_like_orderK(a_min, buf1_dt) + else: + buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order) + ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_min, dst=buf1, sycl_queue=exec_q + ) + if order == "K": + buf2 = _empty_like_orderK(a_max, buf2_dt) + else: + buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order) + ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_max, dst=buf2, sycl_queue=exec_q + ) + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + x, buf1, buf2, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + 
+        x = dpt.broadcast_to(x, res_shape)
+        buf1 = dpt.broadcast_to(buf1, res_shape)
+        buf2 = dpt.broadcast_to(buf2, res_shape)
+        ht_, _ = ti._clip(
+            src=x,
+            min=buf1,
+            max=buf2,
+            dst=out,
+            sycl_queue=exec_q,
+            depends=[copy1_ev, copy2_ev],
+        )
+        dpctl.SyclEvent.wait_for([ht_copy1_ev, ht_copy2_ev, ht_])
+        return out
diff --git a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py
index fca5b0734a..baaac078b5 100644
--- a/dpctl/tensor/_elementwise_common.py
+++ b/dpctl/tensor/_elementwise_common.py
@@ -649,12 +649,7 @@ def __call__(self, o1, o2, out=None, order="K"):
                         sycl_queue=exec_q,
                         order=order,
                     )
-            else:
-                if res_dt != out.dtype:
-                    raise TypeError(
-                        f"Output array of type {res_dt} is needed,"
-                        f"got {out.dtype}"
-                    )
+
             if src1.shape != res_shape:
                 src1 = dpt.broadcast_to(src1, res_shape)
             buf2 = dpt.broadcast_to(buf2, res_shape)
diff --git a/dpctl/tensor/libtensor/include/kernels/clip.hpp b/dpctl/tensor/libtensor/include/kernels/clip.hpp
new file mode 100644
index 0000000000..9cca9f615b
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/clip.hpp
@@ -0,0 +1,311 @@
+//=== clip.hpp - Implementation of clip kernels ---*-C++-*--/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for dpctl.tensor.clip.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include "pybind11/numpy.h"
+#include "pybind11/stl.h"
+#include <CL/sycl.hpp>
+#include <algorithm>
+#include <complex>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include "utils/math_utils.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace kernels
+{
+namespace clip
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using namespace dpctl::tensor::offset_utils;
+
+template <typename T> T clip(const T &x, const T &min, const T &max)
+{
+    using dpctl::tensor::type_utils::is_complex;
+    if constexpr (is_complex<T>::value) {
+        using dpctl::tensor::math_utils::max_complex;
+        using dpctl::tensor::math_utils::min_complex;
+        return min_complex(max_complex(x, min), max);
+    }
+    else if constexpr (std::is_floating_point_v<T> ||
+                       std::is_same_v<T, sycl::half>)
+    {
+        auto tmp = (std::isnan(x) || x > min) ? x : min;
+        return (std::isnan(tmp) || tmp < max) ? tmp : max;
+    }
+    else if constexpr (std::is_same_v<T, bool>) {
+        return (x || min) && max;
+    }
+    else {
+        auto tmp = (x > min) ? x : min;
+        return (tmp < max) ? tmp : max;
+    }
+}
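+
+// Editorial sketch (not part of the kernel): for floating-point types the
+// two ternaries above propagate NaNs coming from `x`:
+//
+//     clip(NaN, lo, hi) -> NaN  (isnan(x) selects x; isnan(tmp) keeps it)
+//     clip(x, lo, hi) with hi < lo -> hi  (the max bound is applied last)
+//
+// while for integral types the branch reduces to min(max(x, min), max).
+// The hi < lo behavior is exercised by test_clip_max_less_than_min below.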
+
+template <typename T, int vec_sz = 4, int n_vecs = 2> class ClipContigFunctor
+{
+private:
+    size_t nelems = 0;
+    const T *x_p = nullptr;
+    const T *min_p = nullptr;
+    const T *max_p = nullptr;
+    T *dst_p = nullptr;
+
+public:
+    ClipContigFunctor(size_t nelems_,
+                      const T *x_p_,
+                      const T *min_p_,
+                      const T *max_p_,
+                      T *dst_p_)
+        : nelems(nelems_), x_p(x_p_), min_p(min_p_), max_p(max_p_),
+          dst_p(dst_p_)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> ndit) const
+    {
+        using dpctl::tensor::type_utils::is_complex;
+        if constexpr (is_complex<T>::value) {
+            std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0];
+            size_t base = ndit.get_global_linear_id();
+
+            base = (base / sgSize) * sgSize * n_vecs * vec_sz + (base % sgSize);
+            for (size_t offset = base;
+                 offset < std::min(nelems, base + sgSize * (n_vecs * vec_sz));
+                 offset += sgSize)
+            {
+                dst_p[offset] = clip(x_p[offset], min_p[offset], max_p[offset]);
+            }
+        }
+        else {
+            auto sg = ndit.get_sub_group();
+            std::uint8_t sgSize = sg.get_local_range()[0];
+            std::uint8_t max_sgSize = sg.get_max_local_range()[0];
+            size_t base = n_vecs * vec_sz *
+                          (ndit.get_group(0) * ndit.get_local_range(0) +
+                           sg.get_group_id()[0] * max_sgSize);
+
+            if (base + n_vecs * vec_sz * sgSize < nelems &&
+                sgSize == max_sgSize) {
+                sycl::vec<T, vec_sz> x_vec;
+                sycl::vec<T, vec_sz> min_vec;
+                sycl::vec<T, vec_sz> max_vec;
+                sycl::vec<T, vec_sz> dst_vec;
+#pragma unroll
+                for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
+                    auto idx = base + it * sgSize;
+                    auto x_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&x_p[idx]);
+                    auto min_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&min_p[idx]);
+                    auto max_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&max_p[idx]);
+                    auto dst_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&dst_p[idx]);
+
+                    x_vec = sg.load<vec_sz>(x_multi_ptr);
+                    min_vec = sg.load<vec_sz>(min_multi_ptr);
+                    max_vec = sg.load<vec_sz>(max_multi_ptr);
+#pragma unroll
+                    for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) {
+                        dst_vec[vec_id] = clip(x_vec[vec_id], min_vec[vec_id],
+                                               max_vec[vec_id]);
+                    }
+                    sg.store<vec_sz>(dst_multi_ptr, dst_vec);
+                }
+            }
+            else {
+                for (size_t k = base + sg.get_local_id()[0]; k < nelems;
+                     k += sgSize) {
+                    dst_p[k] = clip(x_p[k], min_p[k], max_p[k]);
+                }
+            }
+        }
+    }
+};
+
+template <typename T, int vec_sz, int n_vecs> class clip_contig_kernel;
+
+typedef sycl::event (*clip_contig_impl_fn_ptr_t)(
+    sycl::queue &,
+    size_t,
+    const char *,
+    const char *,
+    const char *,
+    char *,
+    const std::vector<sycl::event> &);
+
+template <typename T>
+sycl::event clip_contig_impl(sycl::queue &q,
+                             size_t nelems,
+                             const char *x_cp,
+                             const char *min_cp,
+                             const char *max_cp,
+                             char *dst_cp,
+                             const std::vector<sycl::event> &depends)
+{
+    const T *x_tp = reinterpret_cast<const T *>(x_cp);
+    const T *min_tp = reinterpret_cast<const T *>(min_cp);
+    const T *max_tp = reinterpret_cast<const T *>(max_cp);
+    T *dst_tp = reinterpret_cast<T *>(dst_cp);
+
+    sycl::event clip_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        size_t lws = 64;
+        constexpr unsigned int vec_sz = 4;
+        constexpr unsigned int n_vecs = 2;
+        const size_t n_groups =
+            ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
+        const auto gws_range = sycl::range<1>(n_groups * lws);
+        const auto lws_range = sycl::range<1>(lws);
+
+        cgh.parallel_for<clip_contig_kernel<T, vec_sz, n_vecs>>(
+            sycl::nd_range<1>(gws_range, lws_range),
+            ClipContigFunctor<T, vec_sz, n_vecs>(nelems, x_tp, min_tp, max_tp,
+                                                 dst_tp));
+    });
+
+    return clip_ev;
+}
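+
+// Editorial note: with lws = 64, vec_sz = 4, and n_vecs = 2 above, each
+// work-item covers n_vecs * vec_sz = 8 elements, so one work-group spans
+// 64 * 8 = 512 elements and n_groups = ceil(nelems / 512); e.g. nelems = 1026
+// (as in test_clip_basic) launches 3 groups, with partial sub-group tails
+// handled by the scalar loop in the functor.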
+
+template <typename T, typename IndexerT> class ClipStridedFunctor
+{
+private:
+    const T *x_p = nullptr;
+    const T *min_p = nullptr;
+    const T *max_p = nullptr;
+    T *dst_p = nullptr;
+    IndexerT indexer;
+
+public:
+    ClipStridedFunctor(const T *x_p_,
+                       const T *min_p_,
+                       const T *max_p_,
+                       T *dst_p_,
+                       IndexerT indexer_)
+        : x_p(x_p_), min_p(min_p_), max_p(max_p_), dst_p(dst_p_),
+          indexer(indexer_)
+    {
+    }
+
+    void operator()(sycl::id<1> id) const
+    {
+        size_t gid = id[0];
+        auto offsets = indexer(static_cast<py::ssize_t>(gid));
+        dst_p[offsets.get_fourth_offset()] = clip(
+            x_p[offsets.get_first_offset()], min_p[offsets.get_second_offset()],
+            max_p[offsets.get_third_offset()]);
+    }
+};
+
+template <typename T, typename IndexerT> class clip_strided_kernel;
+
+typedef sycl::event (*clip_strided_impl_fn_ptr_t)(
+    sycl::queue &,
+    size_t,
+    int,
+    const char *,
+    const char *,
+    const char *,
+    char *,
+    const py::ssize_t *,
+    py::ssize_t,
+    py::ssize_t,
+    py::ssize_t,
+    py::ssize_t,
+    const std::vector<sycl::event> &);
+
+template <typename T>
+sycl::event clip_strided_impl(sycl::queue &q,
+                              size_t nelems,
+                              int nd,
+                              const char *x_cp,
+                              const char *min_cp,
+                              const char *max_cp,
+                              char *dst_cp,
+                              const py::ssize_t *shape_strides,
+                              py::ssize_t x_offset,
+                              py::ssize_t min_offset,
+                              py::ssize_t max_offset,
+                              py::ssize_t dst_offset,
+                              const std::vector<sycl::event> &depends)
+{
+    const T *x_tp = reinterpret_cast<const T *>(x_cp);
+    const T *min_tp = reinterpret_cast<const T *>(min_cp);
+    const T *max_tp = reinterpret_cast<const T *>(max_cp);
+    T *dst_tp = reinterpret_cast<T *>(dst_cp);
+
+    sycl::event clip_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        FourOffsets_StridedIndexer indexer{
+            nd, x_offset, min_offset, max_offset, dst_offset, shape_strides};
+
+        cgh.parallel_for<clip_strided_kernel<T, FourOffsets_StridedIndexer>>(
+            sycl::range<1>(nelems),
+            ClipStridedFunctor<T, FourOffsets_StridedIndexer>(
+                x_tp, min_tp, max_tp, dst_tp, indexer));
+    });
+
+    return clip_ev;
+}
+
+template <typename fnT, typename T> struct ClipStridedFactory
+{
+    fnT get()
+    {
+        fnT fn = clip_strided_impl<T>;
+        return fn;
+    }
+};
+
+template <typename fnT, typename T> struct ClipContigFactory
+{
+    fnT get()
+    {
+        fnT fn = clip_contig_impl<T>;
+        return fn;
+    }
+};
+
+} // namespace clip
+} // namespace kernels
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/clip.cpp b/dpctl/tensor/libtensor/source/clip.cpp
new file mode 100644
index 0000000000..ac494c19ae
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/clip.cpp
@@ -0,0 +1,269 @@
+//===-- clip.cpp - Implementation of clip --*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines Python API for implementation functions of
+/// dpctl.tensor.clip
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <complex>
+#include <cstdint>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <utility>
+#include <vector>
+
+#include "clip.hpp"
+#include "kernels/clip.hpp"
+#include "simplify_iteration_space.hpp"
+#include "utils/memory_overlap.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::kernels::clip::clip_contig_impl_fn_ptr_t;
+using dpctl::tensor::kernels::clip::clip_strided_impl_fn_ptr_t;
+
+static clip_contig_impl_fn_ptr_t clip_contig_dispatch_vector[td_ns::num_types];
+static clip_strided_impl_fn_ptr_t
+    clip_strided_dispatch_vector[td_ns::num_types];
+
+void init_clip_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    using dpctl::tensor::kernels::clip::ClipContigFactory;
+    DispatchVectorBuilder<clip_contig_impl_fn_ptr_t, ClipContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(clip_contig_dispatch_vector);
+
+    using dpctl::tensor::kernels::clip::ClipStridedFactory;
+    DispatchVectorBuilder<clip_strided_impl_fn_ptr_t, ClipStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(clip_strided_dispatch_vector);
+}
+
+using dpctl::utils::keep_args_alive;
+
+std::pair<sycl::event, sycl::event>
+py_clip(const dpctl::tensor::usm_ndarray &src,
+        const dpctl::tensor::usm_ndarray &min,
+        const dpctl::tensor::usm_ndarray &max,
+        const dpctl::tensor::usm_ndarray &dst,
+        sycl::queue &exec_q,
+        const std::vector<sycl::event> &depends)
+{
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, min, max, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    int nd = src.get_ndim();
+    int min_nd = min.get_ndim();
+    int max_nd = max.get_ndim();
+    int dst_nd = dst.get_ndim();
+
+    if (nd != min_nd || nd != max_nd) {
+        throw py::value_error(
+            "Input arrays are not of appropriate dimension for clip kernel.");
+    }
+
+    if (nd != dst_nd) {
+        throw py::value_error(
+            "Destination is not of appropriate dimension for clip kernel.");
+    }
+
+    const py::ssize_t *src_shape = src.get_shape_raw();
+    const py::ssize_t *min_shape = min.get_shape_raw();
+    const py::ssize_t *max_shape = max.get_shape_raw();
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+
+    bool shapes_equal(true);
+    size_t nelems(1);
+    for (int i = 0; i < nd; ++i) {
+        const auto &sh_i = dst_shape[i];
+        nelems *= static_cast<size_t>(sh_i);
+        shapes_equal = shapes_equal && (min_shape[i] == sh_i) &&
+                       (max_shape[i] == sh_i) && (src_shape[i] == sh_i);
+    }
+
+    if (!shapes_equal) {
+        throw py::value_error("Arrays are not of matching shapes.");
+    }
+
+    if (nelems == 0) {
+        return std::make_pair(sycl::event{}, sycl::event{});
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    auto const &same_logical_tensors =
+        dpctl::tensor::overlap::SameLogicalTensors();
+    if ((overlap(dst, src) && !same_logical_tensors(dst, src)) ||
+        (overlap(dst, min) && !same_logical_tensors(dst, min)) ||
+        (overlap(dst, max) && !same_logical_tensors(dst, max)))
+    {
+        throw py::value_error("Destination array overlaps with input.");
+    }
+
+    int min_typenum = min.get_typenum();
+    int max_typenum = max.get_typenum();
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    auto const &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int min_typeid = array_types.typenum_to_lookup_id(min_typenum);
+    int max_typeid = array_types.typenum_to_lookup_id(max_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    if (src_typeid != dst_typeid || src_typeid != min_typeid ||
+        src_typeid != max_typeid)
+    {
+        throw py::value_error("Input, min, max, and destination arrays must "
+                              "have the same data type");
+    }
+
+    // ensure that dst is sufficiently ample
+    auto dst_offsets = dst.get_minmax_offsets();
+    // destination must be ample enough to accommodate all elements
+    {
+        size_t range =
+            static_cast<size_t>(dst_offsets.second - dst_offsets.first);
+        if (range + 1 < static_cast<size_t>(nelems)) {
+            throw py::value_error(
+                "Memory addressed by the destination array can not "
+                "accommodate all the "
+                "array elements.");
+        }
+    }
+
+    char *src_data = src.get_data();
+    char *min_data = min.get_data();
+    char *max_data = max.get_data();
+    char *dst_data = dst.get_data();
+
+    bool is_min_c_contig = min.is_c_contiguous();
+    bool is_min_f_contig = min.is_f_contiguous();
+
+    bool is_max_c_contig = max.is_c_contiguous();
+    bool is_max_f_contig = max.is_f_contiguous();
+
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_src_f_contig = src.is_f_contiguous();
+
+    bool is_dst_c_contig = dst.is_c_contiguous();
+    bool is_dst_f_contig = dst.is_f_contiguous();
+
+    bool all_c_contig = (is_min_c_contig && is_max_c_contig &&
+                         is_src_c_contig && is_dst_c_contig);
+    bool all_f_contig = (is_min_f_contig && is_max_f_contig &&
+                         is_src_f_contig && is_dst_f_contig);
+
+    if (all_c_contig || all_f_contig) {
+        auto fn = clip_contig_dispatch_vector[src_typeid];
+
+        sycl::event clip_ev =
+            fn(exec_q, nelems, src_data, min_data, max_data, dst_data, depends);
+        sycl::event ht_ev =
+            keep_args_alive(exec_q, {src, min, max, dst}, {clip_ev});
+
+        return std::make_pair(ht_ev, clip_ev);
+    }
+
+    auto const &src_strides = src.get_strides_vector();
+    auto const &min_strides = min.get_strides_vector();
+    auto const &max_strides = max.get_strides_vector();
+    auto const &dst_strides = dst.get_strides_vector();
+
+    using shT = std::vector<py::ssize_t>;
+    shT simplified_shape;
+    shT simplified_src_strides;
+    shT simplified_min_strides;
+    shT simplified_max_strides;
+    shT simplified_dst_strides;
+    py::ssize_t src_offset(0);
+    py::ssize_t min_offset(0);
+    py::ssize_t max_offset(0);
+    py::ssize_t dst_offset(0);
+
+    dpctl::tensor::py_internal::simplify_iteration_space_4(
+        nd, src_shape, src_strides, min_strides, max_strides, dst_strides,
+        // outputs
+        simplified_shape, simplified_src_strides, simplified_min_strides,
+        simplified_max_strides, simplified_dst_strides, src_offset, min_offset,
+        max_offset, dst_offset);
+
+    auto fn = clip_strided_dispatch_vector[src_typeid];
+
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(2);
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events,
+        // common shape and strides
+        simplified_shape, simplified_src_strides, simplified_min_strides,
+        simplified_max_strides, simplified_dst_strides);
+    py::ssize_t *packed_shape_strides = std::get<0>(ptr_size_event_tuple);
+    sycl::event copy_shape_strides_ev = std::get<2>(ptr_size_event_tuple);
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+    all_deps.push_back(copy_shape_strides_ev);
+
+    assert(all_deps.size() == depends.size() + 1);
+
+    sycl::event clip_ev = fn(exec_q, nelems, nd, src_data, min_data, max_data,
+                             dst_data, packed_shape_strides, src_offset,
+                             min_offset, max_offset, dst_offset, all_deps);
+
+    // free packed temporaries
+    sycl::event temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(clip_ev);
+        const auto &ctx = exec_q.get_context();
+        cgh.host_task([packed_shape_strides, ctx]() {
+            sycl::free(packed_shape_strides, ctx);
+        });
+    });
+
+    host_task_events.push_back(temporaries_cleanup_ev);
+
+    sycl::event arg_cleanup_ev =
+        keep_args_alive(exec_q, {src, min, max, dst}, host_task_events);
+
+    return std::make_pair(arg_cleanup_ev, clip_ev);
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/clip.hpp b/dpctl/tensor/libtensor/source/clip.hpp
new file mode 100644
index 0000000000..d4b8af2cf5
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/clip.hpp
@@ -0,0 +1,52 @@
+//===-- clip.hpp - --*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares Python API for implementation functions of
+/// dpctl.tensor.clip
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <CL/sycl.hpp>
+#include <utility>
+#include <vector>
+
+#include "dpctl4pybind11.hpp"
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern std::pair<sycl::event, sycl::event>
+py_clip(const dpctl::tensor::usm_ndarray &src,
+        const dpctl::tensor::usm_ndarray &min,
+        const dpctl::tensor::usm_ndarray &max,
+        const dpctl::tensor::usm_ndarray &dst,
+        sycl::queue &exec_q,
+        const std::vector<sycl::event> &depends);
+
+extern void init_clip_dispatch_vectors(void);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp
index 0e8b4236b6..cc47ed1727 100644
--- a/dpctl/tensor/libtensor/source/tensor_py.cpp
+++ b/dpctl/tensor/libtensor/source/tensor_py.cpp
@@ -37,6 +37,7 @@
 #include "accumulators.hpp"
 #include "boolean_advanced_indexing.hpp"
 #include "boolean_reductions.hpp"
+#include "clip.hpp"
 #include "copy_and_cast_usm_to_usm.hpp"
 #include "copy_for_reshape.hpp"
 #include "copy_for_roll.hpp"
@@ -116,6 +117,9 @@ using dpctl::tensor::py_internal::usm_ndarray_triul;
 
 using dpctl::tensor::py_internal::py_where;
 
+/* =========================== Clip ============================== */
+using dpctl::tensor::py_internal::py_clip;
+
 // populate dispatch tables
 void init_dispatch_tables(void)
 {
@@ -148,6 +152,8 @@ void init_dispatch_vectors(void)
     populate_cumsum_1d_dispatch_vectors();
     init_repeat_dispatch_vectors();
 
+    init_clip_dispatch_vectors();
+
     return;
 }
 
@@ -441,6 +447,14 @@ PYBIND11_MODULE(_tensor_impl, m)
           py::arg("reps"), py::arg("axis"), py::arg("sycl_queue"),
           py::arg("depends") = py::list());
 
+    m.def("_clip", &py_clip,
+          "Clamps elements of array `x` to the range "
+          "[`min`, `max`] and writes the
result to the " + "array `dst` for each element of `x`, `min`, and `max`." + "Returns a tuple of events: (hev, ev)", + py::arg("src"), py::arg("min"), py::arg("max"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + dpctl::tensor::py_internal::init_elementwise_functions(m); dpctl::tensor::py_internal::init_boolean_reduction_functions(m); dpctl::tensor::py_internal::init_reduction_functions(m); diff --git a/dpctl/tests/test_tensor_clip.py b/dpctl/tests/test_tensor_clip.py new file mode 100644 index 0000000000..7050b17e7c --- /dev/null +++ b/dpctl/tests/test_tensor_clip.py @@ -0,0 +1,627 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +from helper import get_queue_or_skip, skip_if_dtype_not_supported +from numpy.testing import assert_raises_regex + +import dpctl +import dpctl.tensor as dpt +from dpctl.tensor._type_utils import _can_cast +from dpctl.utils import ExecutionPlacementError + +_all_dtypes = [ + "?", + "u1", + "i1", + "u2", + "i2", + "u4", + "i4", + "u8", + "i8", + "e", + "f", + "d", + "F", + "D", +] + +_usm_types = ["device", "shared", "host"] + + +@pytest.mark.parametrize("dt1", _all_dtypes) +@pytest.mark.parametrize("dt2", _all_dtypes) +def test_clip_dtypes(dt1, dt2): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt1, q) + skip_if_dtype_not_supported(dt2, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=dt1, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=dt1, sycl_queue=q) + ar3 = dpt.ones_like(ar1, dtype=dt2, sycl_queue=q) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + # also covers cases where dt1 == dt2 + if _can_cast(ar3.dtype, ar1.dtype, _fp16, _fp64): + r = dpt.clip(ar1, ar2, ar3) + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == ar1.dtype + assert r.shape == ar1.shape + assert dpt.all(r == ar1) + assert r.sycl_queue == ar1.sycl_queue + + r = dpt.clip(ar1, min=ar3, max=None) + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == ar1.dtype + assert r.shape == ar1.shape + assert dpt.all(r == ar1) + assert r.sycl_queue == ar1.sycl_queue + + r = dpt.clip(ar1, min=None, max=ar3) + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == ar1.dtype + assert r.shape == ar1.shape + assert dpt.all(r == ar1) + assert r.sycl_queue == ar1.sycl_queue + else: + with pytest.raises(ValueError): + dpt.clip(ar1, ar2, ar3) + with pytest.raises(ValueError): + dpt.clip(ar1, min=ar3, max=None) + with pytest.raises(ValueError): + dpt.clip(ar1, min=None, max=ar3) + + +def test_clip_empty(): + get_queue_or_skip() + + x = dpt.empty((2, 0, 3), dtype="i4") + a_min = dpt.ones((2, 0, 3), dtype="i4") + a_max = dpt.ones((2, 0, 3), dtype="i4") + + r = dpt.clip(x, a_min, a_max) + assert r.size == 0 + assert r.shape == x.shape + + +def test_clip_python_scalars(): + get_queue_or_skip() + + arrs = [ + dpt.ones(1, dtype="?"), + dpt.ones(1, dtype="i4"), + dpt.ones(1, dtype="f4"), + dpt.ones(1, dtype="c8"), + ] + + 
+    py_zeros = [
+        False,
+        0,
+        0.0,
+        complex(0, 0),
+    ]
+
+    py_ones = [
+        True,
+        1,
+        1.0,
+        complex(1, 0),
+    ]
+
+    for zero, one, arr in zip(py_zeros, py_ones, arrs):
+        r = dpt.clip(arr, zero, one)
+        assert isinstance(r, dpt.usm_ndarray)
+        r = dpt.clip(arr, min=zero)
+        assert isinstance(r, dpt.usm_ndarray)
+
+
+def test_clip_in_place():
+    get_queue_or_skip()
+
+    x = dpt.arange(10, dtype="i4")
+    a_min = dpt.arange(1, 11, dtype="i4")
+    a_max = dpt.arange(2, 12, dtype="i4")
+    dpt.clip(x, a_min, a_max, out=x)
+    assert dpt.all(x == a_min)
+
+    x = dpt.arange(10, dtype="i4")
+    dpt.clip(x, min=a_min, max=None, out=x)
+    assert dpt.all(x == a_min)
+
+    x = dpt.arange(10, dtype="i4")
+    dpt.clip(x, a_min, a_max, out=a_max)
+    assert dpt.all(a_max == a_min)
+
+    a_min = dpt.arange(1, 11, dtype="i4")
+    dpt.clip(x, min=a_min, max=None, out=a_min[::-1])
+    assert dpt.all((x + 1)[::-1] == a_min)
+
+
+def test_clip_special_cases():
+    get_queue_or_skip()
+
+    x = dpt.arange(10, dtype="f4")
+    r = dpt.clip(x, -dpt.inf, dpt.inf)
+    assert dpt.all(r == x)
+    r = dpt.clip(x, dpt.nan, dpt.inf)
+    assert dpt.all(dpt.isnan(r))
+    r = dpt.clip(x, -dpt.inf, dpt.nan)
+    assert dpt.all(dpt.isnan(r))
+
+
+def test_clip_out_need_temporary():
+    get_queue_or_skip()
+
+    x = dpt.ones(10, dtype="i4")
+    a_min = dpt.asarray(2, dtype="i4")
+    a_max = dpt.asarray(3, dtype="i4")
+    dpt.clip(x[:6], a_min, a_max, out=x[-6:])
+    assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2)
+
+    x = dpt.ones(10, dtype="i4")
+    a_min = dpt.asarray(2, dtype="i4")
+    a_max = dpt.asarray(3, dtype="i2")
+    dpt.clip(x[:6], a_min, a_max, out=x[-6:])
+    assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2)
+
+    x = dpt.ones(10, dtype="i4")
+    a_min = dpt.asarray(2, dtype="i2")
+    a_max = dpt.asarray(3, dtype="i4")
+    dpt.clip(x[:6], a_min, a_max, out=x[-6:])
+    assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2)
+
+    x = dpt.ones(10, dtype="i4")
+    a_min = dpt.asarray(2, dtype="i2")
+    a_max = dpt.asarray(3, dtype="i1")
+    dpt.clip(x[:6], a_min, a_max, out=x[-6:])
+    assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2)
+
+    x = dpt.full(6, 3, dtype="i4")
+    a_min = dpt.full(10, 2, dtype="i4")
+    a_max = dpt.asarray(4, dtype="i4")
+    dpt.clip(x, min=a_min[:6], max=a_max, out=a_min[-6:])
+    assert dpt.all(a_min[:-6] == 2) and dpt.all(a_min[-6:] == 3)
+
+    x = dpt.full(6, 3, dtype="i4")
+    a_min = dpt.full(10, 2, dtype="i4")
+    a_max = dpt.asarray(4, dtype="i2")
+    dpt.clip(x, min=a_min[:6], max=a_max, out=a_min[-6:])
+    assert dpt.all(a_min[:-6] == 2) and dpt.all(a_min[-6:] == 3)
+
+
+def test_clip_out_need_temporary_none():
+    get_queue_or_skip()
+
+    x = dpt.full(6, 3, dtype="i4")
+    # with min/max == None
+    a_min = dpt.full(10, 2, dtype="i4")
+    dpt.clip(x, min=a_min[:6], max=None, out=a_min[-6:])
+    assert dpt.all(a_min[:-6] == 2) and dpt.all(a_min[-6:] == 3)
+
+
+def test_clip_arg_validation():
+    get_queue_or_skip()
+
+    check = dict()
+    x1 = dpt.empty((1,), dtype="i4")
+    x2 = dpt.empty((1,), dtype="i4")
+
+    with pytest.raises(TypeError):
+        dpt.clip(check, x1, x2)
+
+
+@pytest.mark.parametrize(
+    "dt1,dt2", [("i4", "i4"), ("i4", "i2"), ("i2", "i4"), ("i1", "i2")]
+)
+def test_clip_order(dt1, dt2):
+    get_queue_or_skip()
+
+    test_shape = (
+        20,
+        20,
+    )
+    test_shape2 = tuple(2 * dim for dim in test_shape)
+    n = test_shape[-1]
+
+    ar1 = dpt.ones(test_shape, dtype="i4", order="C")
+    ar2 = dpt.ones(test_shape, dtype=dt1, order="C")
+    ar3 = dpt.ones(test_shape, dtype=dt2, order="C")
+    r1 = dpt.clip(ar1, ar2, ar3, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.clip(ar1, ar2, ar3, order="F")
+    assert r2.flags.f_contiguous
+ r3 = dpt.clip(ar1, ar2, ar3, order="A") + assert r3.flags.c_contiguous + r4 = dpt.clip(ar1, ar2, ar3, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones(test_shape, dtype="i4", order="F") + ar2 = dpt.ones(test_shape, dtype=dt1, order="F") + ar3 = dpt.ones(test_shape, dtype=dt2, order="F") + r1 = dpt.clip(ar1, ar2, ar3, order="C") + assert r1.flags.c_contiguous + r2 = dpt.clip(ar1, ar2, ar3, order="F") + assert r2.flags.f_contiguous + r3 = dpt.clip(ar1, ar2, ar3, order="A") + assert r3.flags.f_contiguous + r4 = dpt.clip(ar1, ar2, ar3, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2] + ar3 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2] + r4 = dpt.clip(ar1, ar2, ar3, order="K") + assert r4.strides == (n, -1) + r5 = dpt.clip(ar1, ar2, ar3, order="C") + assert r5.strides == (n, 1) + + ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2].mT + ar3 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2].mT + r4 = dpt.clip(ar1, ar2, ar3, order="K") + assert r4.strides == (-1, n) + r5 = dpt.clip(ar1, ar2, ar3, order="C") + assert r5.strides == (n, 1) + + +@pytest.mark.parametrize("dt", ["i4", "i2"]) +def test_clip_none_order(dt): + get_queue_or_skip() + + test_shape = ( + 20, + 20, + ) + test_shape2 = tuple(2 * dim for dim in test_shape) + n = test_shape[-1] + + ar1 = dpt.ones(test_shape, dtype="i4", order="C") + ar2 = dpt.ones(test_shape, dtype=dt, order="C") + + r1 = dpt.clip(ar1, min=None, max=ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.clip(ar1, min=None, max=ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.clip(ar1, min=None, max=ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.clip(ar1, min=None, max=ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones(test_shape, dtype="i4", order="F") + ar2 = dpt.ones(test_shape, dtype=dt, order="F") + + r1 = dpt.clip(ar1, min=None, max=ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.clip(ar1, min=None, max=ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.clip(ar1, min=None, max=ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.clip(ar1, min=None, max=ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones(test_shape2, dtype=dt, order="C")[:20, ::-2] + + r4 = dpt.clip(ar1, min=None, max=ar2, order="K") + assert r4.strides == (n, -1) + r5 = dpt.clip(ar1, min=None, max=ar2, order="C") + assert r5.strides == (n, 1) + + ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones(test_shape2, dtype=dt, order="C")[:20, ::-2].mT + + r4 = dpt.clip(ar1, min=None, max=ar2, order="K") + assert r4.strides == (-1, n) + r5 = dpt.clip(ar1, min=None, max=ar2, order="C") + assert r5.strides == (n, 1) + + +@pytest.mark.parametrize("usm_type1", _usm_types) +@pytest.mark.parametrize("usm_type2", _usm_types) +@pytest.mark.parametrize("usm_type3", _usm_types) +def test_clip_usm_type_matrix(usm_type1, usm_type2, usm_type3): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=usm_type1) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=usm_type2) + ar3 = dpt.ones_like(ar1, dtype="i4", usm_type=usm_type3) + + r = dpt.clip(ar1, ar2, ar3) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpctl.utils.get_coerced_usm_type( + (usm_type1, usm_type2, usm_type3) + ) + assert 
r.usm_type == expected_usm_type + + +@pytest.mark.parametrize("usm_type1", _usm_types) +@pytest.mark.parametrize("usm_type2", _usm_types) +def test_clip_usm_type_matrix_none_arg(usm_type1, usm_type2): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=usm_type1) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=usm_type2) + + r = dpt.clip(ar1, min=ar2, max=None) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpctl.utils.get_coerced_usm_type((usm_type1, usm_type2)) + assert r.usm_type == expected_usm_type + + +def test_clip_dtype_error(): + get_queue_or_skip() + + ar1 = dpt.ones(1, dtype="i4") + ar2 = dpt.ones(1, dtype="i4") + ar3 = dpt.ones(1, dtype="i4") + ar4 = dpt.empty_like(ar1, dtype="f4") + + assert_raises_regex( + ValueError, + "Output array of type.*is needed", + dpt.clip, + ar1, + ar2, + ar3, + ar4, + ) + assert_raises_regex( + ValueError, + "Output array of type.*is needed", + dpt.clip, + ar1, + ar2, + None, + ar4, + ) + + +def test_clip_errors(): + get_queue_or_skip() + try: + gpu_queue = dpctl.SyclQueue("gpu") + except dpctl.SyclQueueCreationError: + pytest.skip("SyclQueue('gpu') failed, skipping") + try: + cpu_queue = dpctl.SyclQueue("cpu") + except dpctl.SyclQueueCreationError: + pytest.skip("SyclQueue('cpu') failed, skipping") + + ar1 = dpt.ones(2, dtype="float32", sycl_queue=gpu_queue) + ar2 = dpt.ones_like(ar1, sycl_queue=gpu_queue) + ar3 = dpt.ones_like(ar1, sycl_queue=gpu_queue) + ar4 = dpt.empty_like(ar1, sycl_queue=cpu_queue) + assert_raises_regex( + ExecutionPlacementError, + "Input and output allocation queues are not compatible", + dpt.clip, + ar1, + ar2, + ar3, + ar4, + ) + + assert_raises_regex( + ExecutionPlacementError, + "Input and output allocation queues are not compatible", + dpt.clip, + ar1, + None, + ar3, + ar4, + ) + + assert_raises_regex( + ExecutionPlacementError, + "Execution placement can not be unambiguously inferred from input " + "arguments.", + dpt.clip, + ar1, + ar4, + ar2, + ar3, + ) + + assert_raises_regex( + ExecutionPlacementError, + "Execution placement can not be unambiguously inferred from input " + "arguments.", + dpt.clip, + ar1, + ar4, + 1, + ar3, + ) + + assert_raises_regex( + ExecutionPlacementError, + "Execution placement can not be unambiguously inferred from input " + "arguments.", + dpt.clip, + ar1, + 1, + ar4, + ar3, + ) + + assert_raises_regex( + ExecutionPlacementError, + "Execution placement can not be unambiguously inferred from input " + "arguments.", + dpt.clip, + ar1, + ar4, + None, + ar2, + ) + + ar1 = dpt.ones(2, dtype="float32") + ar2 = dpt.ones_like(ar1, dtype="float32") + ar3 = dpt.ones_like(ar1, dtype="float32") + ar4 = dpt.empty(3, dtype="float32") + assert_raises_regex( + ValueError, + "The shape of input and output arrays are inconsistent", + dpt.clip, + ar1, + ar2, + ar3, + ar4, + ) + + assert_raises_regex( + ValueError, + "The shape of input and output arrays are inconsistent", + dpt.clip, + ar1, + ar2, + None, + ar4, + ) + + ar1 = np.ones(2, dtype="f4") + ar2 = dpt.ones(2, dtype="f4") + ar3 = dpt.ones(2, dtype="f4") + assert_raises_regex( + TypeError, + "Expected `x` to be of dpctl.tensor.usm_ndarray type*", + dpt.clip, + ar1, + ar2, + ar3, + ) + + ar1 = dpt.ones(2, dtype="i4") + ar2 = dpt.ones_like(ar1, dtype="i4") + ar3 = dpt.ones_like(ar1, dtype="i4") + ar4 = np.empty_like(ar1) + assert_raises_regex( + TypeError, + "output array must be of usm_ndarray type", + dpt.clip, + ar1, + ar2, + ar3, + ar4, + ) + + assert_raises_regex( + TypeError, + "output array must be of 
usm_ndarray type", + dpt.clip, + ar1, + ar2, + None, + ar4, + ) + + +def test_clip_out_type_check(): + get_queue_or_skip() + + x1 = dpt.ones(10) + x2 = dpt.ones(10) + x3 = dpt.ones(10) + + out = range(10) + + with pytest.raises(TypeError): + dpt.clip(x1, x2, x3, out=out) + + +@pytest.mark.parametrize("dt", ["i4", "f4", "c8"]) +def test_clip_basic(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + sz = 1026 + x = dpt.arange(sz, dtype=dt, sycl_queue=q) + r = dpt.clip(x, min=100, max=500) + expected = dpt.arange(sz, dtype=dt, sycl_queue=q) + expected[:100] = 100 + expected[500:] = 500 + assert dpt.all(expected == r) + + x = dpt.zeros(sz, dtype=dt, sycl_queue=q) + a_max = dpt.full(sz, -1, dtype=dt, sycl_queue=q) + a_max[::2] = -2 + r = dpt.clip(x, min=-3, max=a_max) + assert dpt.all(a_max == r) + + +@pytest.mark.parametrize("dt", ["i4", "f4", "c8"]) +def test_clip_strided(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + sz = 2 * 1026 + x = dpt.arange(sz, dtype=dt, sycl_queue=q)[::-2] + r = dpt.clip(x, min=100, max=500) + expected = dpt.arange(sz, dtype=dt, sycl_queue=q) + expected[:100] = 100 + expected[500:] = 500 + expected = expected[::-2] + assert dpt.all(expected == r) + + x = dpt.zeros(sz, dtype=dt, sycl_queue=q)[::-2] + a_max = dpt.full(sz, -1, dtype=dt, sycl_queue=q) + a_max[::2] = -2 + a_max = a_max[::-2] + r = dpt.clip(x, min=-3, max=a_max) + assert dpt.all(a_max == r) + + +def test_clip_max_less_than_min(): + get_queue_or_skip() + + x = dpt.ones(10, dtype="i4") + res = dpt.clip(x, 5, 0) + assert dpt.all(res == 0) + + +def test_clip_minmax_weak_types(): + get_queue_or_skip() + + x = dpt.zeros(10, dtype=dpt.bool) + min_list = [False, 0, 0.0, 0.0 + 0.0j] + max_list = [True, 1, 1.0, 1.0 + 0.0j] + for min_v, max_v in zip(min_list, max_list): + if isinstance(min_v, bool) and isinstance(max_v, bool): + y = dpt.clip(x, min_v, max_v) + assert isinstance(y, dpt.usm_ndarray) + else: + with pytest.raises(ValueError): + dpt.clip(x, min_v, max_v) + + +def test_clip_max_weak_types(): + get_queue_or_skip() + + x = dpt.zeros(10, dtype="i4") + m = dpt.ones(10, dtype="i4") + + with pytest.raises(ValueError): + dpt.clip(x, m, 2.5) + + with pytest.raises(ValueError): + dpt.clip(x, 2.5, m)
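
Usage sketch (editorial note, not part of the diff): the new `dpctl.tensor.clip` entry point exercised above accepts array or Python-scalar bounds, one-sided limits, and a preallocated `out` array; a minimal example, assuming a default SYCL device is available:

    import dpctl.tensor as dpt

    x = dpt.arange(10, dtype="i4")
    y = dpt.clip(x, 2, 7)              # two-sided clamp to [2, 7]
    z = dpt.clip(x, min=None, max=7)   # one-sided; implemented via ti._minimum
    out = dpt.empty_like(x)
    dpt.clip(x, 2, 7, out=out)         # result written into `out`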