diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index aba009411d..8e1c3b3b57 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -50,6 +50,7 @@ set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reduction_over_axis.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp ) set(python_module_name _tensor_impl) @@ -65,6 +66,7 @@ set(_no_fast_math_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reduction_over_axis.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp ) foreach(_src_fn ${_no_fast_math_sources}) get_source_file_property(_cmpl_options_prop ${_src_fn} COMPILE_OPTIONS) diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index bab31379b7..209a6d4e56 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -93,6 +93,7 @@ from dpctl.tensor._usmarray import usm_ndarray from dpctl.tensor._utility_functions import all, any +from ._clip import clip from ._constants import e, inf, nan, newaxis, pi from ._elementwise_funcs import ( abs, @@ -322,4 +323,5 @@ "exp2", "copysign", "rsqrt", + "clip", ] diff --git a/dpctl/tensor/_clip.py b/dpctl/tensor/_clip.py new file mode 100644 index 0000000000..5a3a96933f --- /dev/null +++ b/dpctl/tensor/_clip.py @@ -0,0 +1,837 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import dpctl
+import dpctl.tensor as dpt
+import dpctl.tensor._tensor_impl as ti
+from dpctl.tensor._copy_utils import (
+    _empty_like_orderK,
+    _empty_like_pair_orderK,
+    _empty_like_triple_orderK,
+)
+from dpctl.tensor._elementwise_common import (
+    WeakBooleanType,
+    WeakComplexType,
+    WeakFloatingType,
+    WeakIntegralType,
+    _get_dtype,
+    _get_queue_usm_type,
+    _get_shape,
+    _strong_dtype_num_kind,
+    _validate_dtype,
+    _weak_type_num_kind,
+)
+from dpctl.tensor._manipulation_functions import _broadcast_shape_impl
+from dpctl.tensor._type_utils import _can_cast, _to_device_supported_dtype
+from dpctl.utils import ExecutionPlacementError
+
+
+def _resolve_one_strong_two_weak_types(st_dtype, dtype1, dtype2, dev):
+    "Resolves weak data types per NEP-0050, "
+    "where the second and third arguments are "
+    "permitted to be weak types"
+    if isinstance(
+        st_dtype,
+        (
+            WeakBooleanType,
+            WeakIntegralType,
+            WeakFloatingType,
+            WeakComplexType,
+        ),
+    ):
+        raise ValueError
+    if isinstance(
+        dtype1,
+        (WeakBooleanType, WeakIntegralType, WeakFloatingType, WeakComplexType),
+    ):
+        if isinstance(
+            dtype2,
+            (
+                WeakBooleanType,
+                WeakIntegralType,
+                WeakFloatingType,
+                WeakComplexType,
+            ),
+        ):
+            kind_num1 = _weak_type_num_kind(dtype1)
+            kind_num2 = _weak_type_num_kind(dtype2)
+            st_kind_num = _strong_dtype_num_kind(st_dtype)
+
+            if kind_num1 > st_kind_num:
+                if isinstance(dtype1, WeakIntegralType):
+                    ret_dtype1 = dpt.dtype(ti.default_device_int_type(dev))
+                elif isinstance(dtype1, WeakComplexType):
+                    if st_dtype is dpt.float16 or st_dtype is dpt.float32:
+                        ret_dtype1 = dpt.complex64
+                    else:
+                        ret_dtype1 = _to_device_supported_dtype(
+                            dpt.complex128, dev
+                        )
+                else:
+                    ret_dtype1 = _to_device_supported_dtype(dpt.float64, dev)
+            else:
+                ret_dtype1 = st_dtype
+
+            if kind_num2 > st_kind_num:
+                if isinstance(dtype2, WeakIntegralType):
+                    ret_dtype2 = dpt.dtype(ti.default_device_int_type(dev))
+                elif isinstance(dtype2, WeakComplexType):
+                    if st_dtype is dpt.float16 or st_dtype is dpt.float32:
+                        ret_dtype2 = dpt.complex64
+                    else:
+                        ret_dtype2 = _to_device_supported_dtype(
+                            dpt.complex128, dev
+                        )
+                else:
+                    ret_dtype2 = _to_device_supported_dtype(dpt.float64, dev)
+            else:
+                ret_dtype2 = st_dtype
+
+            return ret_dtype1, ret_dtype2
+
+        max_dt_num_kind, max_dtype = max(
+            [
+                (_strong_dtype_num_kind(st_dtype), st_dtype),
+                (_strong_dtype_num_kind(dtype2), dtype2),
+            ]
+        )
+        dt1_kind_num = _weak_type_num_kind(dtype1)
+        if dt1_kind_num > max_dt_num_kind:
+            if isinstance(dtype1, WeakIntegralType):
+                return dpt.dtype(ti.default_device_int_type(dev)), dtype2
+            if isinstance(dtype1, WeakComplexType):
+                if max_dtype is dpt.float16 or max_dtype is dpt.float32:
+                    return dpt.complex64, dtype2
+                return (
+                    _to_device_supported_dtype(dpt.complex128, dev),
+                    dtype2,
+                )
+            return _to_device_supported_dtype(dpt.float64, dev), dtype2
+        else:
+            return max_dtype, dtype2
+    elif isinstance(
+        dtype2,
+        (WeakBooleanType, WeakIntegralType, WeakFloatingType, WeakComplexType),
+    ):
+        max_dt_num_kind, max_dtype = max(
+            [
+                (_strong_dtype_num_kind(st_dtype), st_dtype),
+                (_strong_dtype_num_kind(dtype1), dtype1),
+            ]
+        )
+        dt2_kind_num = _weak_type_num_kind(dtype2)
+        if dt2_kind_num > max_dt_num_kind:
+            if isinstance(dtype2, WeakIntegralType):
+                return dtype1, dpt.dtype(ti.default_device_int_type(dev))
+            if isinstance(dtype2, WeakComplexType):
+                if max_dtype is dpt.float16 or max_dtype is dpt.float32:
+                    return dtype1, dpt.complex64
+                return (
+                    dtype1,
+                    _to_device_supported_dtype(dpt.complex128, dev),
+                )
+            return dtype1, _to_device_supported_dtype(dpt.float64, dev)
+        else:
+            return dtype1, max_dtype
+    else:
+        # both are strong dtypes
+        # return unmodified
+        return dtype1, dtype2
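+
+# Editorial sketch (not part of the change): under the NEP-0050-style rules
+# above, a weak (Python scalar) type only widens the result when its kind
+# exceeds that of the strong dtype. For example, on a device with fp64:
+#
+#     import dpctl.tensor as dpt
+#     x = dpt.ones(5, dtype="i4")
+#     dpt.clip(x, 0, 2)      # weak ints resolve to int32; succeeds
+#     dpt.clip(x, 0.0, 2.5)  # weak float outranks int32 -> float64, which
+#                            # cannot cast safely to int32 -> ValueError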
+
+
+def _resolve_one_strong_one_weak_types(st_dtype, dtype, dev):
+    "Resolves one weak data type with one strong data type per NEP-0050"
+    if isinstance(
+        st_dtype,
+        (WeakBooleanType, WeakIntegralType, WeakFloatingType, WeakComplexType),
+    ):
+        raise ValueError
+    if isinstance(
+        dtype,
+        (WeakBooleanType, WeakIntegralType, WeakFloatingType, WeakComplexType),
+    ):
+        st_kind_num = _strong_dtype_num_kind(st_dtype)
+        kind_num = _weak_type_num_kind(dtype)
+        if kind_num > st_kind_num:
+            if isinstance(dtype, WeakIntegralType):
+                return dpt.dtype(ti.default_device_int_type(dev))
+            if isinstance(dtype, WeakComplexType):
+                if st_dtype is dpt.float16 or st_dtype is dpt.float32:
+                    return dpt.complex64
+                return _to_device_supported_dtype(dpt.complex128, dev)
+            return _to_device_supported_dtype(dpt.float64, dev)
+        else:
+            return st_dtype
+    else:
+        return dtype
+
+
+def _check_clip_dtypes(res_dtype, arg1_dtype, arg2_dtype, sycl_dev):
+    "Checks if both types `arg1_dtype` and `arg2_dtype` can be "
+    "cast to `res_dtype` according to the rule `safe`"
+    if arg1_dtype == res_dtype and arg2_dtype == res_dtype:
+        return None, None, res_dtype
+
+    _fp16 = sycl_dev.has_aspect_fp16
+    _fp64 = sycl_dev.has_aspect_fp64
+    if _can_cast(arg1_dtype, res_dtype, _fp16, _fp64) and _can_cast(
+        arg2_dtype, res_dtype, _fp16, _fp64
+    ):
+        # prevent unnecessary casting
+        ret_buf1_dt = None if res_dtype == arg1_dtype else res_dtype
+        ret_buf2_dt = None if res_dtype == arg2_dtype else res_dtype
+        return ret_buf1_dt, ret_buf2_dt, res_dtype
+    else:
+        return None, None, None
+
+
+def _clip_none(x, val, out, order, _binary_fn):
+    if order not in ["K", "C", "F", "A"]:
+        order = "K"
+    q1, x_usm_type = x.sycl_queue, x.usm_type
+    q2, val_usm_type = _get_queue_usm_type(val)
+    if q2 is None:
+        exec_q = q1
+        res_usm_type = x_usm_type
+    else:
+        exec_q = dpctl.utils.get_execution_queue((q1, q2))
+        if exec_q is None:
+            raise ExecutionPlacementError(
+                "Execution placement can not be unambiguously inferred "
+                "from input arguments."
+            )
+        res_usm_type = dpctl.utils.get_coerced_usm_type(
+            (
+                x_usm_type,
+                val_usm_type,
+            )
+        )
+    dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
+    x_shape = x.shape
+    val_shape = _get_shape(val)
+    if not isinstance(val_shape, (tuple, list)):
+        raise TypeError(
+            "Shape of arguments can not be inferred. "
+            "Arguments are expected to be "
+            "lists, tuples, or both"
+        )
+    try:
+        res_shape = _broadcast_shape_impl(
+            [
+                x_shape,
+                val_shape,
+            ]
+        )
+    except ValueError:
+        raise ValueError(
+            "operands could not be broadcast together with shapes "
+            f"{x_shape} and {val_shape}"
+        )
+    sycl_dev = exec_q.sycl_device
+    x_dtype = x.dtype
+    val_dtype = _get_dtype(val, sycl_dev)
+    if not _validate_dtype(val_dtype):
+        raise ValueError("Operands have unsupported data types")
+
+    val_dtype = _resolve_one_strong_one_weak_types(x_dtype, val_dtype, sycl_dev)
+
+    res_dt = x.dtype
+    _fp16 = sycl_dev.has_aspect_fp16
+    _fp64 = sycl_dev.has_aspect_fp64
+    if not _can_cast(val_dtype, res_dt, _fp16, _fp64):
+        raise ValueError(
+            "function 'clip' does not support input types "
+            f"({x_dtype}, {val_dtype}), "
+            "and the inputs could not be safely coerced to any "
+            "supported types according to the casting rule ''safe''."
+ ) + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + + if out.shape != res_shape: + raise ValueError( + "The shape of input and output arrays are inconsistent. " + f"Expected output shape is {res_shape}, got {out.shape}" + ) + + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, got {out.dtype}" + ) + + if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None: + raise ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + + if ti._array_overlap(x, out): + if not ti._same_logical_tensors(x, out): + out = dpt.empty_like(out) + + if isinstance(val, dpt.usm_ndarray): + if ( + ti._array_overlap(val, out) + and not ti._same_logical_tensors(val, out) + and val_dtype == res_dt + ): + out = dpt.empty_like(out) + + if isinstance(val, dpt.usm_ndarray): + val_ary = val + else: + val_ary = dpt.asarray(val, dtype=val_dtype, sycl_queue=exec_q) + + if val_dtype == res_dt: + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + x, val_ary, res_dt, res_shape, res_usm_type, exec_q + ) + else: + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + x, + val_ary, + ) + ) + else "C" + ) + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + if x_shape != res_shape: + x = dpt.broadcast_to(x, res_shape) + if val_ary.shape != res_shape: + val_ary = dpt.broadcast_to(val_ary, res_shape) + ht_binary_ev, binary_ev = _binary_fn( + src1=x, src2=val_ary, dst=out, sycl_queue=exec_q + ) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + ht_copy_out_ev.wait() + out = orig_out + ht_binary_ev.wait() + return out + else: + if order == "K": + buf = _empty_like_orderK(val_ary, res_dt) + else: + if order == "A": + order = "F" if x.flags.f_contiguous else "C" + buf = dpt.empty_like(val_ary, dtype=res_dt, order=order) + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=val_ary, dst=buf, sycl_queue=exec_q + ) + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + x, buf, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + if x_shape != res_shape: + x = dpt.broadcast_to(x, res_shape) + buf = dpt.broadcast_to(buf, res_shape) + ht_binary_ev, binary_ev = _binary_fn( + src1=x, + src2=buf, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + ht_copy_out_ev.wait() + out = orig_out + ht_copy_ev.wait() + ht_binary_ev.wait() + return out + + +# need to handle logic for min or max being None +def clip(x, min=None, max=None, out=None, order="K"): + """clip(x, min, max, out=None, order="K") + + Clips to the range [`min_i`, `max_i`] for each element `x_i` + in `x`. + + Args: + x (usm_ndarray): Array containing elements to clip. + Must be compatible with `min` and `max` according + to broadcasting rules. 
+        min ({None, usm_ndarray, bool, int, float, complex}, optional):
+            Array or scalar containing minimum values.
+            Must be compatible with `x` and `max` according
+            to broadcasting rules.
+            Only one of `min` and `max` can be `None`.
+        max ({None, usm_ndarray, bool, int, float, complex}, optional):
+            Array or scalar containing maximum values.
+            Must be compatible with `x` and `min` according
+            to broadcasting rules.
+            Only one of `min` and `max` can be `None`.
+        out ({None, usm_ndarray}, optional):
+            Output array to populate.
+            Array must have the correct shape and the expected data type.
+        order ("C","F","A","K", optional):
+            Memory layout of the new output array, if parameter `out` is
+            `None`.
+            Default: "K".
+
+    Returns:
+        usm_ndarray:
+            An array with elements clipped to the range [`min`, `max`].
+            The returned array has the same data type as `x`.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(
+            "Expected `x` to be of dpctl.tensor.usm_ndarray type, got "
+            f"{type(x)}"
+        )
+    if min is None and max is None:
+        raise ValueError(
+            "only one of `min` and `max` is permitted to be `None`"
+        )
+    elif max is None:
+        return _clip_none(x, min, out, order, ti._maximum)
+    elif min is None:
+        return _clip_none(x, max, out, order, ti._minimum)
+    else:
+        q1, x_usm_type = x.sycl_queue, x.usm_type
+        q2, min_usm_type = _get_queue_usm_type(min)
+        q3, max_usm_type = _get_queue_usm_type(max)
+        if q2 is None and q3 is None:
+            exec_q = q1
+            res_usm_type = x_usm_type
+        elif q3 is None:
+            exec_q = dpctl.utils.get_execution_queue((q1, q2))
+            if exec_q is None:
+                raise ExecutionPlacementError(
+                    "Execution placement can not be unambiguously inferred "
+                    "from input arguments."
+                )
+            res_usm_type = dpctl.utils.get_coerced_usm_type(
+                (
+                    x_usm_type,
+                    min_usm_type,
+                )
+            )
+        elif q2 is None:
+            exec_q = dpctl.utils.get_execution_queue((q1, q3))
+            if exec_q is None:
+                raise ExecutionPlacementError(
+                    "Execution placement can not be unambiguously inferred "
+                    "from input arguments."
+                )
+            res_usm_type = dpctl.utils.get_coerced_usm_type(
+                (
+                    x_usm_type,
+                    max_usm_type,
+                )
+            )
+        else:
+            exec_q = dpctl.utils.get_execution_queue((q1, q2, q3))
+            if exec_q is None:
+                raise ExecutionPlacementError(
+                    "Execution placement can not be unambiguously inferred "
+                    "from input arguments."
+                )
+            res_usm_type = dpctl.utils.get_coerced_usm_type(
+                (
+                    x_usm_type,
+                    min_usm_type,
+                    max_usm_type,
+                )
+            )
+        dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
+        x_shape = x.shape
+        min_shape = _get_shape(min)
+        max_shape = _get_shape(max)
+        if not all(
+            isinstance(s, (tuple, list))
+            for s in (
+                min_shape,
+                max_shape,
+            )
+        ):
+            raise TypeError(
+                "Shape of arguments can not be inferred. "
+                "Arguments are expected to be "
+                "lists, tuples, or both"
+            )
" + "Arguments are expected to be " + "lists, tuples, or both" + ) + try: + res_shape = _broadcast_shape_impl( + [ + x_shape, + min_shape, + max_shape, + ] + ) + except ValueError: + raise ValueError( + "operands could not be broadcast together with shapes " + f"{x_shape}, {min_shape}, and {max_shape}" + ) + sycl_dev = exec_q.sycl_device + x_dtype = x.dtype + min_dtype = _get_dtype(min, sycl_dev) + max_dtype = _get_dtype(max, sycl_dev) + if not all(_validate_dtype(o) for o in (min_dtype, max_dtype)): + raise ValueError("Operands have unsupported data types") + + min_dtype, max_dtype = _resolve_one_strong_two_weak_types( + x_dtype, min_dtype, max_dtype, sycl_dev + ) + + buf1_dt, buf2_dt, res_dt = _check_clip_dtypes( + x_dtype, + min_dtype, + max_dtype, + sycl_dev, + ) + + if res_dt is None: + raise ValueError( + f"function '{clip}' does not support input types " + f"({x_dtype}, {min_dtype}, {max_dtype}), " + "and the inputs could not be safely coerced to any " + "supported types according to the casting rule ''safe''." + ) + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + "output array must be of usm_ndarray type, got " + f"{type(out)}" + ) + + if out.shape != res_shape: + raise ValueError( + "The shape of input and output arrays are " + f"inconsistent. Expected output shape is {res_shape}, " + f"got {out.shape}" + ) + + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, " + f"got {out.dtype}" + ) + + if ( + dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) + is None + ): + raise ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + + if ti._array_overlap(x, out): + if not ti._same_logical_tensors(x, out): + out = dpt.empty_like(out) + + if isinstance(min, dpt.usm_ndarray): + if ( + ti._array_overlap(min, out) + and not ti._same_logical_tensors(min, out) + and buf1_dt is None + ): + out = dpt.empty_like(out) + + if isinstance(max, dpt.usm_ndarray): + if ( + ti._array_overlap(max, out) + and not ti._same_logical_tensors(max, out) + and buf2_dt is None + ): + out = dpt.empty_like(out) + + if isinstance(min, dpt.usm_ndarray): + a_min = min + else: + a_min = dpt.asarray(min, dtype=min_dtype, sycl_queue=exec_q) + if isinstance(max, dpt.usm_ndarray): + a_max = max + else: + a_max = dpt.asarray(max, dtype=max_dtype, sycl_queue=exec_q) + + if buf1_dt is None and buf2_dt is None: + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + x, + a_min, + a_max, + res_dt, + res_shape, + res_usm_type, + exec_q, + ) + else: + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + x, + a_min, + a_max, + ) + ) + else "C" + ) + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + if x_shape != res_shape: + x = dpt.broadcast_to(x, res_shape) + if a_min.shape != res_shape: + a_min = dpt.broadcast_to(a_min, res_shape) + if a_max.shape != res_shape: + a_max = dpt.broadcast_to(a_max, res_shape) + ht_binary_ev, binary_ev = ti._clip( + src=x, min=a_min, max=a_max, dst=out, sycl_queue=exec_q + ) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + ht_copy_out_ev.wait() + out = orig_out + ht_binary_ev.wait() + return out + + elif buf1_dt is None: + if order == "K": + buf2 = 
_empty_like_orderK(a_max, buf2_dt) + else: + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + x, + a_min, + ) + ) + else "C" + ) + buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order) + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_max, dst=buf2, sycl_queue=exec_q + ) + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + x, + a_min, + buf2, + res_dt, + res_shape, + res_usm_type, + exec_q, + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + x = dpt.broadcast_to(x, res_shape) + if a_min.shape != res_shape: + a_min = dpt.broadcast_to(a_min, res_shape) + buf2 = dpt.broadcast_to(buf2, res_shape) + ht_binary_ev, binary_ev = ti._clip( + src=x, + min=a_min, + max=buf2, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + ht_copy_out_ev.wait() + out = orig_out + ht_copy_ev.wait() + ht_binary_ev.wait() + return out + + elif buf2_dt is None: + if order == "K": + buf1 = _empty_like_orderK(a_min, buf1_dt) + else: + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + x, + a_max, + ) + ) + else "C" + ) + buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order) + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_min, dst=buf1, sycl_queue=exec_q + ) + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + x, + buf1, + a_max, + res_dt, + res_shape, + res_usm_type, + exec_q, + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + x = dpt.broadcast_to(x, res_shape) + buf1 = dpt.broadcast_to(buf1, res_shape) + if a_max.shape != res_shape: + a_max = dpt.broadcast_to(a_max, res_shape) + ht_binary_ev, binary_ev = ti._clip( + src=x, + min=buf1, + max=a_max, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + ht_copy_out_ev.wait() + out = orig_out + ht_copy_ev.wait() + ht_binary_ev.wait() + return out + + if order in ["K", "A"]: + if ( + x.flags.f_contiguous + and a_min.flags.f_contiguous + and a_max.flags.f_contiguous + ): + order = "F" + elif ( + x.flags.c_contiguous + and a_min.flags.c_contiguous + and a_max.flags.c_contiguous + ): + order = "C" + else: + order = "C" if order == "A" else "K" + if order == "K": + buf1 = _empty_like_orderK(a_min, buf1_dt) + else: + buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order) + ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_min, dst=buf1, sycl_queue=exec_q + ) + if order == "K": + buf2 = _empty_like_orderK(a_max, buf2_dt) + else: + buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order) + ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_max, dst=buf2, sycl_queue=exec_q + ) + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + x, buf1, buf2, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + 
+        x = dpt.broadcast_to(x, res_shape)
+        buf1 = dpt.broadcast_to(buf1, res_shape)
+        buf2 = dpt.broadcast_to(buf2, res_shape)
+        ht_, _ = ti._clip(
+            src=x,
+            min=buf1,
+            max=buf2,
+            dst=out,
+            sycl_queue=exec_q,
+            depends=[copy1_ev, copy2_ev],
+        )
+        dpctl.SyclEvent.wait_for([ht_copy1_ev, ht_copy2_ev, ht_])
+        return out
diff --git a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py
index fca5b0734a..baaac078b5 100644
--- a/dpctl/tensor/_elementwise_common.py
+++ b/dpctl/tensor/_elementwise_common.py
@@ -649,12 +649,7 @@ def __call__(self, o1, o2, out=None, order="K"):
                         sycl_queue=exec_q,
                         order=order,
                     )
-            else:
-                if res_dt != out.dtype:
-                    raise TypeError(
-                        f"Output array of type {res_dt} is needed,"
-                        f"got {out.dtype}"
-                    )
+
             if src1.shape != res_shape:
                 src1 = dpt.broadcast_to(src1, res_shape)
             buf2 = dpt.broadcast_to(buf2, res_shape)
diff --git a/dpctl/tensor/libtensor/include/kernels/clip.hpp b/dpctl/tensor/libtensor/include/kernels/clip.hpp
new file mode 100644
index 0000000000..9cca9f615b
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/clip.hpp
@@ -0,0 +1,311 @@
+//=== clip.hpp - Implementation of clip kernels ---*-C++-*--/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for dpctl.tensor.clip.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include "pybind11/numpy.h"
+#include "pybind11/stl.h"
+#include <CL/sycl.hpp>
+#include <algorithm>
+#include <complex>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include "utils/math_utils.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace kernels
+{
+namespace clip
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using namespace dpctl::tensor::offset_utils;
+
+template <typename T> T clip(const T &x, const T &min, const T &max)
+{
+    using dpctl::tensor::type_utils::is_complex;
+    if constexpr (is_complex<T>::value) {
+        using dpctl::tensor::math_utils::max_complex;
+        using dpctl::tensor::math_utils::min_complex;
+        return min_complex(max_complex(x, min), max);
+    }
+    else if constexpr (std::is_floating_point_v<T> ||
+                       std::is_same_v<T, sycl::half>)
+    {
+        auto tmp = (std::isnan(x) || x > min) ? x : min;
+        return (std::isnan(tmp) || tmp < max) ? tmp : max;
+    }
+    else if constexpr (std::is_same_v<T, bool>) {
+        return (x || min) && max;
+    }
+    else {
+        auto tmp = (x > min) ? x : min;
+        return (tmp < max) ? tmp : max;
+    }
+}
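+
+// Editorial sketch (not part of the kernel): for floating-point types the
+// two ternaries above propagate NaNs coming from `x`:
+//
+//     clip(NaN, lo, hi) -> NaN  (isnan(x) selects x; isnan(tmp) keeps it)
+//     clip(x, lo, hi) with hi < lo -> hi  (the max bound is applied last)
+//
+// while for integral types the branch reduces to min(max(x, min), max).
+// The hi < lo behavior is exercised by test_clip_max_less_than_min below.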
+
+template <typename T, int vec_sz = 4, int n_vecs = 2> class ClipContigFunctor
+{
+private:
+    size_t nelems = 0;
+    const T *x_p = nullptr;
+    const T *min_p = nullptr;
+    const T *max_p = nullptr;
+    T *dst_p = nullptr;
+
+public:
+    ClipContigFunctor(size_t nelems_,
+                      const T *x_p_,
+                      const T *min_p_,
+                      const T *max_p_,
+                      T *dst_p_)
+        : nelems(nelems_), x_p(x_p_), min_p(min_p_), max_p(max_p_),
+          dst_p(dst_p_)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> ndit) const
+    {
+        using dpctl::tensor::type_utils::is_complex;
+        if constexpr (is_complex<T>::value) {
+            std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0];
+            size_t base = ndit.get_global_linear_id();
+
+            base = (base / sgSize) * sgSize * n_vecs * vec_sz + (base % sgSize);
+            for (size_t offset = base;
+                 offset < std::min(nelems, base + sgSize * (n_vecs * vec_sz));
+                 offset += sgSize)
+            {
+                dst_p[offset] = clip(x_p[offset], min_p[offset], max_p[offset]);
+            }
+        }
+        else {
+            auto sg = ndit.get_sub_group();
+            std::uint8_t sgSize = sg.get_local_range()[0];
+            std::uint8_t max_sgSize = sg.get_max_local_range()[0];
+            size_t base = n_vecs * vec_sz *
+                          (ndit.get_group(0) * ndit.get_local_range(0) +
+                           sg.get_group_id()[0] * max_sgSize);
+
+            if (base + n_vecs * vec_sz * sgSize < nelems &&
+                sgSize == max_sgSize) {
+                sycl::vec<T, vec_sz> x_vec;
+                sycl::vec<T, vec_sz> min_vec;
+                sycl::vec<T, vec_sz> max_vec;
+                sycl::vec<T, vec_sz> dst_vec;
+#pragma unroll
+                for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
+                    auto idx = base + it * sgSize;
+                    auto x_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&x_p[idx]);
+                    auto min_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&min_p[idx]);
+                    auto max_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&max_p[idx]);
+                    auto dst_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&dst_p[idx]);
+
+                    x_vec = sg.load<vec_sz>(x_multi_ptr);
+                    min_vec = sg.load<vec_sz>(min_multi_ptr);
+                    max_vec = sg.load<vec_sz>(max_multi_ptr);
+#pragma unroll
+                    for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) {
+                        dst_vec[vec_id] = clip(x_vec[vec_id], min_vec[vec_id],
+                                               max_vec[vec_id]);
+                    }
+                    sg.store<vec_sz>(dst_multi_ptr, dst_vec);
+                }
+            }
+            else {
+                for (size_t k = base + sg.get_local_id()[0]; k < nelems;
+                     k += sgSize) {
+                    dst_p[k] = clip(x_p[k], min_p[k], max_p[k]);
+                }
+            }
+        }
+    }
+};
+
+template <typename T, int vec_sz, int n_vecs> class clip_contig_kernel;
+
+typedef sycl::event (*clip_contig_impl_fn_ptr_t)(
+    sycl::queue &,
+    size_t,
+    const char *,
+    const char *,
+    const char *,
+    char *,
+    const std::vector<sycl::event> &);
+
+template <typename T>
+sycl::event clip_contig_impl(sycl::queue &q,
+                             size_t nelems,
+                             const char *x_cp,
+                             const char *min_cp,
+                             const char *max_cp,
+                             char *dst_cp,
+                             const std::vector<sycl::event> &depends)
+{
+    const T *x_tp = reinterpret_cast<const T *>(x_cp);
+    const T *min_tp = reinterpret_cast<const T *>(min_cp);
+    const T *max_tp = reinterpret_cast<const T *>(max_cp);
+    T *dst_tp = reinterpret_cast<T *>(dst_cp);
+
+    sycl::event clip_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        size_t lws = 64;
+        constexpr unsigned int vec_sz = 4;
+        constexpr unsigned int n_vecs = 2;
+        const size_t n_groups =
+            ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
+        const auto gws_range = sycl::range<1>(n_groups * lws);
+        const auto lws_range = sycl::range<1>(lws);
+
+        cgh.parallel_for<clip_contig_kernel<T, vec_sz, n_vecs>>(
+            sycl::nd_range<1>(gws_range, lws_range),
+            ClipContigFunctor<T, vec_sz, n_vecs>(nelems, x_tp, min_tp, max_tp,
+                                                 dst_tp));
+    });
+
+    return clip_ev;
+}
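+
+// Editorial note: with lws = 64, vec_sz = 4, and n_vecs = 2 above, each
+// work-item covers n_vecs * vec_sz = 8 elements, so one work-group spans
+// 64 * 8 = 512 elements and n_groups = ceil(nelems / 512); e.g. nelems = 1026
+// (as in test_clip_basic) launches 3 groups, with partial sub-group tails
+// handled by the scalar loop in the functor.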
+
+template <typename T, typename IndexerT> class ClipStridedFunctor
+{
+private:
+    const T *x_p = nullptr;
+    const T *min_p = nullptr;
+    const T *max_p = nullptr;
+    T *dst_p = nullptr;
+    IndexerT indexer;
+
+public:
+    ClipStridedFunctor(const T *x_p_,
+                       const T *min_p_,
+                       const T *max_p_,
+                       T *dst_p_,
+                       IndexerT indexer_)
+        : x_p(x_p_), min_p(min_p_), max_p(max_p_), dst_p(dst_p_),
+          indexer(indexer_)
+    {
+    }
+
+    void operator()(sycl::id<1> id) const
+    {
+        size_t gid = id[0];
+        auto offsets = indexer(static_cast<py::ssize_t>(gid));
+        dst_p[offsets.get_fourth_offset()] = clip(
+            x_p[offsets.get_first_offset()], min_p[offsets.get_second_offset()],
+            max_p[offsets.get_third_offset()]);
+    }
+};
+
+template <typename T, typename IndexerT> class clip_strided_kernel;
+
+typedef sycl::event (*clip_strided_impl_fn_ptr_t)(
+    sycl::queue &,
+    size_t,
+    int,
+    const char *,
+    const char *,
+    const char *,
+    char *,
+    const py::ssize_t *,
+    py::ssize_t,
+    py::ssize_t,
+    py::ssize_t,
+    py::ssize_t,
+    const std::vector<sycl::event> &);
+
+template <typename T>
+sycl::event clip_strided_impl(sycl::queue &q,
+                              size_t nelems,
+                              int nd,
+                              const char *x_cp,
+                              const char *min_cp,
+                              const char *max_cp,
+                              char *dst_cp,
+                              const py::ssize_t *shape_strides,
+                              py::ssize_t x_offset,
+                              py::ssize_t min_offset,
+                              py::ssize_t max_offset,
+                              py::ssize_t dst_offset,
+                              const std::vector<sycl::event> &depends)
+{
+    const T *x_tp = reinterpret_cast<const T *>(x_cp);
+    const T *min_tp = reinterpret_cast<const T *>(min_cp);
+    const T *max_tp = reinterpret_cast<const T *>(max_cp);
+    T *dst_tp = reinterpret_cast<T *>(dst_cp);
+
+    sycl::event clip_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        FourOffsets_StridedIndexer indexer{
+            nd, x_offset, min_offset, max_offset, dst_offset, shape_strides};
+
+        cgh.parallel_for<clip_strided_kernel<T, FourOffsets_StridedIndexer>>(
+            sycl::range<1>(nelems),
+            ClipStridedFunctor<T, FourOffsets_StridedIndexer>(
+                x_tp, min_tp, max_tp, dst_tp, indexer));
+    });
+
+    return clip_ev;
+}
+
+template <typename fnT, typename T> struct ClipStridedFactory
+{
+    fnT get()
+    {
+        fnT fn = clip_strided_impl<T>;
+        return fn;
+    }
+};
+
+template <typename fnT, typename T> struct ClipContigFactory
+{
+    fnT get()
+    {
+        fnT fn = clip_contig_impl<T>;
+        return fn;
+    }
+};
+
+} // namespace clip
+} // namespace kernels
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/clip.cpp b/dpctl/tensor/libtensor/source/clip.cpp
new file mode 100644
index 0000000000..ac494c19ae
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/clip.cpp
@@ -0,0 +1,269 @@
+//===-- clip.cpp - Implementation of clip --*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines Python API for implementation functions of
+/// dpctl.tensor.clip
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <complex>
+#include <cstdint>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <utility>
+#include <vector>
+
+#include "clip.hpp"
+#include "kernels/clip.hpp"
+#include "simplify_iteration_space.hpp"
+#include "utils/memory_overlap.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::kernels::clip::clip_contig_impl_fn_ptr_t;
+using dpctl::tensor::kernels::clip::clip_strided_impl_fn_ptr_t;
+
+static clip_contig_impl_fn_ptr_t clip_contig_dispatch_vector[td_ns::num_types];
+static clip_strided_impl_fn_ptr_t
+    clip_strided_dispatch_vector[td_ns::num_types];
+
+void init_clip_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    using dpctl::tensor::kernels::clip::ClipContigFactory;
+    DispatchVectorBuilder<clip_contig_impl_fn_ptr_t, ClipContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(clip_contig_dispatch_vector);
+
+    using dpctl::tensor::kernels::clip::ClipStridedFactory;
+    DispatchVectorBuilder<clip_strided_impl_fn_ptr_t, ClipStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(clip_strided_dispatch_vector);
+}
+
+using dpctl::utils::keep_args_alive;
+
+std::pair<sycl::event, sycl::event>
+py_clip(const dpctl::tensor::usm_ndarray &src,
+        const dpctl::tensor::usm_ndarray &min,
+        const dpctl::tensor::usm_ndarray &max,
+        const dpctl::tensor::usm_ndarray &dst,
+        sycl::queue &exec_q,
+        const std::vector<sycl::event> &depends)
+{
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, min, max, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    int nd = src.get_ndim();
+    int min_nd = min.get_ndim();
+    int max_nd = max.get_ndim();
+    int dst_nd = dst.get_ndim();
+
+    if (nd != min_nd || nd != max_nd) {
+        throw py::value_error(
+            "Input arrays are not of appropriate dimension for clip kernel.");
+    }
+
+    if (nd != dst_nd) {
+        throw py::value_error(
+            "Destination is not of appropriate dimension for clip kernel.");
+    }
+
+    const py::ssize_t *src_shape = src.get_shape_raw();
+    const py::ssize_t *min_shape = min.get_shape_raw();
+    const py::ssize_t *max_shape = max.get_shape_raw();
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+
+    bool shapes_equal(true);
+    size_t nelems(1);
+    for (int i = 0; i < nd; ++i) {
+        const auto &sh_i = dst_shape[i];
+        nelems *= static_cast<size_t>(sh_i);
+        shapes_equal = shapes_equal && (min_shape[i] == sh_i) &&
+                       (max_shape[i] == sh_i) && (src_shape[i] == sh_i);
+    }
+
+    if (!shapes_equal) {
+        throw py::value_error("Arrays are not of matching shapes.");
+    }
+
+    if (nelems == 0) {
+        return std::make_pair(sycl::event{}, sycl::event{});
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    auto const &same_logical_tensors =
+        dpctl::tensor::overlap::SameLogicalTensors();
+    if ((overlap(dst, src) && !same_logical_tensors(dst, src)) ||
+        (overlap(dst, min) && !same_logical_tensors(dst, min)) ||
+        (overlap(dst, max) && !same_logical_tensors(dst, max)))
+    {
+        throw py::value_error("Destination array overlaps with input.");
+    }
+
+    int min_typenum = min.get_typenum();
+    int max_typenum = max.get_typenum();
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    auto const &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int min_typeid = array_types.typenum_to_lookup_id(min_typenum);
+    int max_typeid = array_types.typenum_to_lookup_id(max_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    if (src_typeid != dst_typeid || src_typeid != min_typeid ||
+        src_typeid != max_typeid)
+    {
+        throw py::value_error("Input, min, max, and destination arrays must "
+                              "have the same data type");
+    }
+
+    // ensure that dst is sufficiently ample
+    auto dst_offsets = dst.get_minmax_offsets();
+    // destination must be ample enough to accommodate all elements
+    {
+        size_t range =
+            static_cast<size_t>(dst_offsets.second - dst_offsets.first);
+        if (range + 1 < static_cast<size_t>(nelems)) {
+            throw py::value_error(
+                "Memory addressed by the destination array can not "
+                "accommodate all the "
+                "array elements.");
+        }
+    }
+
+    char *src_data = src.get_data();
+    char *min_data = min.get_data();
+    char *max_data = max.get_data();
+    char *dst_data = dst.get_data();
+
+    bool is_min_c_contig = min.is_c_contiguous();
+    bool is_min_f_contig = min.is_f_contiguous();
+
+    bool is_max_c_contig = max.is_c_contiguous();
+    bool is_max_f_contig = max.is_f_contiguous();
+
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_src_f_contig = src.is_f_contiguous();
+
+    bool is_dst_c_contig = dst.is_c_contiguous();
+    bool is_dst_f_contig = dst.is_f_contiguous();
+
+    bool all_c_contig = (is_min_c_contig && is_max_c_contig &&
+                         is_src_c_contig && is_dst_c_contig);
+    bool all_f_contig = (is_min_f_contig && is_max_f_contig &&
+                         is_src_f_contig && is_dst_f_contig);
+
+    if (all_c_contig || all_f_contig) {
+        auto fn = clip_contig_dispatch_vector[src_typeid];
+
+        sycl::event clip_ev =
+            fn(exec_q, nelems, src_data, min_data, max_data, dst_data, depends);
+        sycl::event ht_ev =
+            keep_args_alive(exec_q, {src, min, max, dst}, {clip_ev});
+
+        return std::make_pair(ht_ev, clip_ev);
+    }
+
+    auto const &src_strides = src.get_strides_vector();
+    auto const &min_strides = min.get_strides_vector();
+    auto const &max_strides = max.get_strides_vector();
+    auto const &dst_strides = dst.get_strides_vector();
+
+    using shT = std::vector<py::ssize_t>;
+    shT simplified_shape;
+    shT simplified_src_strides;
+    shT simplified_min_strides;
+    shT simplified_max_strides;
+    shT simplified_dst_strides;
+    py::ssize_t src_offset(0);
+    py::ssize_t min_offset(0);
+    py::ssize_t max_offset(0);
+    py::ssize_t dst_offset(0);
+
+    dpctl::tensor::py_internal::simplify_iteration_space_4(
+        nd, src_shape, src_strides, min_strides, max_strides, dst_strides,
+        // outputs
+        simplified_shape, simplified_src_strides, simplified_min_strides,
+        simplified_max_strides, simplified_dst_strides, src_offset, min_offset,
+        max_offset, dst_offset);
+
+    auto fn = clip_strided_dispatch_vector[src_typeid];
+
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(2);
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events,
+        // common shape and strides
+        simplified_shape, simplified_src_strides, simplified_min_strides,
+        simplified_max_strides, simplified_dst_strides);
+    py::ssize_t *packed_shape_strides = std::get<0>(ptr_size_event_tuple);
+    sycl::event copy_shape_strides_ev = std::get<2>(ptr_size_event_tuple);
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+    all_deps.push_back(copy_shape_strides_ev);
+
+    assert(all_deps.size() == depends.size() + 1);
+
+    sycl::event clip_ev = fn(exec_q, nelems, nd, src_data, min_data, max_data,
+                             dst_data, packed_shape_strides, src_offset,
+                             min_offset, max_offset, dst_offset, all_deps);
+
+    // free packed temporaries
+    sycl::event temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(clip_ev);
+        const auto &ctx = exec_q.get_context();
+        cgh.host_task([packed_shape_strides, ctx]() {
+            sycl::free(packed_shape_strides, ctx);
+        });
+    });
+
+    host_task_events.push_back(temporaries_cleanup_ev);
+
+    sycl::event arg_cleanup_ev =
+        keep_args_alive(exec_q, {src, min, max, dst}, host_task_events);
+
+    return std::make_pair(arg_cleanup_ev, clip_ev);
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/clip.hpp b/dpctl/tensor/libtensor/source/clip.hpp
new file mode 100644
index 0000000000..d4b8af2cf5
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/clip.hpp
@@ -0,0 +1,52 @@
+//===-- clip.hpp - --*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares Python API for implementation functions of
+/// dpctl.tensor.clip
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <CL/sycl.hpp>
+#include <utility>
+#include <vector>
+
+#include "dpctl4pybind11.hpp"
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern std::pair<sycl::event, sycl::event>
+py_clip(const dpctl::tensor::usm_ndarray &src,
+        const dpctl::tensor::usm_ndarray &min,
+        const dpctl::tensor::usm_ndarray &max,
+        const dpctl::tensor::usm_ndarray &dst,
+        sycl::queue &exec_q,
+        const std::vector<sycl::event> &depends);
+
+extern void init_clip_dispatch_vectors(void);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp
index 0e8b4236b6..cc47ed1727 100644
--- a/dpctl/tensor/libtensor/source/tensor_py.cpp
+++ b/dpctl/tensor/libtensor/source/tensor_py.cpp
@@ -37,6 +37,7 @@
 #include "accumulators.hpp"
 #include "boolean_advanced_indexing.hpp"
 #include "boolean_reductions.hpp"
+#include "clip.hpp"
 #include "copy_and_cast_usm_to_usm.hpp"
 #include "copy_for_reshape.hpp"
 #include "copy_for_roll.hpp"
@@ -116,6 +117,9 @@ using dpctl::tensor::py_internal::usm_ndarray_triul;
 
 using dpctl::tensor::py_internal::py_where;
 
+/* =========================== Clip ============================== */
+using dpctl::tensor::py_internal::py_clip;
+
 // populate dispatch tables
 void init_dispatch_tables(void)
 {
@@ -148,6 +152,8 @@ void init_dispatch_vectors(void)
     populate_cumsum_1d_dispatch_vectors();
     init_repeat_dispatch_vectors();
 
+    init_clip_dispatch_vectors();
+
     return;
 }
 
@@ -441,6 +447,14 @@ PYBIND11_MODULE(_tensor_impl, m)
           py::arg("reps"), py::arg("axis"), py::arg("sycl_queue"),
           py::arg("depends") = py::list());
 
+    m.def("_clip", &py_clip,
+          "Clamps elements of array `x` to the range "
+          "[`min`, `max`] and writes the
result to the " + "array `dst` for each element of `x`, `min`, and `max`." + "Returns a tuple of events: (hev, ev)", + py::arg("src"), py::arg("min"), py::arg("max"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + dpctl::tensor::py_internal::init_elementwise_functions(m); dpctl::tensor::py_internal::init_boolean_reduction_functions(m); dpctl::tensor::py_internal::init_reduction_functions(m); diff --git a/dpctl/tests/test_tensor_clip.py b/dpctl/tests/test_tensor_clip.py new file mode 100644 index 0000000000..7050b17e7c --- /dev/null +++ b/dpctl/tests/test_tensor_clip.py @@ -0,0 +1,627 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +from helper import get_queue_or_skip, skip_if_dtype_not_supported +from numpy.testing import assert_raises_regex + +import dpctl +import dpctl.tensor as dpt +from dpctl.tensor._type_utils import _can_cast +from dpctl.utils import ExecutionPlacementError + +_all_dtypes = [ + "?", + "u1", + "i1", + "u2", + "i2", + "u4", + "i4", + "u8", + "i8", + "e", + "f", + "d", + "F", + "D", +] + +_usm_types = ["device", "shared", "host"] + + +@pytest.mark.parametrize("dt1", _all_dtypes) +@pytest.mark.parametrize("dt2", _all_dtypes) +def test_clip_dtypes(dt1, dt2): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt1, q) + skip_if_dtype_not_supported(dt2, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=dt1, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=dt1, sycl_queue=q) + ar3 = dpt.ones_like(ar1, dtype=dt2, sycl_queue=q) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + # also covers cases where dt1 == dt2 + if _can_cast(ar3.dtype, ar1.dtype, _fp16, _fp64): + r = dpt.clip(ar1, ar2, ar3) + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == ar1.dtype + assert r.shape == ar1.shape + assert dpt.all(r == ar1) + assert r.sycl_queue == ar1.sycl_queue + + r = dpt.clip(ar1, min=ar3, max=None) + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == ar1.dtype + assert r.shape == ar1.shape + assert dpt.all(r == ar1) + assert r.sycl_queue == ar1.sycl_queue + + r = dpt.clip(ar1, min=None, max=ar3) + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == ar1.dtype + assert r.shape == ar1.shape + assert dpt.all(r == ar1) + assert r.sycl_queue == ar1.sycl_queue + else: + with pytest.raises(ValueError): + dpt.clip(ar1, ar2, ar3) + with pytest.raises(ValueError): + dpt.clip(ar1, min=ar3, max=None) + with pytest.raises(ValueError): + dpt.clip(ar1, min=None, max=ar3) + + +def test_clip_empty(): + get_queue_or_skip() + + x = dpt.empty((2, 0, 3), dtype="i4") + a_min = dpt.ones((2, 0, 3), dtype="i4") + a_max = dpt.ones((2, 0, 3), dtype="i4") + + r = dpt.clip(x, a_min, a_max) + assert r.size == 0 + assert r.shape == x.shape + + +def test_clip_python_scalars(): + get_queue_or_skip() + + arrs = [ + dpt.ones(1, dtype="?"), + dpt.ones(1, dtype="i4"), + dpt.ones(1, dtype="f4"), + dpt.ones(1, dtype="c8"), + ] + + 
+    py_zeros = [
+        False,
+        0,
+        0.0,
+        complex(0, 0),
+    ]
+
+    py_ones = [
+        True,
+        1,
+        1.0,
+        complex(1, 0),
+    ]
+
+    for zero, one, arr in zip(py_zeros, py_ones, arrs):
+        r = dpt.clip(arr, zero, one)
+        assert isinstance(r, dpt.usm_ndarray)
+        r = dpt.clip(arr, min=zero)
+        assert isinstance(r, dpt.usm_ndarray)
+
+
+def test_clip_in_place():
+    get_queue_or_skip()
+
+    x = dpt.arange(10, dtype="i4")
+    a_min = dpt.arange(1, 11, dtype="i4")
+    a_max = dpt.arange(2, 12, dtype="i4")
+    dpt.clip(x, a_min, a_max, out=x)
+    assert dpt.all(x == a_min)
+
+    x = dpt.arange(10, dtype="i4")
+    dpt.clip(x, min=a_min, max=None, out=x)
+    assert dpt.all(x == a_min)
+
+    x = dpt.arange(10, dtype="i4")
+    dpt.clip(x, a_min, a_max, out=a_max)
+    assert dpt.all(a_max == a_min)
+
+    a_min = dpt.arange(1, 11, dtype="i4")
+    dpt.clip(x, min=a_min, max=None, out=a_min[::-1])
+    assert dpt.all((x + 1)[::-1] == a_min)
+
+
+def test_clip_special_cases():
+    get_queue_or_skip()
+
+    x = dpt.arange(10, dtype="f4")
+    r = dpt.clip(x, -dpt.inf, dpt.inf)
+    assert dpt.all(r == x)
+    r = dpt.clip(x, dpt.nan, dpt.inf)
+    assert dpt.all(dpt.isnan(r))
+    r = dpt.clip(x, -dpt.inf, dpt.nan)
+    assert dpt.all(dpt.isnan(r))
+
+
+def test_clip_out_need_temporary():
+    get_queue_or_skip()
+
+    x = dpt.ones(10, dtype="i4")
+    a_min = dpt.asarray(2, dtype="i4")
+    a_max = dpt.asarray(3, dtype="i4")
+    dpt.clip(x[:6], a_min, a_max, out=x[-6:])
+    assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2)
+
+    x = dpt.ones(10, dtype="i4")
+    a_min = dpt.asarray(2, dtype="i4")
+    a_max = dpt.asarray(3, dtype="i2")
+    dpt.clip(x[:6], a_min, a_max, out=x[-6:])
+    assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2)
+
+    x = dpt.ones(10, dtype="i4")
+    a_min = dpt.asarray(2, dtype="i2")
+    a_max = dpt.asarray(3, dtype="i4")
+    dpt.clip(x[:6], a_min, a_max, out=x[-6:])
+    assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2)
+
+    x = dpt.ones(10, dtype="i4")
+    a_min = dpt.asarray(2, dtype="i2")
+    a_max = dpt.asarray(3, dtype="i1")
+    dpt.clip(x[:6], a_min, a_max, out=x[-6:])
+    assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2)
+
+    x = dpt.full(6, 3, dtype="i4")
+    a_min = dpt.full(10, 2, dtype="i4")
+    a_max = dpt.asarray(4, dtype="i4")
+    dpt.clip(x, min=a_min[:6], max=a_max, out=a_min[-6:])
+    assert dpt.all(a_min[:-6] == 2) and dpt.all(a_min[-6:] == 3)
+
+    x = dpt.full(6, 3, dtype="i4")
+    a_min = dpt.full(10, 2, dtype="i4")
+    a_max = dpt.asarray(4, dtype="i2")
+    dpt.clip(x, min=a_min[:6], max=a_max, out=a_min[-6:])
+    assert dpt.all(a_min[:-6] == 2) and dpt.all(a_min[-6:] == 3)
+
+
+def test_clip_out_need_temporary_none():
+    get_queue_or_skip()
+
+    x = dpt.full(6, 3, dtype="i4")
+    # with min/max == None
+    a_min = dpt.full(10, 2, dtype="i4")
+    dpt.clip(x, min=a_min[:6], max=None, out=a_min[-6:])
+    assert dpt.all(a_min[:-6] == 2) and dpt.all(a_min[-6:] == 3)
+
+
+def test_clip_arg_validation():
+    get_queue_or_skip()
+
+    check = dict()
+    x1 = dpt.empty((1,), dtype="i4")
+    x2 = dpt.empty((1,), dtype="i4")
+
+    with pytest.raises(TypeError):
+        dpt.clip(check, x1, x2)
+
+
+@pytest.mark.parametrize(
+    "dt1,dt2", [("i4", "i4"), ("i4", "i2"), ("i2", "i4"), ("i1", "i2")]
+)
+def test_clip_order(dt1, dt2):
+    get_queue_or_skip()
+
+    test_shape = (
+        20,
+        20,
+    )
+    test_shape2 = tuple(2 * dim for dim in test_shape)
+    n = test_shape[-1]
+
+    ar1 = dpt.ones(test_shape, dtype="i4", order="C")
+    ar2 = dpt.ones(test_shape, dtype=dt1, order="C")
+    ar3 = dpt.ones(test_shape, dtype=dt2, order="C")
+    r1 = dpt.clip(ar1, ar2, ar3, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.clip(ar1, ar2, ar3, order="F")
+    assert r2.flags.f_contiguous
+ r3 = dpt.clip(ar1, ar2, ar3, order="A") + assert r3.flags.c_contiguous + r4 = dpt.clip(ar1, ar2, ar3, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones(test_shape, dtype="i4", order="F") + ar2 = dpt.ones(test_shape, dtype=dt1, order="F") + ar3 = dpt.ones(test_shape, dtype=dt2, order="F") + r1 = dpt.clip(ar1, ar2, ar3, order="C") + assert r1.flags.c_contiguous + r2 = dpt.clip(ar1, ar2, ar3, order="F") + assert r2.flags.f_contiguous + r3 = dpt.clip(ar1, ar2, ar3, order="A") + assert r3.flags.f_contiguous + r4 = dpt.clip(ar1, ar2, ar3, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2] + ar3 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2] + r4 = dpt.clip(ar1, ar2, ar3, order="K") + assert r4.strides == (n, -1) + r5 = dpt.clip(ar1, ar2, ar3, order="C") + assert r5.strides == (n, 1) + + ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2].mT + ar3 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2].mT + r4 = dpt.clip(ar1, ar2, ar3, order="K") + assert r4.strides == (-1, n) + r5 = dpt.clip(ar1, ar2, ar3, order="C") + assert r5.strides == (n, 1) + + +@pytest.mark.parametrize("dt", ["i4", "i2"]) +def test_clip_none_order(dt): + get_queue_or_skip() + + test_shape = ( + 20, + 20, + ) + test_shape2 = tuple(2 * dim for dim in test_shape) + n = test_shape[-1] + + ar1 = dpt.ones(test_shape, dtype="i4", order="C") + ar2 = dpt.ones(test_shape, dtype=dt, order="C") + + r1 = dpt.clip(ar1, min=None, max=ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.clip(ar1, min=None, max=ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.clip(ar1, min=None, max=ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.clip(ar1, min=None, max=ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones(test_shape, dtype="i4", order="F") + ar2 = dpt.ones(test_shape, dtype=dt, order="F") + + r1 = dpt.clip(ar1, min=None, max=ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.clip(ar1, min=None, max=ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.clip(ar1, min=None, max=ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.clip(ar1, min=None, max=ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones(test_shape2, dtype=dt, order="C")[:20, ::-2] + + r4 = dpt.clip(ar1, min=None, max=ar2, order="K") + assert r4.strides == (n, -1) + r5 = dpt.clip(ar1, min=None, max=ar2, order="C") + assert r5.strides == (n, 1) + + ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones(test_shape2, dtype=dt, order="C")[:20, ::-2].mT + + r4 = dpt.clip(ar1, min=None, max=ar2, order="K") + assert r4.strides == (-1, n) + r5 = dpt.clip(ar1, min=None, max=ar2, order="C") + assert r5.strides == (n, 1) + + +@pytest.mark.parametrize("usm_type1", _usm_types) +@pytest.mark.parametrize("usm_type2", _usm_types) +@pytest.mark.parametrize("usm_type3", _usm_types) +def test_clip_usm_type_matrix(usm_type1, usm_type2, usm_type3): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=usm_type1) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=usm_type2) + ar3 = dpt.ones_like(ar1, dtype="i4", usm_type=usm_type3) + + r = dpt.clip(ar1, ar2, ar3) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpctl.utils.get_coerced_usm_type( + (usm_type1, usm_type2, usm_type3) + ) + assert 
r.usm_type == expected_usm_type + + +@pytest.mark.parametrize("usm_type1", _usm_types) +@pytest.mark.parametrize("usm_type2", _usm_types) +def test_clip_usm_type_matrix_none_arg(usm_type1, usm_type2): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=usm_type1) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=usm_type2) + + r = dpt.clip(ar1, min=ar2, max=None) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpctl.utils.get_coerced_usm_type((usm_type1, usm_type2)) + assert r.usm_type == expected_usm_type + + +def test_clip_dtype_error(): + get_queue_or_skip() + + ar1 = dpt.ones(1, dtype="i4") + ar2 = dpt.ones(1, dtype="i4") + ar3 = dpt.ones(1, dtype="i4") + ar4 = dpt.empty_like(ar1, dtype="f4") + + assert_raises_regex( + ValueError, + "Output array of type.*is needed", + dpt.clip, + ar1, + ar2, + ar3, + ar4, + ) + assert_raises_regex( + ValueError, + "Output array of type.*is needed", + dpt.clip, + ar1, + ar2, + None, + ar4, + ) + + +def test_clip_errors(): + get_queue_or_skip() + try: + gpu_queue = dpctl.SyclQueue("gpu") + except dpctl.SyclQueueCreationError: + pytest.skip("SyclQueue('gpu') failed, skipping") + try: + cpu_queue = dpctl.SyclQueue("cpu") + except dpctl.SyclQueueCreationError: + pytest.skip("SyclQueue('cpu') failed, skipping") + + ar1 = dpt.ones(2, dtype="float32", sycl_queue=gpu_queue) + ar2 = dpt.ones_like(ar1, sycl_queue=gpu_queue) + ar3 = dpt.ones_like(ar1, sycl_queue=gpu_queue) + ar4 = dpt.empty_like(ar1, sycl_queue=cpu_queue) + assert_raises_regex( + ExecutionPlacementError, + "Input and output allocation queues are not compatible", + dpt.clip, + ar1, + ar2, + ar3, + ar4, + ) + + assert_raises_regex( + ExecutionPlacementError, + "Input and output allocation queues are not compatible", + dpt.clip, + ar1, + None, + ar3, + ar4, + ) + + assert_raises_regex( + ExecutionPlacementError, + "Execution placement can not be unambiguously inferred from input " + "arguments.", + dpt.clip, + ar1, + ar4, + ar2, + ar3, + ) + + assert_raises_regex( + ExecutionPlacementError, + "Execution placement can not be unambiguously inferred from input " + "arguments.", + dpt.clip, + ar1, + ar4, + 1, + ar3, + ) + + assert_raises_regex( + ExecutionPlacementError, + "Execution placement can not be unambiguously inferred from input " + "arguments.", + dpt.clip, + ar1, + 1, + ar4, + ar3, + ) + + assert_raises_regex( + ExecutionPlacementError, + "Execution placement can not be unambiguously inferred from input " + "arguments.", + dpt.clip, + ar1, + ar4, + None, + ar2, + ) + + ar1 = dpt.ones(2, dtype="float32") + ar2 = dpt.ones_like(ar1, dtype="float32") + ar3 = dpt.ones_like(ar1, dtype="float32") + ar4 = dpt.empty(3, dtype="float32") + assert_raises_regex( + ValueError, + "The shape of input and output arrays are inconsistent", + dpt.clip, + ar1, + ar2, + ar3, + ar4, + ) + + assert_raises_regex( + ValueError, + "The shape of input and output arrays are inconsistent", + dpt.clip, + ar1, + ar2, + None, + ar4, + ) + + ar1 = np.ones(2, dtype="f4") + ar2 = dpt.ones(2, dtype="f4") + ar3 = dpt.ones(2, dtype="f4") + assert_raises_regex( + TypeError, + "Expected `x` to be of dpctl.tensor.usm_ndarray type*", + dpt.clip, + ar1, + ar2, + ar3, + ) + + ar1 = dpt.ones(2, dtype="i4") + ar2 = dpt.ones_like(ar1, dtype="i4") + ar3 = dpt.ones_like(ar1, dtype="i4") + ar4 = np.empty_like(ar1) + assert_raises_regex( + TypeError, + "output array must be of usm_ndarray type", + dpt.clip, + ar1, + ar2, + ar3, + ar4, + ) + + assert_raises_regex( + TypeError, + "output array must be of 
usm_ndarray type", + dpt.clip, + ar1, + ar2, + None, + ar4, + ) + + +def test_clip_out_type_check(): + get_queue_or_skip() + + x1 = dpt.ones(10) + x2 = dpt.ones(10) + x3 = dpt.ones(10) + + out = range(10) + + with pytest.raises(TypeError): + dpt.clip(x1, x2, x3, out=out) + + +@pytest.mark.parametrize("dt", ["i4", "f4", "c8"]) +def test_clip_basic(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + sz = 1026 + x = dpt.arange(sz, dtype=dt, sycl_queue=q) + r = dpt.clip(x, min=100, max=500) + expected = dpt.arange(sz, dtype=dt, sycl_queue=q) + expected[:100] = 100 + expected[500:] = 500 + assert dpt.all(expected == r) + + x = dpt.zeros(sz, dtype=dt, sycl_queue=q) + a_max = dpt.full(sz, -1, dtype=dt, sycl_queue=q) + a_max[::2] = -2 + r = dpt.clip(x, min=-3, max=a_max) + assert dpt.all(a_max == r) + + +@pytest.mark.parametrize("dt", ["i4", "f4", "c8"]) +def test_clip_strided(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + sz = 2 * 1026 + x = dpt.arange(sz, dtype=dt, sycl_queue=q)[::-2] + r = dpt.clip(x, min=100, max=500) + expected = dpt.arange(sz, dtype=dt, sycl_queue=q) + expected[:100] = 100 + expected[500:] = 500 + expected = expected[::-2] + assert dpt.all(expected == r) + + x = dpt.zeros(sz, dtype=dt, sycl_queue=q)[::-2] + a_max = dpt.full(sz, -1, dtype=dt, sycl_queue=q) + a_max[::2] = -2 + a_max = a_max[::-2] + r = dpt.clip(x, min=-3, max=a_max) + assert dpt.all(a_max == r) + + +def test_clip_max_less_than_min(): + get_queue_or_skip() + + x = dpt.ones(10, dtype="i4") + res = dpt.clip(x, 5, 0) + assert dpt.all(res == 0) + + +def test_clip_minmax_weak_types(): + get_queue_or_skip() + + x = dpt.zeros(10, dtype=dpt.bool) + min_list = [False, 0, 0.0, 0.0 + 0.0j] + max_list = [True, 1, 1.0, 1.0 + 0.0j] + for min_v, max_v in zip(min_list, max_list): + if isinstance(min_v, bool) and isinstance(max_v, bool): + y = dpt.clip(x, min_v, max_v) + assert isinstance(y, dpt.usm_ndarray) + else: + with pytest.raises(ValueError): + dpt.clip(x, min_v, max_v) + + +def test_clip_max_weak_types(): + get_queue_or_skip() + + x = dpt.zeros(10, dtype="i4") + m = dpt.ones(10, dtype="i4") + + with pytest.raises(ValueError): + dpt.clip(x, m, 2.5) + + with pytest.raises(ValueError): + dpt.clip(x, 2.5, m)
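
Usage sketch (editorial note, not part of the diff): the new `dpctl.tensor.clip` entry point exercised above accepts array or Python-scalar bounds, one-sided limits, and a preallocated `out` array; a minimal example, assuming a default SYCL device is available:

    import dpctl.tensor as dpt

    x = dpt.arange(10, dtype="i4")
    y = dpt.clip(x, 2, 7)              # two-sided clamp to [2, 7]
    z = dpt.clip(x, min=None, max=7)   # one-sided; implemented via ti._minimum
    out = dpt.empty_like(x)
    dpt.clip(x, 2, 7, out=out)         # result written into `out`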