From bab357114b8de7f911b9f54667ced67c9a0ca0fc Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Mon, 9 Sep 2024 20:06:54 -0700
Subject: [PATCH 01/10] Implements `BinaryElementwiseFunc._inplace_op` method

Introduces the `_inplace_op` method, which gives in-place operators casting behavior equivalent to `"same_kind"`

Extends `"same_kind"` casting to `__imatmul__` as well by passing the already-implemented `dtype` keyword to `matmul`
---
 dpctl/tensor/_elementwise_common.py | 130 +++++++++++++++++++++++++++-
 dpctl/tensor/_type_utils.py         |  16 ++++
 dpctl/tensor/_usmarray.pyx          |  26 +++---
 3 files changed, 157 insertions(+), 15 deletions(-)

diff --git a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py
index 6b38444902..991c7ca303 100644
--- a/dpctl/tensor/_elementwise_common.py
+++ b/dpctl/tensor/_elementwise_common.py
@@ -37,6 +37,7 @@
     _all_data_types,
     _find_buf_dtype,
     _find_buf_dtype2,
+    _find_buf_dtype_in_place_op,
     _resolve_weak_types,
     _to_device_supported_dtype,
 )
@@ -213,7 +214,7 @@ def __call__(self, x, /, *, out=None, order="K"):
 
             if res_dt != out.dtype:
                 raise ValueError(
-                    f"Output array of type {res_dt} is needed,"
+                    f"Output array of type {res_dt} is needed, "
                     f" got {out.dtype}"
                 )
 
@@ -650,7 +651,7 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
 
             if res_dt != out.dtype:
                 raise ValueError(
-                    f"Output array of type {res_dt} is needed,"
+                    f"Output array of type {res_dt} is needed, "
                     f"got {out.dtype}"
                 )
 
@@ -927,3 +928,128 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
         )
         _manager.add_event_pair(ht_, bf_ev)
         return out
+
+    def _inplace_op(self, o1, o2):
+        if not isinstance(o1, dpt.usm_ndarray):
+            raise TypeError(
+                "Expected first argument to be "
+                f"dpctl.tensor.usm_ndarray, got {type(o1)}"
+            )
+        if not o1.flags.writable:
+            raise ValueError("provided left-hand side array is read-only")
+        q1, o1_usm_type = o1.sycl_queue, o1.usm_type
+        q2, o2_usm_type = _get_queue_usm_type(o2)
+        if q2 is None:
+            exec_q = q1
+            res_usm_type = o1_usm_type
+        else:
+            exec_q = dpctl.utils.get_execution_queue((q1, q2))
+            if exec_q is None:
+                raise ExecutionPlacementError(
+                    "Execution placement can not be unambiguously inferred "
+                    "from input arguments."
+                )
+            res_usm_type = dpctl.utils.get_coerced_usm_type(
+                (
+                    o1_usm_type,
+                    o2_usm_type,
+                )
+            )
+        dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
+        o1_shape = o1.shape
+        o2_shape = _get_shape(o2)
+        if not isinstance(o2_shape, (tuple, list)):
+            raise TypeError(
+                "Shape of second argument can not be inferred. "
+                "Expected list or tuple."
+            )
+        try:
+            res_shape = _broadcast_shape_impl(
+                [
+                    o1_shape,
+                    o2_shape,
+                ]
+            )
+        except ValueError:
+            raise ValueError(
+                "operands could not be broadcast together with shapes "
+                f"{o1_shape} and {o2_shape}"
+            )
+        if res_shape != o1_shape:
+            raise ValueError("")
+        sycl_dev = exec_q.sycl_device
+        o1_dtype = o1.dtype
+        o2_dtype = _get_dtype(o2, sycl_dev)
+        if not _validate_dtype(o2_dtype):
+            raise ValueError("Operand has an unsupported data type")
+
+        o1_dtype, o2_dtype = self.weak_type_resolver_(
+            o1_dtype, o2_dtype, sycl_dev
+        )
+
+        buf_dt, res_dt = _find_buf_dtype_in_place_op(
+            o1_dtype,
+            o2_dtype,
+            self.result_type_resolver_fn_,
+            sycl_dev,
+        )
+
+        if res_dt is None:
+            raise ValueError(
+                f"function '{self.name_}' does not support input types "
+                f"({o1_dtype}, {o2_dtype}), "
+                "and the inputs could not be safely coerced to any "
+                "supported types according to the casting rule ''same_kind''."
+            )
+
+        if res_dt != o1_dtype:
+            raise ValueError(
+                f"Output array of type {res_dt} is needed, " f"got {o1_dtype}"
+            )
+
+        _manager = SequentialOrderManager[exec_q]
+        if isinstance(o2, dpt.usm_ndarray):
+            src2 = o2
+            if (
+                ti._array_overlap(o2, o1)
+                and not ti._same_logical_tensors(o2, o1)
+                and buf_dt is None
+            ):
+                buf_dt = o2_dtype
+        else:
+            src2 = dpt.asarray(o2, dtype=o2_dtype, sycl_queue=exec_q)
+        if buf_dt is None:
+            if src2.shape != res_shape:
+                src2 = dpt.broadcast_to(src2, res_shape)
+            dep_evs = _manager.submitted_events
+            ht_, comp_ev = self.binary_inplace_fn_(
+                lhs=o1,
+                rhs=src2,
+                sycl_queue=exec_q,
+                depends=dep_evs,
+            )
+            _manager.add_event_pair(ht_, comp_ev)
+        else:
+            buf = dpt.empty_like(src2, dtype=buf_dt)
+            dep_evs = _manager.submitted_events
+            (
+                ht_copy_ev,
+                copy_ev,
+            ) = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=src2,
+                dst=buf,
+                sycl_queue=exec_q,
+                depends=dep_evs,
+            )
+            _manager.add_event_pair(ht_copy_ev, copy_ev)
+
+            buf = dpt.broadcast_to(buf, res_shape)
+            ht_, bf_ev = self.binary_inplace_fn_(
+                lhs=o1,
+                rhs=buf,
+                sycl_queue=exec_q,
+                depends=[copy_ev],
+            )
+            _manager.add_event_pair(ht_, bf_ev)
+
+        return o1
diff --git a/dpctl/tensor/_type_utils.py b/dpctl/tensor/_type_utils.py
index 890af46339..5defd154df 100644
--- a/dpctl/tensor/_type_utils.py
+++ b/dpctl/tensor/_type_utils.py
@@ -277,6 +277,21 @@ def _find_buf_dtype2(arg1_dtype, arg2_dtype, query_fn, sycl_dev, acceptance_fn):
     return None, None, None
 
 
+def _find_buf_dtype_in_place_op(arg1_dtype, arg2_dtype, query_fn, sycl_dev):
+    res_dt = query_fn(arg1_dtype, arg2_dtype)
+    if res_dt:
+        return None, res_dt
+
+    _fp16 = sycl_dev.has_aspect_fp16
+    _fp64 = sycl_dev.has_aspect_fp64
+    if _can_cast(arg2_dtype, arg1_dtype, _fp16, _fp64, casting="same_kind"):
+        res_dt = query_fn(arg1_dtype, arg1_dtype)
+        if res_dt:
+            return arg1_dtype, res_dt
+
+    return None, None
+
+
 class WeakBooleanType:
     "Python type representing type of Python boolean objects"
 
@@ -959,4 +974,5 @@ def _default_accumulation_dtype_fp_types(inp_dt, q):
     "WeakComplexType",
     "_default_accumulation_dtype",
     "_default_accumulation_dtype_fp_types",
+    "_find_buf_dtype_in_place_op",
 ]
diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx
index a46f3f763f..e879424036 100644
--- a/dpctl/tensor/_usmarray.pyx
+++ b/dpctl/tensor/_usmarray.pyx
@@ -1508,43 +1508,43 @@ cdef class usm_ndarray:
         return dpctl.tensor.bitwise_xor(other, self)
 
     def __iadd__(self, other):
-        return dpctl.tensor.add(self, other, out=self)
+        return dpctl.tensor.add._inplace_op(self, other)
 
     def __iand__(self, other):
-        return dpctl.tensor.bitwise_and(self, other, out=self)
+        return dpctl.tensor.bitwise_and._inplace_op(self, other)
 
     def __ifloordiv__(self, other):
-        return dpctl.tensor.floor_divide(self, other, out=self)
+        return dpctl.tensor.floor_divide._inplace_op(self, other)
 
     def __ilshift__(self, other):
-        return dpctl.tensor.bitwise_left_shift(self, other, out=self)
+        return dpctl.tensor.bitwise_left_shift._inplace_op(self, other)
 
     def __imatmul__(self, other):
-        return dpctl.tensor.matmul(self, other, out=self)
+        return dpctl.tensor.matmul(self, other, out=self, dtype=self.dtype)
 
     def __imod__(self, other):
-        return dpctl.tensor.remainder(self, other, out=self)
+        return dpctl.tensor.remainder._inplace_op(self, other)
 
     def __imul__(self, other):
-        return dpctl.tensor.multiply(self, other, out=self)
+        return dpctl.tensor.multiply._inplace_op(self, other)
 
     def __ior__(self, other):
-        return dpctl.tensor.bitwise_or(self, other, out=self)
+        return dpctl.tensor.bitwise_or._inplace_op(self, other)
 
     def __ipow__(self, other):
-        return dpctl.tensor.pow(self, other, out=self)
+        return dpctl.tensor.pow._inplace_op(self, other)
 
     def __irshift__(self, other):
-        return dpctl.tensor.bitwise_right_shift(self, other, out=self)
+        return dpctl.tensor.bitwise_right_shift._inplace_op(self, other)
 
     def __isub__(self, other):
-        return dpctl.tensor.subtract(self, other, out=self)
+        return dpctl.tensor.subtract._inplace_op(self, other)
 
     def __itruediv__(self, other):
-        return dpctl.tensor.divide(self, other, out=self)
+        return dpctl.tensor.divide._inplace_op(self, other)
 
     def __ixor__(self, other):
-        return dpctl.tensor.bitwise_xor(self, other, out=self)
+        return dpctl.tensor.bitwise_xor._inplace_op(self, other)
 
     def __str__(self):
         return usm_ndarray_str(self)

From 79208c8dcedc78ca27d8289ec51d1dafcf4574a7 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Mon, 9 Sep 2024 23:48:36 -0700
Subject: [PATCH 02/10] Adjusts tests for in-place element-wise operations to
 account for `"same_kind"` casting

---
 dpctl/tests/elementwise/test_add.py                | 2 +-
 dpctl/tests/elementwise/test_bitwise_and.py        | 2 +-
 dpctl/tests/elementwise/test_bitwise_left_shift.py | 2 +-
 dpctl/tests/elementwise/test_bitwise_or.py         | 2 +-
 dpctl/tests/elementwise/test_bitwise_xor.py        | 2 +-
 dpctl/tests/elementwise/test_divide.py             | 4 ++--
 dpctl/tests/elementwise/test_floor_divide.py       | 2 +-
 dpctl/tests/elementwise/test_multiply.py           | 2 +-
 dpctl/tests/elementwise/test_pow.py                | 2 +-
 dpctl/tests/elementwise/test_remainder.py          | 2 +-
 dpctl/tests/elementwise/test_subtract.py           | 2 +-
 11 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/dpctl/tests/elementwise/test_add.py b/dpctl/tests/elementwise/test_add.py
index 9edc8399e8..c02a2b126a 100644
--- a/dpctl/tests/elementwise/test_add.py
+++ b/dpctl/tests/elementwise/test_add.py
@@ -358,7 +358,7 @@ def test_add_inplace_dtype_matrix(op1_dtype, op2_dtype):
     dev = q.sycl_device
     _fp16 = dev.has_aspect_fp16
     _fp64 = dev.has_aspect_fp64
-    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64):
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
         ar1 += ar2
         assert (
             dpt.asnumpy(ar1) == np.full(ar1.shape, 2, dtype=ar1.dtype)
diff --git a/dpctl/tests/elementwise/test_bitwise_and.py b/dpctl/tests/elementwise/test_bitwise_and.py
index 767323eb6e..97d95e27f8 100644
--- a/dpctl/tests/elementwise/test_bitwise_and.py
+++ b/dpctl/tests/elementwise/test_bitwise_and.py
@@ -114,7 +114,7 @@ def test_bitwise_and_inplace_dtype_matrix(op1_dtype, op2_dtype):
     dev = q.sycl_device
     _fp16 = dev.has_aspect_fp16
     _fp64 = dev.has_aspect_fp64
-    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64):
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
         ar1 &= ar2
         assert dpt.all(ar1 == 1)
 
diff --git a/dpctl/tests/elementwise/test_bitwise_left_shift.py b/dpctl/tests/elementwise/test_bitwise_left_shift.py
index e2e3538176..0d1eab4575 100644
--- a/dpctl/tests/elementwise/test_bitwise_left_shift.py
+++ b/dpctl/tests/elementwise/test_bitwise_left_shift.py
@@ -122,7 +122,7 @@ def test_bitwise_left_shift_inplace_dtype_matrix(op1_dtype, op2_dtype):
     dev = q.sycl_device
     _fp16 = dev.has_aspect_fp16
     _fp64 = dev.has_aspect_fp64
-    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64):
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
         ar1 <<= ar2
         assert dpt.all(ar1 == 2)
 
diff --git a/dpctl/tests/elementwise/test_bitwise_or.py b/dpctl/tests/elementwise/test_bitwise_or.py
index 2cfa2af6f6..c854512436 100644
--- a/dpctl/tests/elementwise/test_bitwise_or.py
+++ b/dpctl/tests/elementwise/test_bitwise_or.py
@@ -114,7 +114,7 @@ def test_bitwise_or_inplace_dtype_matrix(op1_dtype, op2_dtype):
     dev = q.sycl_device
     _fp16 = dev.has_aspect_fp16
     _fp64 = dev.has_aspect_fp64
-    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64):
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
         ar1 |= ar2
         assert dpt.all(ar1 == 1)
 
diff --git a/dpctl/tests/elementwise/test_bitwise_xor.py b/dpctl/tests/elementwise/test_bitwise_xor.py
index eca4da853f..d64bfa0186 100644
--- a/dpctl/tests/elementwise/test_bitwise_xor.py
+++ b/dpctl/tests/elementwise/test_bitwise_xor.py
@@ -114,7 +114,7 @@ def test_bitwise_xor_inplace_dtype_matrix(op1_dtype, op2_dtype):
     dev = q.sycl_device
     _fp16 = dev.has_aspect_fp16
     _fp64 = dev.has_aspect_fp64
-    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64):
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
         ar1 ^= ar2
         assert dpt.all(ar1 == 0)
 
diff --git a/dpctl/tests/elementwise/test_divide.py b/dpctl/tests/elementwise/test_divide.py
index d6b7d15201..16d73a040c 100644
--- a/dpctl/tests/elementwise/test_divide.py
+++ b/dpctl/tests/elementwise/test_divide.py
@@ -226,7 +226,7 @@ def test_divide_inplace_dtype_matrix(op1_dtype, op2_dtype):
     _fp64 = dev.has_aspect_fp64
     # out array only valid if it is inexact
     if (
-        _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64)
+        _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind")
         and dpt.dtype(op1_dtype).kind in "fc"
     ):
         ar1 /= ar2
@@ -276,7 +276,7 @@ def test_divide_gh_1711():
 
 
 # don't test for overflowing double as Python won't cast
-# an Python integer of that size to a Python float
+# a Python integer of that size to a Python float
 @pytest.mark.parametrize("fp_dt", [dpt.float16, dpt.float32])
 def test_divide_by_scalar_overflow(fp_dt):
     q = get_queue_or_skip()
diff --git a/dpctl/tests/elementwise/test_floor_divide.py b/dpctl/tests/elementwise/test_floor_divide.py
index c2f3415642..e96c95d6cd 100644
--- a/dpctl/tests/elementwise/test_floor_divide.py
+++ b/dpctl/tests/elementwise/test_floor_divide.py
@@ -290,7 +290,7 @@ def test_floor_divide_inplace_dtype_matrix(op1_dtype, op2_dtype):
     _fp16 = dev.has_aspect_fp16
     _fp64 = dev.has_aspect_fp64
     # out array only valid if it is inexact
-    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64):
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
         ar1 //= ar2
         assert dpt.all(ar1 == 1)
 
diff --git a/dpctl/tests/elementwise/test_multiply.py b/dpctl/tests/elementwise/test_multiply.py
index e15bd367b0..82c4dcd04d 100644
--- a/dpctl/tests/elementwise/test_multiply.py
+++ b/dpctl/tests/elementwise/test_multiply.py
@@ -205,7 +205,7 @@ def test_multiply_inplace_dtype_matrix(op1_dtype, op2_dtype):
     dev = q.sycl_device
     _fp16 = dev.has_aspect_fp16
     _fp64 = dev.has_aspect_fp64
-    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64):
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
         ar1 *= ar2
         assert (
             dpt.asnumpy(ar1) == np.full(ar1.shape, 1, dtype=ar1.dtype)
diff --git a/dpctl/tests/elementwise/test_pow.py b/dpctl/tests/elementwise/test_pow.py
index e298ed2347..0e8692df9a 100644
--- a/dpctl/tests/elementwise/test_pow.py
+++ b/dpctl/tests/elementwise/test_pow.py
@@ -183,7 +183,7 @@ def test_pow_inplace_dtype_matrix(op1_dtype, op2_dtype):
     dev = q.sycl_device
     _fp16 = dev.has_aspect_fp16
     _fp64 = dev.has_aspect_fp64
-    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64):
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
         ar1 **= ar2
         assert (
             dpt.asnumpy(ar1) == np.full(ar1.shape, 1, dtype=ar1.dtype)
diff --git a/dpctl/tests/elementwise/test_remainder.py b/dpctl/tests/elementwise/test_remainder.py
index ce1711074c..638384de1f 100644
--- a/dpctl/tests/elementwise/test_remainder.py
+++ b/dpctl/tests/elementwise/test_remainder.py
@@ -235,7 +235,7 @@ def test_remainder_inplace_dtype_matrix(op1_dtype, op2_dtype):
     dev = q.sycl_device
     _fp16 = dev.has_aspect_fp16
     _fp64 = dev.has_aspect_fp64
-    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64):
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
         ar1 %= ar2
         assert dpt.all(ar1 == dpt.zeros(ar1.shape, dtype=ar1.dtype))
 
diff --git a/dpctl/tests/elementwise/test_subtract.py b/dpctl/tests/elementwise/test_subtract.py
index 71647a7306..16c05a9dbe 100644
--- a/dpctl/tests/elementwise/test_subtract.py
+++ b/dpctl/tests/elementwise/test_subtract.py
@@ -208,7 +208,7 @@ def test_subtract_inplace_dtype_matrix(op1_dtype, op2_dtype):
     dev = q.sycl_device
     _fp16 = dev.has_aspect_fp16
     _fp64 = dev.has_aspect_fp64
-    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64):
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
         ar1 -= ar2
         assert (dpt.asnumpy(ar1) == np.zeros(ar1.shape, dtype=ar1.dtype)).all()
 

From f13c02d79e8323df2c57a43a57b84657ee395492 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Tue, 10 Sep 2024 10:52:55 -0700
Subject: [PATCH 03/10] `BinaryElementwiseFunc._inplace_op` now checks if a
 kernel is available

Raises `ValueError` otherwise
---
 dpctl/tensor/_elementwise_common.py | 232 ++++++++++++++--------------
 1 file changed, 120 insertions(+), 112 deletions(-)

diff --git a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py
index 991c7ca303..8c3ea49c58 100644
--- a/dpctl/tensor/_elementwise_common.py
+++ b/dpctl/tensor/_elementwise_common.py
@@ -930,126 +930,134 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
         return out
 
     def _inplace_op(self, o1, o2):
-        if not isinstance(o1, dpt.usm_ndarray):
-            raise TypeError(
-                "Expected first argument to be "
-                f"dpctl.tensor.usm_ndarray, got {type(o1)}"
-            )
-        if not o1.flags.writable:
-            raise ValueError("provided left-hand side array is read-only")
-        q1, o1_usm_type = o1.sycl_queue, o1.usm_type
-        q2, o2_usm_type = _get_queue_usm_type(o2)
-        if q2 is None:
-            exec_q = q1
-            res_usm_type = o1_usm_type
-        else:
-            exec_q = dpctl.utils.get_execution_queue((q1, q2))
-            if exec_q is None:
-                raise ExecutionPlacementError(
-                    "Execution placement can not be unambiguously inferred "
-                    "from input arguments."
+        if self.binary_inplace_fn_ is not None:
+            if not isinstance(o1, dpt.usm_ndarray):
+                raise TypeError(
+                    "Expected first argument to be "
+                    f"dpctl.tensor.usm_ndarray, got {type(o1)}"
                 )
-            res_usm_type = dpctl.utils.get_coerced_usm_type(
-                (
-                    o1_usm_type,
-                    o2_usm_type,
+            if not o1.flags.writable:
+                raise ValueError("provided left-hand side array is read-only")
+            q1, o1_usm_type = o1.sycl_queue, o1.usm_type
+            q2, o2_usm_type = _get_queue_usm_type(o2)
+            if q2 is None:
+                exec_q = q1
+                res_usm_type = o1_usm_type
+            else:
+                exec_q = dpctl.utils.get_execution_queue((q1, q2))
+                if exec_q is None:
+                    raise ExecutionPlacementError(
+                        "Execution placement can not be unambiguously inferred "
+                        "from input arguments."
+                    )
+                res_usm_type = dpctl.utils.get_coerced_usm_type(
+                    (
+                        o1_usm_type,
+                        o2_usm_type,
+                    )
                 )
+            dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
+            o1_shape = o1.shape
+            o2_shape = _get_shape(o2)
+            if not isinstance(o2_shape, (tuple, list)):
+                raise TypeError(
+                    "Shape of second argument can not be inferred. "
+                    "Expected list or tuple."
+                )
+            try:
+                res_shape = _broadcast_shape_impl(
+                    [
+                        o1_shape,
+                        o2_shape,
+                    ]
+                )
+            except ValueError:
+                raise ValueError(
+                    "operands could not be broadcast together with shapes "
+                    f"{o1_shape} and {o2_shape}"
+                )
+            if res_shape != o1_shape:
+                raise ValueError("")
+            sycl_dev = exec_q.sycl_device
+            o1_dtype = o1.dtype
+            o2_dtype = _get_dtype(o2, sycl_dev)
+            if not _validate_dtype(o2_dtype):
+                raise ValueError("Operand has an unsupported data type")
+
+            o1_dtype, o2_dtype = self.weak_type_resolver_(
+                o1_dtype, o2_dtype, sycl_dev
             )
-        dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
-        o1_shape = o1.shape
-        o2_shape = _get_shape(o2)
-        if not isinstance(o2_shape, (tuple, list)):
-            raise TypeError(
-                "Shape of second argument can not be inferred. "
-                "Expected list or tuple."
-            )
-        try:
-            res_shape = _broadcast_shape_impl(
-                [
-                    o1_shape,
-                    o2_shape,
-                ]
-            )
-        except ValueError:
-            raise ValueError(
-                "operands could not be broadcast together with shapes "
-                f"{o1_shape} and {o2_shape}"
+
+            buf_dt, res_dt = _find_buf_dtype_in_place_op(
+                o1_dtype,
+                o2_dtype,
+                self.result_type_resolver_fn_,
+                sycl_dev,
             )
-        if res_shape != o1_shape:
-            raise ValueError("")
-        sycl_dev = exec_q.sycl_device
-        o1_dtype = o1.dtype
-        o2_dtype = _get_dtype(o2, sycl_dev)
-        if not _validate_dtype(o2_dtype):
-            raise ValueError("Operand has an unsupported data type")
 
-        o1_dtype, o2_dtype = self.weak_type_resolver_(
-            o1_dtype, o2_dtype, sycl_dev
-        )
+            if res_dt is None:
+                raise ValueError(
+                    f"function '{self.name_}' does not support input types "
+                    f"({o1_dtype}, {o2_dtype}), "
+                    "and the inputs could not be safely coerced to any "
+                    "supported types according to the casting rule "
+                    "''same_kind''."
+                )
 
-        buf_dt, res_dt = _find_buf_dtype_in_place_op(
-            o1_dtype,
-            o2_dtype,
-            self.result_type_resolver_fn_,
-            sycl_dev,
-        )
+            if res_dt != o1_dtype:
+                raise ValueError(
+                    f"Output array of type {res_dt} is needed, "
+                    f"got {o1_dtype}"
+                )
 
-        if res_dt is None:
-            raise ValueError(
-                f"function '{self.name_}' does not support input types "
-                f"({o1_dtype}, {o2_dtype}), "
-                "and the inputs could not be safely coerced to any "
-                "supported types according to the casting rule ''same_kind''."
-            )
+            _manager = SequentialOrderManager[exec_q]
+            if isinstance(o2, dpt.usm_ndarray):
+                src2 = o2
+                if (
+                    ti._array_overlap(o2, o1)
+                    and not ti._same_logical_tensors(o2, o1)
+                    and buf_dt is None
+                ):
+                    buf_dt = o2_dtype
+            else:
+                src2 = dpt.asarray(o2, dtype=o2_dtype, sycl_queue=exec_q)
+            if buf_dt is None:
+                if src2.shape != res_shape:
+                    src2 = dpt.broadcast_to(src2, res_shape)
+                dep_evs = _manager.submitted_events
+                ht_, comp_ev = self.binary_inplace_fn_(
+                    lhs=o1,
+                    rhs=src2,
+                    sycl_queue=exec_q,
+                    depends=dep_evs,
+                )
+                _manager.add_event_pair(ht_, comp_ev)
+            else:
+                buf = dpt.empty_like(src2, dtype=buf_dt)
+                dep_evs = _manager.submitted_events
+                (
+                    ht_copy_ev,
+                    copy_ev,
+                ) = ti._copy_usm_ndarray_into_usm_ndarray(
+                    src=src2,
+                    dst=buf,
+                    sycl_queue=exec_q,
+                    depends=dep_evs,
+                )
+                _manager.add_event_pair(ht_copy_ev, copy_ev)
 
-        if res_dt != o1_dtype:
-            raise ValueError(
-                f"Output array of type {res_dt} is needed, " f"got {o1_dtype}"
-            )
+                buf = dpt.broadcast_to(buf, res_shape)
+                ht_, bf_ev = self.binary_inplace_fn_(
+                    lhs=o1,
+                    rhs=buf,
+                    sycl_queue=exec_q,
+                    depends=[copy_ev],
+                )
+                _manager.add_event_pair(ht_, bf_ev)
 
-        _manager = SequentialOrderManager[exec_q]
-        if isinstance(o2, dpt.usm_ndarray):
-            src2 = o2
-            if (
-                ti._array_overlap(o2, o1)
-                and not ti._same_logical_tensors(o2, o1)
-                and buf_dt is None
-            ):
-                buf_dt = o2_dtype
-        else:
-            src2 = dpt.asarray(o2, dtype=o2_dtype, sycl_queue=exec_q)
-        if buf_dt is None:
-            if src2.shape != res_shape:
-                src2 = dpt.broadcast_to(src2, res_shape)
-            dep_evs = _manager.submitted_events
-            ht_, comp_ev = self.binary_inplace_fn_(
-                lhs=o1,
-                rhs=src2,
-                sycl_queue=exec_q,
-                depends=dep_evs,
-            )
-            _manager.add_event_pair(ht_, comp_ev)
+            return o1
         else:
-            buf = dpt.empty_like(src2, dtype=buf_dt)
-            dep_evs = _manager.submitted_events
-            (
-                ht_copy_ev,
-                copy_ev,
-            ) = ti._copy_usm_ndarray_into_usm_ndarray(
-                src=src2,
-                dst=buf,
-                sycl_queue=exec_q,
-                depends=dep_evs,
-            )
-            _manager.add_event_pair(ht_copy_ev, copy_ev)
-
-            buf = dpt.broadcast_to(buf, res_shape)
-            ht_, bf_ev = self.binary_inplace_fn_(
-                lhs=o1,
-                rhs=buf,
-                sycl_queue=exec_q,
-                depends=[copy_ev],
+            raise ValueError(
+                "binary function does not have a dedicated in-place "
+                "implementation"
             )
-            _manager.add_event_pair(ht_, bf_ev)
-
-        return o1

From d7fba40375d68f8d89a2e375e95e256960fbd35e Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Tue, 10 Sep 2024 11:11:22 -0700
Subject: [PATCH 04/10] Adds message to `ValueError` for case of binary
 in-place operator where both inputs are broadcast to a new shape

---
 dpctl/tensor/_elementwise_common.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py
index 8c3ea49c58..87f4959de2 100644
--- a/dpctl/tensor/_elementwise_common.py
+++ b/dpctl/tensor/_elementwise_common.py
@@ -976,8 +976,14 @@ def _inplace_op(self, o1, o2):
                     "operands could not be broadcast together with shapes "
                     f"{o1_shape} and {o2_shape}"
                 )
+
             if res_shape != o1_shape:
-                raise ValueError("")
+                raise ValueError(
+                    "The shape of the non-broadcastable left-hand "
+                    f"side {o1_shape} is inconsistent with the "
+                    f"broadcast shape {res_shape}."
+                )
+
             sycl_dev = exec_q.sycl_device
             o1_dtype = o1.dtype
             o2_dtype = _get_dtype(o2, sycl_dev)

From 7f59819c777da7b882306cd3f41cbf5bbe772aa4 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Tue, 10 Sep 2024 15:11:53 -0700
Subject: [PATCH 05/10] Add more tests for element-wise in-place operators

Also clean up and make some tests for in-place operators more efficient
---
 dpctl/tests/elementwise/test_add.py           | 68 +++++++++++++++++--
 dpctl/tests/elementwise/test_bitwise_and.py   | 16 -----
 .../elementwise/test_bitwise_left_shift.py    | 16 -----
 .../elementwise/test_elementwise_classes.py   | 11 ++-
 dpctl/tests/elementwise/test_floor_divide.py  | 15 ----
 5 files changed, 71 insertions(+), 55 deletions(-)

diff --git a/dpctl/tests/elementwise/test_add.py b/dpctl/tests/elementwise/test_add.py
index c02a2b126a..e7838005b0 100644
--- a/dpctl/tests/elementwise/test_add.py
+++ b/dpctl/tests/elementwise/test_add.py
@@ -373,9 +373,25 @@ def test_add_inplace_dtype_matrix(op1_dtype, op2_dtype):
     else:
         with pytest.raises(ValueError):
             ar1 += ar2
+
+    ar1 = dpt.ones(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64):
+        dpt.add(ar1, ar2, out=ar1)
+        assert (
+            dpt.asnumpy(ar1) == np.full(ar1.shape, 2, dtype=ar1.dtype)
+        ).all()
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype)[::2]
+        dpt.add(ar3, ar4, out=ar3)
+        assert (
+            dpt.asnumpy(ar3) == np.full(ar3.shape, 2, dtype=ar3.dtype)
+        ).all()
+    else:
+        with pytest.raises(ValueError):
             dpt.add(ar1, ar2, out=ar1)
 
-    # out is second arg
     ar1 = dpt.ones(sz, dtype=op1_dtype)
     ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
     if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64):
@@ -401,7 +417,7 @@ def test_add_inplace_broadcasting():
     m = dpt.ones((100, 5), dtype="i4")
     v = dpt.arange(5, dtype="i4")
 
-    m += v
+    dpt.add(m, v, out=m)
     assert (dpt.asnumpy(m) == np.arange(1, 6, dtype="i4")[np.newaxis, :]).all()
 
     # check case where second arg is out
@@ -411,6 +427,26 @@ def test_add_inplace_broadcasting():
     ).all()
 
 
+def test_add_inplace_operator_broadcasting():
+    get_queue_or_skip()
+
+    m = dpt.ones((100, 5), dtype="i4")
+    v = dpt.arange(5, dtype="i4")
+
+    m += v
+    assert (dpt.asnumpy(m) == np.arange(1, 6, dtype="i4")[np.newaxis, :]).all()
+
+
+def test_add_inplace_operator_mutual_broadcast():
+    get_queue_or_skip()
+
+    x1 = dpt.ones((1, 10), dtype="i4")
+    x2 = dpt.ones((10, 1), dtype="i4")
+
+    with pytest.raises(ValueError):
+        dpt.add._inplace_op(x1, x2)
+
+
 def test_add_inplace_errors():
     get_queue_or_skip()
     try:
@@ -425,27 +461,45 @@ def test_add_inplace_errors():
     ar1 = dpt.ones(2, dtype="float32", sycl_queue=gpu_queue)
     ar2 = dpt.ones_like(ar1, sycl_queue=cpu_queue)
     with pytest.raises(ExecutionPlacementError):
-        ar1 += ar2
+        dpt.add(ar1, ar2, out=ar1)
 
     ar1 = dpt.ones(2, dtype="float32")
     ar2 = dpt.ones(3, dtype="float32")
     with pytest.raises(ValueError):
-        ar1 += ar2
+        dpt.add(ar1, ar2, out=ar1)
 
     ar1 = np.ones(2, dtype="float32")
     ar2 = dpt.ones(2, dtype="float32")
     with pytest.raises(TypeError):
-        ar1 += ar2
+        dpt.add(ar1, ar2, out=ar1)
 
     ar1 = dpt.ones(2, dtype="float32")
     ar2 = dict()
     with pytest.raises(ValueError):
-        ar1 += ar2
+        dpt.add(ar1, ar2, out=ar1)
 
     ar1 = dpt.ones((2, 1), dtype="float32")
     ar2 = dpt.ones((1, 2), dtype="float32")
     with pytest.raises(ValueError):
-        ar1 += ar2
+        dpt.add(ar1, ar2, out=ar1)
+
+
+def test_add_inplace_operator_errors():
+    q1 = get_queue_or_skip()
+    q2 = get_queue_or_skip()
+
+    x = dpt.ones(10, dtype="i4", sycl_queue=q1)
+    with pytest.raises(TypeError):
+        dpt.add._inplace_op(dict(), x)
+
+    x.flags["W"] = False
+    with pytest.raises(ValueError):
+        dpt.add._inplace_op(x, 2)
+
+    x_q1 = dpt.ones(10, dtype="i4", sycl_queue=q1)
+    x_q2 = dpt.ones(10, dtype="i4", sycl_queue=q2)
+    with pytest.raises(ExecutionPlacementError):
+        dpt.add._inplace_op(x_q1, x_q2)
 
 
 def test_add_inplace_same_tensors():
diff --git a/dpctl/tests/elementwise/test_bitwise_and.py b/dpctl/tests/elementwise/test_bitwise_and.py
index 97d95e27f8..f90e03ea29 100644
--- a/dpctl/tests/elementwise/test_bitwise_and.py
+++ b/dpctl/tests/elementwise/test_bitwise_and.py
@@ -125,19 +125,3 @@ def test_bitwise_and_inplace_dtype_matrix(op1_dtype, op2_dtype):
     else:
         with pytest.raises(ValueError):
             ar1 &= ar2
-            dpt.bitwise_and(ar1, ar2, out=ar1)
-
-    # out is second arg
-    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
-    if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64):
-        dpt.bitwise_and(ar1, ar2, out=ar2)
-        assert dpt.all(ar2 == 1)
-
-        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
-        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
-        dpt.bitwise_and(ar3, ar4, out=ar4)
-        dpt.all(ar4 == 1)
-    else:
-        with pytest.raises(ValueError):
-            dpt.bitwise_and(ar1, ar2, out=ar2)
diff --git a/dpctl/tests/elementwise/test_bitwise_left_shift.py b/dpctl/tests/elementwise/test_bitwise_left_shift.py
index 0d1eab4575..bd7aefe5af 100644
--- a/dpctl/tests/elementwise/test_bitwise_left_shift.py
+++ b/dpctl/tests/elementwise/test_bitwise_left_shift.py
@@ -133,19 +133,3 @@ def test_bitwise_left_shift_inplace_dtype_matrix(op1_dtype, op2_dtype):
     else:
         with pytest.raises(ValueError):
             ar1 <<= ar2
-            dpt.bitwise_left_shift(ar1, ar2, out=ar1)
-
-    # out is second arg
-    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
-    if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64):
-        dpt.bitwise_left_shift(ar1, ar2, out=ar2)
-        assert dpt.all(ar2 == 2)
-
-        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
-        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
-        dpt.bitwise_left_shift(ar3, ar4, out=ar4)
-        dpt.all(ar4 == 2)
-    else:
-        with pytest.raises(ValueError):
-            dpt.bitwise_left_shift(ar1, ar2, out=ar2)
diff --git a/dpctl/tests/elementwise/test_elementwise_classes.py b/dpctl/tests/elementwise/test_elementwise_classes.py
index ac6af2d417..fb220b811a 100644
--- a/dpctl/tests/elementwise/test_elementwise_classes.py
+++ b/dpctl/tests/elementwise/test_elementwise_classes.py
@@ -118,7 +118,7 @@ def test_binary_class_nout():
     assert nout == 1
 
 
-def test_biary_read_only_out():
+def test_binary_read_only_out():
     get_queue_or_skip()
     x1 = dpt.ones(32, dtype=dpt.float32)
     x2 = dpt.ones_like(x1)
@@ -126,3 +126,12 @@ def test_biary_read_only_out():
     r.flags["W"] = False
     with pytest.raises(ValueError):
         binary_fn(x1, x2, out=r)
+
+
+def test_binary_no_inplace_op():
+    get_queue_or_skip()
+    x1 = dpt.ones(10, dtype="i4")
+    x2 = dpt.ones_like(x1)
+
+    with pytest.raises(ValueError):
+        dpt.logaddexp._inplace_op(x1, x2)
diff --git a/dpctl/tests/elementwise/test_floor_divide.py b/dpctl/tests/elementwise/test_floor_divide.py
index e96c95d6cd..068a42f338 100644
--- a/dpctl/tests/elementwise/test_floor_divide.py
+++ b/dpctl/tests/elementwise/test_floor_divide.py
@@ -302,18 +302,3 @@ def test_floor_divide_inplace_dtype_matrix(op1_dtype, op2_dtype):
         with pytest.raises(ValueError):
             ar1 //= ar2
             dpt.floor_divide(ar1, ar2, out=ar1)
-
-    # out is second arg
-    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
-    if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64):
-        dpt.floor_divide(ar1, ar2, out=ar2)
-        assert dpt.all(ar2 == 1)
-
-        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
-        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
-        dpt.floor_divide(ar3, ar4, out=ar4)
-        dpt.all(ar4 == 1)
-    else:
-        with pytest.raises(ValueError):
-            dpt.floor_divide(ar1, ar2, out=ar2)

From 9529f0a8e8ba47fbf682d49559f54d41969b14cb Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Wed, 11 Sep 2024 08:01:37 -0700
Subject: [PATCH 06/10] Remove duplicate space in UnaryElementwiseFunc
 exception message per PR review

---
 dpctl/tensor/_elementwise_common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py
index 87f4959de2..c6372e6e5e 100644
--- a/dpctl/tensor/_elementwise_common.py
+++ b/dpctl/tensor/_elementwise_common.py
@@ -215,7 +215,7 @@ def __call__(self, x, /, *, out=None, order="K"):
             if res_dt != out.dtype:
                 raise ValueError(
                     f"Output array of type {res_dt} is needed, "
-                    f" got {out.dtype}"
+                    f"got {out.dtype}"
                 )
 
             if (

From 54dd6b622db0afea954162646f6017df003bd88f Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Wed, 11 Sep 2024 11:37:28 -0700
Subject: [PATCH 07/10] Use guard clause in `_inplace_op` per PR review by
 @oleksandr-pavlyk

---
 dpctl/tensor/_elementwise_common.py | 248 ++++++++++++++--------------
 1 file changed, 123 insertions(+), 125 deletions(-)

diff --git a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py
index c6372e6e5e..5b26eb9225 100644
--- a/dpctl/tensor/_elementwise_common.py
+++ b/dpctl/tensor/_elementwise_common.py
@@ -930,140 +930,138 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
         return out
 
     def _inplace_op(self, o1, o2):
-        if self.binary_inplace_fn_ is not None:
-            if not isinstance(o1, dpt.usm_ndarray):
-                raise TypeError(
-                    "Expected first argument to be "
-                    f"dpctl.tensor.usm_ndarray, got {type(o1)}"
-                )
-            if not o1.flags.writable:
-                raise ValueError("provided left-hand side array is read-only")
-            q1, o1_usm_type = o1.sycl_queue, o1.usm_type
-            q2, o2_usm_type = _get_queue_usm_type(o2)
-            if q2 is None:
-                exec_q = q1
-                res_usm_type = o1_usm_type
-            else:
-                exec_q = dpctl.utils.get_execution_queue((q1, q2))
-                if exec_q is None:
-                    raise ExecutionPlacementError(
-                        "Execution placement can not be unambiguously inferred "
-                        "from input arguments."
-                    )
-                res_usm_type = dpctl.utils.get_coerced_usm_type(
-                    (
-                        o1_usm_type,
-                        o2_usm_type,
-                    )
-                )
-            dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
-            o1_shape = o1.shape
-            o2_shape = _get_shape(o2)
-            if not isinstance(o2_shape, (tuple, list)):
-                raise TypeError(
-                    "Shape of second argument can not be inferred. "
-                    "Expected list or tuple."
-                )
-            try:
-                res_shape = _broadcast_shape_impl(
-                    [
-                        o1_shape,
-                        o2_shape,
-                    ]
-                )
-            except ValueError:
-                raise ValueError(
-                    "operands could not be broadcast together with shapes "
-                    f"{o1_shape} and {o2_shape}"
+        if self.binary_inplace_fn_ is None:
+            raise ValueError(
+                "binary function does not have a dedicated in-place "
+                "implementation"
+            )
+        if not isinstance(o1, dpt.usm_ndarray):
+            raise TypeError(
+                "Expected first argument to be "
+                f"dpctl.tensor.usm_ndarray, got {type(o1)}"
+            )
+        if not o1.flags.writable:
+            raise ValueError("provided left-hand side array is read-only")
+        q1, o1_usm_type = o1.sycl_queue, o1.usm_type
+        q2, o2_usm_type = _get_queue_usm_type(o2)
+        if q2 is None:
+            exec_q = q1
+            res_usm_type = o1_usm_type
+        else:
+            exec_q = dpctl.utils.get_execution_queue((q1, q2))
+            if exec_q is None:
+                raise ExecutionPlacementError(
+                    "Execution placement can not be unambiguously inferred "
+                    "from input arguments."
                 )
-
-            if res_shape != o1_shape:
-                raise ValueError(
-                    "The shape of the non-broadcastable left-hand "
-                    f"side {o1_shape} is inconsistent with the "
-                    f"broadcast shape {res_shape}."
+            res_usm_type = dpctl.utils.get_coerced_usm_type(
+                (
+                    o1_usm_type,
+                    o2_usm_type,
                 )
-
-            sycl_dev = exec_q.sycl_device
-            o1_dtype = o1.dtype
-            o2_dtype = _get_dtype(o2, sycl_dev)
-            if not _validate_dtype(o2_dtype):
-                raise ValueError("Operand has an unsupported data type")
-
-            o1_dtype, o2_dtype = self.weak_type_resolver_(
-                o1_dtype, o2_dtype, sycl_dev
+            )
+        dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
+        o1_shape = o1.shape
+        o2_shape = _get_shape(o2)
+        if not isinstance(o2_shape, (tuple, list)):
+            raise TypeError(
+                "Shape of second argument can not be inferred. "
+                "Expected list or tuple."
+            )
+        try:
+            res_shape = _broadcast_shape_impl(
+                [
+                    o1_shape,
+                    o2_shape,
+                ]
+            )
+        except ValueError:
+            raise ValueError(
+                "operands could not be broadcast together with shapes "
+                f"{o1_shape} and {o2_shape}"
             )
 
-            buf_dt, res_dt = _find_buf_dtype_in_place_op(
-                o1_dtype,
-                o2_dtype,
-                self.result_type_resolver_fn_,
-                sycl_dev,
+        if res_shape != o1_shape:
+            raise ValueError(
+                "The shape of the non-broadcastable left-hand "
+                f"side {o1_shape} is inconsistent with the "
+                f"broadcast shape {res_shape}."
             )
 
-            if res_dt is None:
-                raise ValueError(
-                    f"function '{self.name_}' does not support input types "
-                    f"({o1_dtype}, {o2_dtype}), "
-                    "and the inputs could not be safely coerced to any "
-                    "supported types according to the casting rule "
-                    "''same_kind''."
-                )
+        sycl_dev = exec_q.sycl_device
+        o1_dtype = o1.dtype
+        o2_dtype = _get_dtype(o2, sycl_dev)
+        if not _validate_dtype(o2_dtype):
+            raise ValueError("Operand has an unsupported data type")
 
-            if res_dt != o1_dtype:
-                raise ValueError(
-                    f"Output array of type {res_dt} is needed, "
-                    f"got {o1_dtype}"
-                )
+        o1_dtype, o2_dtype = self.weak_type_resolver_(
+            o1_dtype, o2_dtype, sycl_dev
+        )
 
-            _manager = SequentialOrderManager[exec_q]
-            if isinstance(o2, dpt.usm_ndarray):
-                src2 = o2
-                if (
-                    ti._array_overlap(o2, o1)
-                    and not ti._same_logical_tensors(o2, o1)
-                    and buf_dt is None
-                ):
-                    buf_dt = o2_dtype
-            else:
-                src2 = dpt.asarray(o2, dtype=o2_dtype, sycl_queue=exec_q)
-            if buf_dt is None:
-                if src2.shape != res_shape:
-                    src2 = dpt.broadcast_to(src2, res_shape)
-                dep_evs = _manager.submitted_events
-                ht_, comp_ev = self.binary_inplace_fn_(
-                    lhs=o1,
-                    rhs=src2,
-                    sycl_queue=exec_q,
-                    depends=dep_evs,
-                )
-                _manager.add_event_pair(ht_, comp_ev)
-            else:
-                buf = dpt.empty_like(src2, dtype=buf_dt)
-                dep_evs = _manager.submitted_events
-                (
-                    ht_copy_ev,
-                    copy_ev,
-                ) = ti._copy_usm_ndarray_into_usm_ndarray(
-                    src=src2,
-                    dst=buf,
-                    sycl_queue=exec_q,
-                    depends=dep_evs,
-                )
-                _manager.add_event_pair(ht_copy_ev, copy_ev)
+        buf_dt, res_dt = _find_buf_dtype_in_place_op(
+            o1_dtype,
+            o2_dtype,
+            self.result_type_resolver_fn_,
+            sycl_dev,
+        )
 
-                buf = dpt.broadcast_to(buf, res_shape)
-                ht_, bf_ev = self.binary_inplace_fn_(
-                    lhs=o1,
-                    rhs=buf,
-                    sycl_queue=exec_q,
-                    depends=[copy_ev],
-                )
-                _manager.add_event_pair(ht_, bf_ev)
+        if res_dt is None:
+            raise ValueError(
+                f"function '{self.name_}' does not support input types "
+                f"({o1_dtype}, {o2_dtype}), "
+                "and the inputs could not be safely coerced to any "
+                "supported types according to the casting rule "
+                "''same_kind''."
+            )
 
-            return o1
-        else:
+        if res_dt != o1_dtype:
             raise ValueError(
-                "binary function does not have a dedicated in-place "
-                "implementation"
+                f"Output array of type {res_dt} is needed, " f"got {o1_dtype}"
             )
+
+        _manager = SequentialOrderManager[exec_q]
+        if isinstance(o2, dpt.usm_ndarray):
+            src2 = o2
+            if (
+                ti._array_overlap(o2, o1)
+                and not ti._same_logical_tensors(o2, o1)
+                and buf_dt is None
+            ):
+                buf_dt = o2_dtype
+        else:
+            src2 = dpt.asarray(o2, dtype=o2_dtype, sycl_queue=exec_q)
+        if buf_dt is None:
+            if src2.shape != res_shape:
+                src2 = dpt.broadcast_to(src2, res_shape)
+            dep_evs = _manager.submitted_events
+            ht_, comp_ev = self.binary_inplace_fn_(
+                lhs=o1,
+                rhs=src2,
+                sycl_queue=exec_q,
+                depends=dep_evs,
+            )
+            _manager.add_event_pair(ht_, comp_ev)
+        else:
+            buf = dpt.empty_like(src2, dtype=buf_dt)
+            dep_evs = _manager.submitted_events
+            (
+                ht_copy_ev,
+                copy_ev,
+            ) = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=src2,
+                dst=buf,
+                sycl_queue=exec_q,
+                depends=dep_evs,
+            )
+            _manager.add_event_pair(ht_copy_ev, copy_ev)
+
+            buf = dpt.broadcast_to(buf, res_shape)
+            ht_, bf_ev = self.binary_inplace_fn_(
+                lhs=o1,
+                rhs=buf,
+                sycl_queue=exec_q,
+                depends=[copy_ev],
+            )
+            _manager.add_event_pair(ht_, bf_ev)
+
+        return o1

From 0297fbe0d558c8fbab2ad1b57835806e37120c64 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Wed, 11 Sep 2024 12:19:49 -0700
Subject: [PATCH 08/10] Add changes to in-place element-wise operators to
 changelog

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index eb7f95bde5..62efdb0ac5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+* `"same_kind"`-style casting for `tensor.usm_ndarray` in-place mathematical operators[gh-1827](https://github.com/IntelPython/dpctl/pull/1827)
+
 ### Change
 
 ### Fixed

From 7ba5c310363a5f749eba4cd8a66bd09ccc55da9d Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Wed, 11 Sep 2024 12:38:45 -0700
Subject: [PATCH 09/10] Add comments explaining logic behind various stages of
 `test_add_inplace_dtype_matrix`

---
 dpctl/tests/elementwise/test_add.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/dpctl/tests/elementwise/test_add.py b/dpctl/tests/elementwise/test_add.py
index e7838005b0..e331df6520 100644
--- a/dpctl/tests/elementwise/test_add.py
+++ b/dpctl/tests/elementwise/test_add.py
@@ -358,6 +358,8 @@ def test_add_inplace_dtype_matrix(op1_dtype, op2_dtype):
     dev = q.sycl_device
     _fp16 = dev.has_aspect_fp16
     _fp64 = dev.has_aspect_fp64
+    # in-place operators dispatch to a separate Python implementation
+    # which permits "same_kind"-style casting
     if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
         ar1 += ar2
         assert (
@@ -374,6 +376,9 @@ def test_add_inplace_dtype_matrix(op1_dtype, op2_dtype):
         with pytest.raises(ValueError):
             ar1 += ar2
 
+    # here, test the special case where out is the first argument,
+    # so an in-place kernel is used for efficiency;
+    # this covers a specific branch in the BinaryElementwiseFunc logic
     ar1 = dpt.ones(sz, dtype=op1_dtype)
     ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
     if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64):

From 7599b37f45e000af4286c9844f770d3e6ebe5e15 Mon Sep 17 00:00:00 2001
From: ndgrigorian <46709016+ndgrigorian@users.noreply.github.com>
Date: Wed, 11 Sep 2024 12:49:07 -0700
Subject: [PATCH 10/10] Fix typo in changelog entry

Co-authored-by: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 62efdb0ac5..9eb4bbc220 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,7 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
-* `"same_kind"`-style casting for `tensor.usm_ndarray` in-place mathematical operators[gh-1827](https://github.com/IntelPython/dpctl/pull/1827)
+* `"same_kind"`-style casting for `tensor.usm_ndarray` in-place mathematical operators [gh-1827](https://github.com/IntelPython/dpctl/pull/1827)
 
 ### Change