
Commit 7b8d29b

address comments - first round
1 parent d45cb4a commit 7b8d29b

7 files changed: +78 −35 lines

dpnp/backend/extensions/blas/blas_py.cpp

Lines changed: 9 additions & 3 deletions

@@ -38,8 +38,8 @@ namespace py = pybind11;
 // populate dispatch tables
 void init_dispatch_tables(void)
 {
-    blas_ext::init_gemm_dispatch_table();
     blas_ext::init_gemm_batch_dispatch_table();
+    blas_ext::init_gemm_dispatch_table();
 }

 PYBIND11_MODULE(_blas_impl, m)
@@ -51,12 +51,18 @@ PYBIND11_MODULE(_blas_impl, m)
               "Call `gemm` from OneMKL LAPACK library to return "
               "the matrix-matrix product with 2-D matrices.",
               py::arg("sycl_queue"), py::arg("matrixA"), py::arg("matrixB"),
-              py::arg("matrixC"), py::arg("depends") = py::list());
+              py::arg("result"), py::arg("depends") = py::list());
     }

     {
         m.def("_gemm_batch", &blas_ext::gemm_batch,
               "Call `gemm_batch` from OneMKL LAPACK library to return "
-              "the matrix-matrix product with general matrices.");
+              "the matrix-matrix product with general matrices.",
+              py::arg("sycl_queue"), py::arg("matrixA"), py::arg("matrixB"),
+              py::arg("result"), py::arg("m"), py::arg("n"), py::arg("k"),
+              py::arg("batch_size"), py::arg("ld_array_1"),
+              py::arg("ld_array_2"), py::arg("ld_result"), py::arg("stridea"),
+              py::arg("strideb"), py::arg("stridec"), py::arg("transA_int"),
+              py::arg("transB_int"), py::arg("depends") = py::list());
     }
 }
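
With the keyword names registered above, the _gemm binding can be called with explicit arguments. A minimal sketch of such a call, assuming the module is importable as dpnp.backend.extensions.blas._blas_impl and that dpctl.tensor arrays are passed directly; the import path and the direct use of the private binding are assumptions, not part of this commit (regular users would go through dpnp.matmul instead):

    # Illustrative sketch only; see the lead-in for the assumptions made here.
    import dpctl
    import dpctl.tensor as dpt
    import dpnp.backend.extensions.blas._blas_impl as bi  # assumed import path

    q = dpctl.SyclQueue()
    a = dpt.ones((4, 3), dtype="f4", sycl_queue=q)
    b = dpt.ones((3, 5), dtype="f4", sycl_queue=q)
    res = dpt.empty((4, 5), dtype="f4", sycl_queue=q)

    # Keyword names follow the py::arg(...) declarations added above.
    events = bi._gemm(sycl_queue=q, matrixA=a, matrixB=b, result=res, depends=[])
    for ev in events:  # the binding returns a pair of SYCL events
        ev.wait()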

dpnp/backend/extensions/blas/gemm.cpp

Lines changed: 6 additions & 5 deletions

@@ -93,9 +93,9 @@ static sycl::event gemm_impl(sycl::queue exec_q,
     try {
         gemm_event = mkl_blas::row_major::gemm(
             exec_q,
-            transA, // Parameter indicating whether matrix A is not
-                    // transposed ('N'), transposed ('T'),
-                    // or conjugate transposed ('C').
+            transA, // Defines the transpose operation for matrix A:
+                    // 'N' indicates no transpose, 'T' for transpose,
+                    // or 'C' for a conjugate transpose.
             transB, // Same as transA but for matrix B.
             m,      // Number of rows in matrices A and C.
             n,      // Number of columns in matrices B and C.
@@ -106,7 +106,7 @@ static sycl::event gemm_impl(sycl::queue exec_q,
                     // stride between successive rows (for row major
                     // layout).
             b,      // Pointer to matrix B.
-            ldb,    // Leading dimension of matrix B, similar to lda
+            ldb,    // Leading dimension of matrix B, similar to lda.
             Tab(0), // Scaling factor for matrix C.
             res,    // Pointer to matrix C, where the result is stored.
             ldc,    // Leading dimension of matrix C.
@@ -198,7 +198,8 @@ std::pair<sycl::event, sycl::event>
     gemm_impl_fn_ptr_t gemm_fn =
         gemm_dispatch_table[matrixAB_type_id][resultC_type_id];
     if (gemm_fn == nullptr) {
-        throw py::value_error("Type dispatch ran into trouble.");
+        throw py::value_error(
+            "Types of input matrices and result matrix are mismatched.");
     }

     char *a_typeless_ptr = matrixA.get_data();
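
The comments above spell out the usual gemm conventions. As a plain NumPy sketch of what the row-major call computes with the scaling factors used in this diff (Tab(1) for alpha, Tab(0) for beta) and no transposition; the concrete sizes are only illustrative:

    import numpy as np

    m, k, n = 4, 3, 5
    A = np.arange(m * k, dtype=np.float32).reshape(m, k)  # lda == k (row-major, no transpose)
    B = np.arange(k * n, dtype=np.float32).reshape(k, n)  # ldb == n
    C = A @ B                                             # m x n result, ldc == n
    assert C.shape == (m, n)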

dpnp/backend/extensions/blas/gemm.hpp

Lines changed: 1 addition & 2 deletions

@@ -45,9 +45,8 @@ extern std::pair<sycl::event, sycl::event>
                dpctl::tensor::usm_ndarray resultC,
                const std::vector<sycl::event> &depends);

-// extern sycl::event
 extern std::pair<sycl::event, sycl::event>
-    gemm_batch(sycl::queue q,
+    gemm_batch(sycl::queue exec_q,
                dpctl::tensor::usm_ndarray matrixA,
                dpctl::tensor::usm_ndarray matrixB,
                dpctl::tensor::usm_ndarray resultC,

dpnp/backend/extensions/blas/gemm_batch.cpp

Lines changed: 3 additions & 2 deletions

@@ -100,7 +100,7 @@ static sycl::event gemm_batch_impl(sycl::queue exec_q,

     sycl::event gemm_batch_event;
     try {
-        gemm_batch_event = oneapi::mkl::blas::row_major::gemm_batch(
+        gemm_batch_event = mkl_blas::row_major::gemm_batch(
             exec_q, transA, transB, m, n, k, Tab(1), a, ld_array_1, stridea, b,
             ld_array_2, strideb, Tab(0), res, ld_result, stridec, batch_size,
             depends);
@@ -171,7 +171,8 @@ std::pair<sycl::event, sycl::event>
     gemm_batch_impl_fn_ptr_t gemm_batch_fn =
         gemm_batch_dispatch_table[matrixAB_type_id][resultC_type_id];
     if (gemm_batch_fn == nullptr) {
-        throw py::value_error("Type dispatch ran into trouble.");
+        throw py::value_error(
+            "Types of input matrices and result matrix are mismatched.");
     }

     char *a_typeless_ptr = matrixA.get_data();
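
A rough NumPy model of the strided gemm_batch call above may help read the argument list: each matrix of a batch starts stridea, strideb, or stridec elements after the previous one, and alpha/beta are again Tab(1) and Tab(0). The sizes and the densely packed strides below are illustrative assumptions:

    import numpy as np

    batch_size, m, k, n = 3, 4, 5, 2
    a = np.random.rand(batch_size * m * k).astype(np.float32)
    b = np.random.rand(batch_size * k * n).astype(np.float32)
    res = np.zeros(batch_size * m * n, dtype=np.float32)

    # Element strides between consecutive matrices in each batch.
    stridea, strideb, stridec = m * k, k * n, m * n

    for i in range(batch_size):
        A = a[i * stridea : i * stridea + m * k].reshape(m, k)
        B = b[i * strideb : i * strideb + k * n].reshape(k, n)
        res[i * stridec : i * stridec + m * n] = (A @ B).ravel()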

dpnp/backend/extensions/blas/types_matrix.hpp

Lines changed: 8 additions & 0 deletions

@@ -73,6 +73,14 @@ struct GemmTypePairSupportFactory
         dpctl_td_ns::NotDefinedEntry>::is_defined;
 };

+/**
+ * @brief A factory to define pairs of supported types for which
+ * MKL BLAS library provides support in
+ * oneapi::mkl::blas::gemm_batch<Tab, Tc> function.
+ *
+ * @tparam Tab Type of arrays containing input matrices A and B.
+ * @tparam Tc Type of array containing output matrix C.
+ */
 template <typename Tab, typename Tc>
 struct GemmBatchTypePairSupportFactory
 {
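
For readers unfamiliar with the dispatch machinery, a loose Python analogue of what the support factory and the dispatch table accomplish, including the clearer error message introduced in this commit; the set of supported pairs shown here is a hypothetical placeholder, not the exact list from types_matrix.hpp:

    import numpy as np

    # Hypothetical (Tab, Tc) pairs; the real list is generated from the factory above.
    supported_pairs = {
        (np.float32, np.float32),
        (np.float64, np.float64),
        (np.complex64, np.complex64),
        (np.complex128, np.complex128),
    }

    def lookup_gemm_batch(tab_dtype, tc_dtype):
        """Return an implementation tag, or fail the way the C++ code now does."""
        if (tab_dtype, tc_dtype) not in supported_pairs:
            raise ValueError(
                "Types of input matrices and result matrix are mismatched."
            )
        return f"gemm_batch_impl<{tab_dtype.__name__}, {tc_dtype.__name__}>"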

dpnp/dpnp_iface_linearalgebra.py

Lines changed: 51 additions & 20 deletions

@@ -285,25 +285,26 @@ def matmul(

     Examples
     --------
-    >>> import dpnp as np
-    >>> a = np.ones([9, 5, 7, 4])
-    >>> c = np.ones([9, 5, 4, 3])
-    >>> np.matmul(a, c).shape
-    (9, 5, 7, 3)
+    For 2-D arrays it is the matrix product:

+    >>> import dpnp as np
     >>> a = np.array([[1, 0], [0, 1]])
     >>> b = np.array([[4, 1], [2, 2]])
     >>> np.matmul(a, b)
     array([[4, 1],
            [2, 2]])

+    For 2-D mixed with 1-D, the result is the usual.
+
     >>> a = np.array([[1, 0], [0, 1]])
     >>> b = np.array([1, 2])
     >>> np.matmul(a, b)
     array([1, 2])
     >>> np.matmul(b, a)
     array([1, 2])

+    Broadcasting is conventional for stacks of arrays
+
     >>> a = np.arange(2 * 2 * 4).reshape((2, 2, 4))
     >>> b = np.arange(2 * 2 * 4).reshape((2, 4, 2))
     >>> np.matmul(a,b).shape
@@ -313,11 +314,16 @@ def matmul(
     >>> np.sum(a[0, 1, :] * b[0 , :, 1])
     array(98)

-    The ``@`` operator can be used as a shorthand for ``matmul`` on
-    :class:`dpnp.ndarray`.
+    Vector, vector returns the scalar inner product, but neither argument is complex-conjugated:

     >>> x1 = np.array([2j, 3j])
     >>> x2 = np.array([2j, 3j])
+    >>> np.matmul(x1, x2)
+    array(-13+0j)
+
+    The ``@`` operator can be used as a shorthand for ``matmul`` on
+    :class:`dpnp.ndarray`.
+
     >>> x1 @ x2
     array(-13+0j)

@@ -590,27 +596,52 @@ def dpnp_matmul_batch(


 def _gemm_res_dtype(*arrays, casting):
-    dtype = dpnp.result_type(*arrays)
-    default = dpnp.default_float_type(device=arrays[0].device)
-    if dpnp.issubdtype(dtype, dpnp.complexfloating):
-        default = dpnp.complex64 if default == dpnp.float32 else dpnp.complex128
+    """
+    Determines the data types for matmul operation and the output array of matmul operation.
+
+    The output array data type is determined based on the Promotion Type Rule
+    and device capabilities. The data type used in matmul operation is an 'inexact' data type
+    determined based on the output data type and device capabilities.
+    Both data types are determined based on the fact that the output array data type can be cast
+    to the other data type according to casting rule specified, otherwise a ``TypeError`` is raised.
+
+    Parameters
+    ----------
+    arrays : {dpnp_array, usm_ndarray}
+        Input arrays.
+    casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional
+        Controls what kind of data casting may occur.
+
+    Returns
+    -------
+    gemm_dtype, res_dtype :
+        The appropriate data types for performing matmul operation and presenting output array.
+
+    """
+
+    res_dtype = dpnp.result_type(*arrays)
+    gemm_dtype = dpnp.default_float_type(device=arrays[0].device)
+    if dpnp.issubdtype(res_dtype, dpnp.complexfloating):
+        gemm_dtype = (
+            dpnp.complex64 if gemm_dtype == dpnp.float32 else dpnp.complex128
+        )

-    if dpnp.can_cast(dtype, default, casting):
-        if dtype in [
+    if dpnp.can_cast(res_dtype, gemm_dtype, casting):
+        if res_dtype in [
             dpnp.float64,
             dpnp.complex128,
-        ]:  # in case device does not support fp64 (default)
-            return default, default
-        elif dtype in [
+        ]:  # in case device does not support fp64
+            return gemm_dtype, gemm_dtype
+        elif res_dtype in [
             dpnp.float32,
             dpnp.complex64,
-        ]:  # needed dtype is fp32 but device supports fp64 (default)
-            return dtype, dtype
+        ]:  # needed dtype is fp32 but device supports fp64
+            return res_dtype, res_dtype
         else:
-            return default, dtype
+            return gemm_dtype, res_dtype
     else:
         raise TypeError(
-            f"Cannot cast ufunc 'matmul' output from dtype({dtype}) to dtype({default}) with casting rule {casting}"
+            f"Cannot cast ufunc 'matmul' output from dtype({res_dtype}) to dtype({gemm_dtype}) with casting rule {casting}"
         )

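
A short usage sketch of the behaviour the docstring describes: with integer inputs, the output keeps the promoted type from dpnp.result_type while the underlying gemm call runs in the floating-point type chosen by _gemm_res_dtype (exact dtypes depend on the device, so the expected value below is an assumption):

    import dpnp as np

    a = np.ones((2, 3), dtype=np.int32)
    b = np.ones((3, 4), dtype=np.int32)

    c = np.matmul(a, b)
    # The result dtype follows the Promotion Type Rule for the inputs; the
    # computation itself is performed in float32/float64 internally.
    print(c.dtype)  # expected: int32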

tests/third_party/cupy/math_tests/test_matmul.py

Lines changed: 0 additions & 3 deletions

@@ -57,7 +57,6 @@
         }
     )
 )
-@testing.gpu
 class TestMatmul(unittest.TestCase):
     @testing.for_all_dtypes(name="dtype1")
     @testing.numpy_cupy_allclose(rtol=1e-3, atol=1e-3)  # required for uint8
@@ -93,7 +92,6 @@ def test_cupy_matmul(self, xp, dtype1):
         }
     )
 )
-@testing.gpu
 class TestMatmulLarge(unittest.TestCase):
     # Avoid overflow
     skip_dtypes = {
@@ -149,7 +147,6 @@ def test_cupy_matmul(self, xp, dtype1):
         }
     )
 )
-@testing.gpu
 class TestMatmulInvalidShape(unittest.TestCase):
     def test_invalid_shape(self):
         for xp in (numpy, dpnp):
