IntelPython
diff --git a/CHANGELOG.md b/CHANGELOG.md
Lines changed: 2 additions & 0 deletions
diff --git a/docs/doc_sources/api_reference/dpctl/tensor.sorting_functions.rst b/docs/doc_sources/api_reference/dpctl/tensor.sorting_functions.rst
Lines changed: 1 addition & 0 deletions
diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt
Lines changed: 1 addition & 0 deletions
diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py
Lines changed: 2 additions & 1 deletion
diff --git a/dpctl/tensor/_sorting.py b/dpctl/tensor/_sorting.py
Lines changed: 167 additions & 0 deletions
diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp
Lines changed: 9 additions & 26 deletions
@@ -8,6 +8,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+* Added `dpctl.tensor.top_k` per Python Array API specification: [#1921](https://github.com/IntelPython/dpctl/pull/1921)
+
 ### Changed
 
 * Improved performance of copy-and-cast operations from `numpy.ndarray` to `tensor.usm_ndarray` for contiguous inputs [gh-1829](https://github.com/IntelPython/dpctl/pull/1829)
 
@@ -10,3 +10,4 @@ Sorting functions
 
    argsort
    sort
+   top_k
@@ -115,6 +115,7 @@ set(_sorting_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_sort.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_argsort.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/searchsorted.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/topk.cpp
 )
 set(_sorting_radix_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_sort.cpp
 
@@ -199,7 +199,7 @@
     unique_inverse,
     unique_values,
 )
-from ._sorting import argsort, sort
+from ._sorting import argsort, sort, top_k
 from ._testing import allclose
 from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type
 
@@ -387,4 +387,5 @@
     "DLDeviceType",
     "take_along_axis",
     "put_along_axis",
+    "top_k",
 ]
@@ -14,6 +14,9 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
+import operator
+from typing import NamedTuple
+
 import dpctl.tensor as dpt
 import dpctl.tensor._tensor_impl as ti
 import dpctl.utils as du
@@ -24,6 +27,7 @@
     _argsort_descending,
     _sort_ascending,
     _sort_descending,
+    _topk,
 )
 from ._tensor_sorting_radix_impl import (
     _radix_argsort_ascending,
@@ -267,3 +271,166 @@ def argsort(x, axis=-1, descending=False, stable=True, kind=None):
         inv_perm = sorted(range(nd), key=lambda d: perm[d])
         res = dpt.permute_dims(res, inv_perm)
     return res
+
+
+def _get_top_k_largest(mode):
+    """Translate the ``mode`` keyword of ``top_k`` into a boolean flag.
+
+    Args:
+        mode (str):
+            either ``"largest"`` or ``"smallest"``.
+
+    Returns:
+        bool:
+            ``True`` for ``"largest"`` and ``False`` for ``"smallest"``.
+
+    Raises:
+        ValueError:
+            if ``mode`` is not one of the two supported strings.
+    """
+    modes = {"largest": True, "smallest": False}
+    try:
+        return modes[mode]
+    except KeyError:
+        # The KeyError is only an artifact of the dictionary lookup; suppress
+        # it so the traceback shows the ValueError alone.
+        raise ValueError(
+            f"`mode` must be `largest` or `smallest`. Got `{mode}`."
+        ) from None
+
+
+class TopKResult(NamedTuple):
+    """Named pair ``(values, indices)`` returned by ``top_k``."""
+
+    # elements selected from the input array
+    values: dpt.usm_ndarray
+    # indices of the input array at which those elements are found
+    indices: dpt.usm_ndarray
+
+
+def top_k(x, k, /, *, axis=None, mode="largest"):
+    """top_k(x, k, axis=None, mode="largest")
+
+    Returns the `k` largest or smallest values and their indices in the input
+    array `x` along the specified axis `axis`.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        k (int):
+            number of elements to find. Must be a positive integer value.
+        axis (Optional[int]):
+            axis along which to search. If `None`, the search will be performed
+            over the flattened array. Default: ``None``.
+        mode (Literal["largest", "smallest"]):
+            search mode. Must be one of the following modes:
+
+            - `"largest"`: return the `k` largest elements.
+            - `"smallest"`: return the `k` smallest elements.
+
+            Default: `"largest"`.
+
+    Returns:
+        tuple[usm_ndarray, usm_ndarray]
+            a namedtuple `(values, indices)` whose
+
+            * first element `values` will be an array containing the `k`
+              largest or smallest elements of `x`. The array has the same data
+              type as `x`. If `axis` was `None`, `values` will be a
+              one-dimensional array with shape `(k,)` and otherwise, `values`
+              will have shape `x.shape[:axis] + (k,) + x.shape[axis+1:]`
+            * second element `indices` will be an array containing indices of
+              `x` that result in `values`. The array will have the same shape
+              as `values` and will have the default array index data type.
+
+            For a zero-dimensional `x` with `axis=None`, `values` is a
+            zero-dimensional copy of `x` and `indices` is a zero-dimensional
+            array of zeros.
+
+    Raises:
+        TypeError:
+            if `x` is not a :class:`dpctl.tensor.usm_ndarray`, or if `k` can
+            not be interpreted as an integer.
+        ValueError:
+            if `mode` is not `"largest"` or `"smallest"`, if `k` is negative,
+            or if `k` exceeds the number of elements searched (`x.size` when
+            `axis` is `None`, `x.shape[axis]` otherwise).
+    """
+    largest = _get_top_k_largest(mode)
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(
+            f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}"
+        )
+
+    k = operator.index(k)
+    # NOTE(review): this check lets `k == 0` through although the message and
+    # the docstring say "positive" -- confirm which of the two is intended.
+    if k < 0:
+        raise ValueError("`k` must be a positive integer value")
+
+    nd = x.ndim
+    if axis is None:
+        sz = x.size
+        if nd == 0:
+            # A 0-d array holds a single element: the result is that element
+            # and index 0, no kernel launch is needed.
+            if k > 1:
+                raise ValueError(f"`k`={k} is out of bounds 1")
+            return TopKResult(
+                dpt.copy(x, order="C"),
+                dpt.zeros_like(
+                    x, dtype=ti.default_device_index_type(x.sycl_queue)
+                ),
+            )
+        arr = x
+        n_search_dims = None
+        res_sh = k
+    else:
+        axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis")
+        sz = x.shape[axis]
+        a1 = axis + 1
+        if a1 == nd:
+            # the searched axis is already the last one
+            perm = list(range(nd))
+            arr = x
+        else:
+            # move the searched axis to the last position
+            perm = [i for i in range(nd) if i != axis] + [
+                axis,
+            ]
+            arr = dpt.permute_dims(x, perm)
+        n_search_dims = 1
+        res_sh = arr.shape[: nd - 1] + (k,)
+
+    if k > sz:
+        raise ValueError(f"`k`={k} is out of bounds {sz}")
+
+    exec_q = x.sycl_queue
+    _manager = du.SequentialOrderManager[exec_q]
+    dep_evs = _manager.submitted_events
+
+    # `_topk` is only ever handed a C-contiguous source: either `arr` itself
+    # or a C-ordered copy of it that the kernel must wait for.
+    if arr.flags.c_contiguous:
+        src = arr
+        topk_deps = dep_evs
+    else:
+        src = dpt.empty_like(arr, order="C")
+        ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=arr, dst=src, sycl_queue=exec_q, depends=dep_evs
+        )
+        _manager.add_event_pair(ht_ev, copy_ev)
+        topk_deps = [copy_ev]
+
+    res_usm_type = arr.usm_type
+    vals = dpt.empty(
+        res_sh,
+        dtype=arr.dtype,
+        usm_type=res_usm_type,
+        order="C",
+        sycl_queue=exec_q,
+    )
+    inds = dpt.empty(
+        res_sh,
+        dtype=ti.default_device_index_type(exec_q),
+        usm_type=res_usm_type,
+        order="C",
+        sycl_queue=exec_q,
+    )
+    ht_ev, impl_ev = _topk(
+        src=src,
+        trailing_dims_to_search=n_search_dims,
+        k=k,
+        largest=largest,
+        vals=vals,
+        inds=inds,
+        sycl_queue=exec_q,
+        depends=topk_deps,
+    )
+    _manager.add_event_pair(ht_ev, impl_ev)
+
+    if axis is not None and a1 != nd:
+        # undo the permutation so the `k` axis sits where `axis` was
+        inv_perm = sorted(range(nd), key=lambda d: perm[d])
+        vals = dpt.permute_dims(vals, inv_perm)
+        inds = dpt.permute_dims(inds, inv_perm)
+
+    return TopKResult(vals, inds)
@@ -33,6 +33,7 @@
 
 #include "kernels/dpctl_tensor_types.hpp"
 #include "kernels/sorting/search_sorted_detail.hpp"
+#include "kernels/sorting/sort_utils.hpp"
 
 namespace dpctl
 {
@@ -811,20 +812,12 @@ sycl::event stable_argsort_axis1_contig_impl(
 
     const size_t total_nelems = iter_nelems * sort_nelems;
 
-    sycl::event populate_indexed_data_ev =
-        exec_q.submit([&](sycl::handler &cgh) {
-            cgh.depends_on(depends);
+    using dpctl::tensor::kernels::sort_utils_detail::iota_impl;
 
-            const sycl::range<1> range{total_nelems};
+    using IotaKernelName = populate_index_data_krn<argTy, IndexTy, ValueComp>;
 
-            using KernelName =
-                populate_index_data_krn<argTy, IndexTy, ValueComp>;
-
-            cgh.parallel_for<KernelName>(range, [=](sycl::id<1> id) {
-                size_t i = id[0];
-                res_tp[i] = static_cast<IndexTy>(i);
-            });
-        });
+    sycl::event populate_indexed_data_ev = iota_impl<IotaKernelName, IndexTy>(
+        exec_q, res_tp, total_nelems, depends);
 
     // Sort segments of the array
     sycl::event base_sort_ev =
@@ -839,21 +832,11 @@ sycl::event stable_argsort_axis1_contig_impl(
         exec_q, iter_nelems, sort_nelems, res_tp, index_comp, sorted_block_size,
         {base_sort_ev});
 
-    sycl::event write_out_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(merges_ev);
-
-        auto temp_acc =
-            merge_sort_detail::GetReadOnlyAccess<decltype(res_tp)>{}(res_tp,
-                                                                     cgh);
-
-        using KernelName = index_map_to_rows_krn<argTy, IndexTy, ValueComp>;
+    using MapBackKernelName = index_map_to_rows_krn<argTy, IndexTy, ValueComp>;
+    using dpctl::tensor::kernels::sort_utils_detail::map_back_impl;
 
-        const sycl::range<1> range{total_nelems};
-
-        cgh.parallel_for<KernelName>(range, [=](sycl::id<1> id) {
-            res_tp[id] = (temp_acc[id] % sort_nelems);
-        });
-    });
+    sycl::event write_out_ev = map_back_impl<MapBackKernelName, IndexTy>(
+        exec_q, total_nelems, res_tp, res_tp, sort_nelems, {merges_ev});
 
     return write_out_ev;
 }
Original file line number	Diff line number	Diff line change
`@@ -10,3 +10,4 @@ Sorting functions`
`10`	`10`
`11`	`11`	`argsort`
`12`	`12`	`sort`
	`13`	`+ top_k`
Original file line number	Diff line number	Diff line change
`@@ -115,6 +115,7 @@ set(_sorting_sources`
`115`	`115`	`${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_sort.cpp`
`116`	`116`	`${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_argsort.cpp`
`117`	`117`	`${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/searchsorted.cpp`
	`118`	`+ ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/topk.cpp`
`118`	`119`	`)`
`119`	`120`	`set(_sorting_radix_sources`
`120`	`121`	`${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_sort.cpp`
Original file line number	Diff line number	Diff line change
`@@ -199,7 +199,7 @@`
`199`	`199`	`unique_inverse,`
`200`	`200`	`unique_values,`
`201`	`201`	`)`
`202`		`-from ._sorting import argsort, sort`
	`202`	`+from ._sorting import argsort, sort, top_k`
`203`	`203`	`from ._testing import allclose`
`204`	`204`	`from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type`
`205`	`205`
`@@ -387,4 +387,5 @@`
`387`	`387`	`"DLDeviceType",`
`388`	`388`	`"take_along_axis",`
`389`	`389`	`"put_along_axis",`
	`390`	`+ "top_k",`
`390`	`391`	`]`