From 6ad1c5f8f8bbc505e43af06a565406c21d717a05 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Fri, 2 Aug 2024 14:33:54 -0700 Subject: [PATCH 01/18] Support `kDLCPU` devices via DLPack protocol Leverages NumPy to create an array with Python interpreter/host-accessible memory, which is required by the 2023.12 array API specification --- dpctl/tensor/_dlpack.pxd | 6 +- dpctl/tensor/_dlpack.pyx | 196 ++++++++++++++++++++++++++++++++++++- dpctl/tensor/_usmarray.pyx | 24 ++++- 3 files changed, 216 insertions(+), 10 deletions(-) diff --git a/dpctl/tensor/_dlpack.pxd b/dpctl/tensor/_dlpack.pxd index 9846f54be6..5b4909943b 100644 --- a/dpctl/tensor/_dlpack.pxd +++ b/dpctl/tensor/_dlpack.pxd @@ -18,6 +18,8 @@ # cython: language_level=3 # cython: linetrace=True +from numpy cimport ndarray + from .._sycl_device cimport SyclDevice from ._usmarray cimport usm_ndarray @@ -40,7 +42,9 @@ cdef extern from 'dlpack/dlpack.h' nogil: cpdef object to_dlpack_capsule(usm_ndarray array) except + cpdef object to_dlpack_versioned_capsule(usm_ndarray array, bint copied) except + -cpdef usm_ndarray from_dlpack_capsule(object dltensor) except + +cpdef object numpy_to_dlpack_versioned_capsule(ndarray array, bint copied) except + +cpdef object from_dlpack_capsule(object dltensor) except + +cpdef object from_dlpack_versioned_capsule(object dltensor) except + cdef int get_parent_device_ordinal_id(SyclDevice dev) except * diff --git a/dpctl/tensor/_dlpack.pyx b/dpctl/tensor/_dlpack.pyx index ba2283eb50..43bb73c9df 100644 --- a/dpctl/tensor/_dlpack.pyx +++ b/dpctl/tensor/_dlpack.pyx @@ -21,6 +21,7 @@ cimport cpython from libc cimport stdlib from libc.stdint cimport int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t +from numpy cimport ndarray cimport dpctl as c_dpctl cimport dpctl.memory as c_dpmem @@ -34,6 +35,8 @@ from .._backend cimport ( ) from ._usmarray cimport USM_ARRAY_C_CONTIGUOUS, USM_ARRAY_WRITABLE, usm_ndarray +import ctypes + import numpy as np import dpctl @@ -475,6 +478,108 
@@ cpdef to_dlpack_versioned_capsule(usm_ndarray usm_ary, bint copied): return cpython.PyCapsule_New(dlmv_tensor, 'dltensor_versioned', _pycapsule_versioned_deleter) +cpdef numpy_to_dlpack_versioned_capsule(ndarray npy_ary, bint copied): + """ + to_dlpack_versioned_capsule(npy_ary, copied) + + Constructs named Python capsule object referencing + instance of ``DLManagedTensorVersioned`` from + :class:`numpy.ndarray` instance. + + Args: + npy_ary: An instance of :class:`numpy.ndarray` + copied: A bint representing whether the data was previously + copied in order to set the flags with the is-copied + bitmask. + Returns: + A new capsule with name ``"dltensor_versioned"`` that + contains a pointer to ``DLManagedTensorVersioned`` struct. + Raises: + DLPackCreationError: when array can be represented as + DLPack tensor. + MemoryError: when host allocation to needed for + ``DLManagedTensorVersioned`` did not succeed. + ValueError: when array elements data type could not be represented + in ``DLManagedTensorVersioned``. 
+ """ + cdef DLManagedTensorVersioned *dlmv_tensor = NULL + cdef DLTensor *dl_tensor = NULL + cdef uint32_t dlmv_flags = 0 + cdef int nd = npy_ary.ndim + cdef Py_ssize_t *shape_ptr = NULL + cdef Py_ssize_t *strides_ptr = NULL + cdef int64_t *shape_strides_ptr = NULL + cdef int i = 0 + cdef int device_id = -1 + cdef Py_ssize_t byte_offset = 0 + + dlmv_tensor = stdlib.malloc( + sizeof(DLManagedTensorVersioned)) + if dlmv_tensor is NULL: + raise MemoryError( + "to_dlpack_versioned_capsule: Could not allocate memory " + "for DLManagedTensorVersioned" + ) + shape_strides_ptr = stdlib.malloc((sizeof(int64_t) * 2) * nd) + if shape_strides_ptr is NULL: + stdlib.free(dlmv_tensor) + raise MemoryError( + "to_dlpack_versioned_capsule: Could not allocate memory " + "for shape/strides" + ) + # this can be a separate function for handling shapes and strides + shape = npy_ary.ctypes.shape_as(ctypes.c_int64) + strides = npy_ary.ctypes.strides_as(ctypes.c_int64) + for i in range(nd): + shape_strides_ptr[i] = shape[i] + shape_strides_ptr[nd + i] = strides[i] // npy_ary.itemsize + writable_flag = npy_ary.flags["W"] + + ary_dt = npy_ary.dtype + ary_dtk = ary_dt.kind + + dl_tensor = &dlmv_tensor.dl_tensor + dl_tensor.data = npy_ary.data + dl_tensor.ndim = nd + dl_tensor.byte_offset = byte_offset + dl_tensor.shape = &shape_strides_ptr[0] + dl_tensor.strides = &shape_strides_ptr[nd] + dl_tensor.device.device_type = kDLCPU + dl_tensor.device.device_id = 0 + dl_tensor.dtype.lanes = 1 + dl_tensor.dtype.bits = (ary_dt.itemsize * 8) + if (ary_dtk == "b"): + dl_tensor.dtype.code = kDLBool + elif (ary_dtk == "u"): + dl_tensor.dtype.code = kDLUInt + elif (ary_dtk == "i"): + dl_tensor.dtype.code = kDLInt + elif (ary_dtk == "f" and ary_dt.itemsize <= 8): + dl_tensor.dtype.code = kDLFloat + elif (ary_dtk == "c" and ary_dt.itemsize <= 16): + dl_tensor.dtype.code = kDLComplex + else: + stdlib.free(shape_strides_ptr) + stdlib.free(dlmv_tensor) + raise ValueError("Unrecognized array data type") + + # 
set flags down here + if copied: + dlmv_flags |= DLPACK_FLAG_BITMASK_IS_COPIED + if not writable_flag: + dlmv_flags |= DLPACK_FLAG_BITMASK_READ_ONLY + dlmv_tensor.flags = dlmv_flags + + dlmv_tensor.version.major = DLPACK_MAJOR_VERSION + dlmv_tensor.version.minor = DLPACK_MINOR_VERSION + + dlmv_tensor.manager_ctx = npy_ary + cpython.Py_INCREF(npy_ary) + dlmv_tensor.deleter = _managed_tensor_versioned_deleter + + return cpython.PyCapsule_New(dlmv_tensor, 'dltensor_versioned', _pycapsule_versioned_deleter) + + cdef class _DLManagedTensorOwner: """ Helper class managing the lifetime of the DLManagedTensor struct @@ -519,9 +624,81 @@ cdef class _DLManagedTensorVersionedOwner: return res -cpdef usm_ndarray from_dlpack_capsule(object py_caps): +cdef dict _numpy_array_interface_from_dl_tensor(DLTensor dlt, bint ro_flag): + """Constructs a NumPy `__array_interface__` dictionary from a DLTensor.""" + cdef int i = 0 + cdef int itemsize = 0 + + if dlt.dtype.lanes != 1: + raise BufferError( + "Can not import DLPack tensor with lanes != 1" + ) + itemsize = dlt.dtype.bits // 8 + shape = list() + if (dlt.strides is NULL): + strides = None + for dim in range(dlt.ndim): + shape.append(dlt.shape[dim]) + else: + strides = list() + for dim in range(dlt.ndim): + shape.append(dlt.shape[dim]) + # convert to byte-strides + strides.append(dlt.strides[dim] * itemsize) + strides = tuple(strides) + shape = tuple(shape) + if (dlt.dtype.code == kDLUInt): + ary_dt = "u" + str(itemsize) + elif (dlt.dtype.code == kDLInt): + ary_dt = "i" + str(itemsize) + elif (dlt.dtype.code == kDLFloat): + ary_dt = "f" + str(itemsize) + elif (dlt.dtype.code == kDLComplex): + ary_dt = "c" + str(itemsize) + elif (dlt.dtype.code == kDLBool): + ary_dt = np.dtype("?") + else: + raise BufferError( + "Can not import DLPack tensor with type code {}.".format( + dlt.dtype.code + ) + ) + typestr = "|" + ary_dt + return dict( + version=3, + shape=shape, + strides=strides, + data=( dlt.data, True if ro_flag else False), + 
offset=dlt.byte_offset, + typestr=typestr, + ) + + +class _numpy_array_interface_wrapper: + """ + Class that wraps a Python capsule and dictionary for consumption by NumPy. + + Implementation taken from + https://github.com/dmlc/dlpack/blob/main/apps/numpy_dlpack/dlpack/to_numpy.py + + Args: + array_interface: + A dictionary describing the underlying memory. Formatted + to match `numpy.ndarray.__array_interface__`. + + pycapsule: + A Python capsule wrapping the dlpack tensor that will be + converted to numpy. """ - from_dlpack_capsule(caps) + + def __init__(self, array_interface, pycapsule) -> None: + self.__array_interface__ = array_interface + self._pycapsule = pycapsule + + +cpdef object from_dlpack_capsule(object py_caps): + """ + from_dlpack_capsule(py_caps) Reconstructs instance of :class:`dpctl.tensor.usm_ndarray` from named Python capsule object referencing instance of ``DLManagedTensor`` @@ -693,15 +870,20 @@ cpdef usm_ndarray from_dlpack_capsule(object py_caps): offset=element_offset ) return res_ary + elif dlm_tensor.dl_tensor.device.device_type == kDLCPU: + ary_iface = _numpy_array_interface_from_dl_tensor(dlm_tensor.dl_tensor, False) + dlm_holder = _DLManagedTensorOwner._create(dlm_tensor) + cpython.PyCapsule_SetName(py_caps, 'used_dltensor') + return np.ctypeslib.as_array(_numpy_array_interface_wrapper(ary_iface, py_caps)) else: raise BufferError( "The DLPack tensor resides on unsupported device." 
) -cpdef usm_ndarray from_dlpack_versioned_capsule(object py_caps): +cpdef object from_dlpack_versioned_capsule(object py_caps): """ - from_dlpack_versioned_capsule(caps) + from_dlpack_versioned_capsule(py_caps) Reconstructs instance of :class:`dpctl.tensor.usm_ndarray` from named Python capsule object referencing instance of @@ -883,6 +1065,12 @@ cpdef usm_ndarray from_dlpack_versioned_capsule(object py_caps): if (dlmv_tensor.flags & DLPACK_FLAG_BITMASK_READ_ONLY): res_ary.flags_ = (res_ary.flags_ & ~USM_ARRAY_WRITABLE) return res_ary + elif dlmv_tensor.dl_tensor.device.device_type == kDLCPU: + ro_flag = dlmv_tensor.flags & DLPACK_FLAG_BITMASK_READ_ONLY + ary_iface = _numpy_array_interface_from_dl_tensor(dlmv_tensor.dl_tensor, ro_flag) + dlmv_holder = _DLManagedTensorVersionedOwner._create(dlmv_tensor) + cpython.PyCapsule_SetName(py_caps, 'used_dltensor_versioned') + return np.ctypeslib.as_array(_numpy_array_interface_wrapper(ary_iface, py_caps)) else: raise BufferError( "The DLPack tensor resides on unsupported device." 
diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx index 443d8184a2..7266efa3e6 100644 --- a/dpctl/tensor/_usmarray.pyx +++ b/dpctl/tensor/_usmarray.pyx @@ -1142,13 +1142,27 @@ cdef class usm_ndarray: dpctl_dlpack_version = get_build_dlpack_version() if max_version[0] >= dpctl_dlpack_version[0]: # DLManagedTensorVersioned path - # TODO: add logic for targeting a device if dl_device is not None: if dl_device != self.__dlpack_device__(): - raise NotImplementedError( - "targeting a device with `__dlpack__` is not " - "currently implemented" - ) + if copy == False: + raise BufferError( + "array cannot be placed on the requested device without a copy" + ) + if dl_device[0] == (DLDeviceType.kDLCPU): + assert dl_device[1] == 0 + if stream is not None: + raise ValueError( + "`stream` must be `None` when `dl_device` is of type `kDLCPU`" + ) + from ._copy_utils import _copy_to_numpy + _arr = _copy_to_numpy(self) + _arr.flags["W"] = self.flags["W"] + return c_dlpack.numpy_to_dlpack_versioned_capsule(_arr, True) + else: + raise NotImplementedError( + f"targeting `dl_device` {dl_device} with `__dlpack__` is not " + "yet implemented" + ) if copy is None: copy = False # TODO: strategy for handling stream on different device from dl_device From a250506a936b03df5ae38f7f83e103b9578aba3e Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Fri, 2 Aug 2024 14:36:15 -0700 Subject: [PATCH 02/18] Refactor _dlpack.pyx functions Refactors `from_dlpack_versioned_capsule` into `from_dlpack_capsule` `_numpy_array_interface_from_dl_tensor` now takes a pointer to a DLTensor to guarantee no copy is made --- dpctl/tensor/_dlpack.pxd | 1 - dpctl/tensor/_dlpack.pyx | 313 ++++++------------------- dpctl/tests/test_usm_ndarray_dlpack.py | 20 +- 3 files changed, 82 insertions(+), 252 deletions(-) diff --git a/dpctl/tensor/_dlpack.pxd b/dpctl/tensor/_dlpack.pxd index 5b4909943b..13abfa3453 100644 --- a/dpctl/tensor/_dlpack.pxd +++ b/dpctl/tensor/_dlpack.pxd @@ -44,7 +44,6 @@ cpdef 
object to_dlpack_capsule(usm_ndarray array) except + cpdef object to_dlpack_versioned_capsule(usm_ndarray array, bint copied) except + cpdef object numpy_to_dlpack_versioned_capsule(ndarray array, bint copied) except + cpdef object from_dlpack_capsule(object dltensor) except + -cpdef object from_dlpack_versioned_capsule(object dltensor) except + cdef int get_parent_device_ordinal_id(SyclDevice dev) except * diff --git a/dpctl/tensor/_dlpack.pyx b/dpctl/tensor/_dlpack.pyx index 43bb73c9df..defdb64394 100644 --- a/dpctl/tensor/_dlpack.pyx +++ b/dpctl/tensor/_dlpack.pyx @@ -624,7 +624,7 @@ cdef class _DLManagedTensorVersionedOwner: return res -cdef dict _numpy_array_interface_from_dl_tensor(DLTensor dlt, bint ro_flag): +cdef dict _numpy_array_interface_from_dl_tensor(DLTensor *dlt, bint ro_flag): """Constructs a NumPy `__array_interface__` dictionary from a DLTensor.""" cdef int i = 0 cdef int itemsize = 0 @@ -722,7 +722,11 @@ cpdef object from_dlpack_capsule(object py_caps): sycl context, or the DLPack's device_type is not supported by :mod:`dpctl`. 
""" + cdef DLManagedTensorVersioned *dlmv_tensor = NULL cdef DLManagedTensor *dlm_tensor = NULL + cdef DLTensor *dl_tensor = NULL + cdef int versioned = 0 + cdef int readonly = 0 cdef bytes usm_type cdef size_t sz = 1 cdef size_t alloc_sz = 1 @@ -737,219 +741,44 @@ cpdef object from_dlpack_capsule(object py_caps): cdef int64_t stride_i = -1 cdef int64_t shape_i = -1 - if not cpython.PyCapsule_IsValid(py_caps, 'dltensor'): - if cpython.PyCapsule_IsValid(py_caps, 'used_dltensor'): - raise ValueError( - "A DLPack tensor object can not be consumed multiple times" - ) - else: - raise TypeError( - "`from_dlpack_capsule` expects a Python 'dltensor' capsule" - ) - dlm_tensor = cpython.PyCapsule_GetPointer( - py_caps, "dltensor") - # Verify that we can work with this device - if dlm_tensor.dl_tensor.device.device_type == kDLOneAPI: - device_id = dlm_tensor.dl_tensor.device.device_id - root_device = dpctl.SyclDevice(str(device_id)) - try: - default_context = root_device.sycl_platform.default_context - except RuntimeError: - default_context = get_device_cached_queue(root_device).sycl_context - if dlm_tensor.dl_tensor.data is NULL: - usm_type = b"device" - q = get_device_cached_queue((default_context, root_device,)) - else: - usm_type = c_dpmem._Memory.get_pointer_type( - dlm_tensor.dl_tensor.data, - default_context) - if usm_type == b"unknown": - raise BufferError( - "Data pointer in DLPack is not bound to default sycl " - f"context of device '{device_id}', translated to " - f"{root_device.filter_string}" - ) - alloc_device = c_dpmem._Memory.get_pointer_device( - dlm_tensor.dl_tensor.data, - default_context - ) - q = get_device_cached_queue((default_context, alloc_device,)) - if dlm_tensor.dl_tensor.dtype.bits % 8: - raise BufferError( - "Can not import DLPack tensor whose element's " - "bitsize is not a multiple of 8" - ) - if dlm_tensor.dl_tensor.dtype.lanes != 1: - raise BufferError( - "Can not import DLPack tensor with lanes != 1" - ) - offset_min = 0 - if 
dlm_tensor.dl_tensor.strides is NULL: - for i in range(dlm_tensor.dl_tensor.ndim): - sz = sz * dlm_tensor.dl_tensor.shape[i] - offset_max = sz - 1 - else: - offset_max = 0 - for i in range(dlm_tensor.dl_tensor.ndim): - stride_i = dlm_tensor.dl_tensor.strides[i] - shape_i = dlm_tensor.dl_tensor.shape[i] - if shape_i > 1: - shape_i -= 1 - if stride_i > 0: - offset_max = offset_max + stride_i * shape_i - else: - offset_min = offset_min + stride_i * shape_i - sz = offset_max - offset_min + 1 - if sz == 0: - sz = 1 - - element_bytesize = (dlm_tensor.dl_tensor.dtype.bits // 8) - sz = sz * element_bytesize - element_offset = dlm_tensor.dl_tensor.byte_offset // element_bytesize - - # transfer dlm_tensor ownership - dlm_holder = _DLManagedTensorOwner._create(dlm_tensor) - cpython.PyCapsule_SetName(py_caps, 'used_dltensor') - - if dlm_tensor.dl_tensor.data is NULL: - usm_mem = dpmem.MemoryUSMDevice(sz, q) - else: - mem_ptr_delta = dlm_tensor.dl_tensor.byte_offset - ( - element_offset * element_bytesize - ) - mem_ptr = dlm_tensor.dl_tensor.data - alloc_sz = dlm_tensor.dl_tensor.byte_offset + ( - (offset_max + 1) * element_bytesize) - tmp = c_dpmem._Memory.create_from_usm_pointer_size_qref( - mem_ptr, - max(alloc_sz, element_bytesize), - (q).get_queue_ref(), - memory_owner=dlm_holder - ) - if mem_ptr_delta == 0: - usm_mem = tmp - else: - alloc_sz = dlm_tensor.dl_tensor.byte_offset + ( - (offset_max * element_bytesize + mem_ptr_delta)) - usm_mem = c_dpmem._Memory.create_from_usm_pointer_size_qref( - (mem_ptr + (element_bytesize - mem_ptr_delta)), - max(alloc_sz, element_bytesize), - (q).get_queue_ref(), - memory_owner=tmp - ) - py_shape = list() - for i in range(dlm_tensor.dl_tensor.ndim): - py_shape.append(dlm_tensor.dl_tensor.shape[i]) - if (dlm_tensor.dl_tensor.strides is NULL): - py_strides = None - else: - py_strides = list() - for i in range(dlm_tensor.dl_tensor.ndim): - py_strides.append(dlm_tensor.dl_tensor.strides[i]) - if (dlm_tensor.dl_tensor.dtype.code == kDLUInt): 
- ary_dt = np.dtype("u" + str(element_bytesize)) - elif (dlm_tensor.dl_tensor.dtype.code == kDLInt): - ary_dt = np.dtype("i" + str(element_bytesize)) - elif (dlm_tensor.dl_tensor.dtype.code == kDLFloat): - ary_dt = np.dtype("f" + str(element_bytesize)) - elif (dlm_tensor.dl_tensor.dtype.code == kDLComplex): - ary_dt = np.dtype("c" + str(element_bytesize)) - elif (dlm_tensor.dl_tensor.dtype.code == kDLBool): - ary_dt = np.dtype("?") - else: + if cpython.PyCapsule_IsValid(py_caps, 'dltensor'): + dlm_tensor = cpython.PyCapsule_GetPointer( + py_caps, "dltensor") + dl_tensor = &dlm_tensor.dl_tensor + elif cpython.PyCapsule_IsValid(py_caps, 'dltensor_versioned'): + dlmv_tensor = cpython.PyCapsule_GetPointer( + py_caps, "dltensor_versioned") + if dlmv_tensor.version.major > DLPACK_MAJOR_VERSION: raise BufferError( - "Can not import DLPack tensor with type code {}.".format( - dlm_tensor.dl_tensor.dtype.code - ) + "Can not import DLPack tensor with major version " + f"greater than {DLPACK_MAJOR_VERSION}" ) - res_ary = usm_ndarray( - py_shape, - dtype=ary_dt, - buffer=usm_mem, - strides=py_strides, - offset=element_offset + versioned = 1 + readonly = (dlmv_tensor.flags & DLPACK_FLAG_BITMASK_READ_ONLY) != 0 + dl_tensor = &dlmv_tensor.dl_tensor + elif cpython.PyCapsule_IsValid(py_caps, 'used_dltensor') or cpython.PyCapsule_IsValid(py_caps, 'used_dltensor_versioned'): + raise ValueError( + "A DLPack tensor object can not be consumed multiple times" ) - return res_ary - elif dlm_tensor.dl_tensor.device.device_type == kDLCPU: - ary_iface = _numpy_array_interface_from_dl_tensor(dlm_tensor.dl_tensor, False) - dlm_holder = _DLManagedTensorOwner._create(dlm_tensor) - cpython.PyCapsule_SetName(py_caps, 'used_dltensor') - return np.ctypeslib.as_array(_numpy_array_interface_wrapper(ary_iface, py_caps)) else: - raise BufferError( - "The DLPack tensor resides on unsupported device." 
+ raise TypeError( + "`from_dlpack_capsule` expects a Python 'dltensor' capsule" ) - -cpdef object from_dlpack_versioned_capsule(object py_caps): - """ - from_dlpack_versioned_capsule(py_caps) - - Reconstructs instance of :class:`dpctl.tensor.usm_ndarray` from - named Python capsule object referencing instance of - ``DLManagedTensorVersioned`` without copy. The instance forms a - view in the memory of the tensor. - - Args: - caps: - Python capsule with name ``"dltensor_versioned"`` expected - to reference an instance of ``DLManagedTensorVersioned`` - struct. - Returns: - Instance of :class:`dpctl.tensor.usm_ndarray` with a view into - memory of the tensor. Capsule is renamed to - ``"used_dltensor_versioned"`` upon success. - Raises: - TypeError: - if argument is not a ``"dltensor_versioned"`` capsule. - ValueError: - if argument is ``"used_dltensor_versioned"`` capsule - BufferError: - if the USM pointer is not bound to the reconstructed - sycl context, or the DLPack's device_type is not supported - by :mod:`dpctl`. 
- """ - cdef DLManagedTensorVersioned *dlmv_tensor = NULL - cdef bytes usm_type - cdef size_t sz = 1 - cdef size_t alloc_sz = 1 - cdef int i - cdef int device_id = -1 - cdef int element_bytesize = 0 - cdef Py_ssize_t offset_min = 0 - cdef Py_ssize_t offset_max = 0 - cdef char *mem_ptr = NULL - cdef Py_ssize_t mem_ptr_delta = 0 - cdef Py_ssize_t element_offset = 0 - cdef int64_t stride_i = -1 - cdef int64_t shape_i = -1 - - if not cpython.PyCapsule_IsValid(py_caps, 'dltensor_versioned'): - if cpython.PyCapsule_IsValid(py_caps, 'used_dltensor_versioned'): - raise ValueError( - "A DLPack tensor object can not be consumed multiple times" - ) - else: - raise TypeError( - "`from_dlpack_versioned_capsule` expects a Python " - "'dltensor_versioned' capsule" - ) - dlmv_tensor = cpython.PyCapsule_GetPointer( - py_caps, "dltensor_versioned") # Verify that we can work with this device - if dlmv_tensor.dl_tensor.device.device_type == kDLOneAPI: - device_id = dlmv_tensor.dl_tensor.device.device_id + if dl_tensor.device.device_type == kDLOneAPI: + device_id = dl_tensor.device.device_id root_device = dpctl.SyclDevice(str(device_id)) try: default_context = root_device.sycl_platform.default_context except RuntimeError: default_context = get_device_cached_queue(root_device).sycl_context - if dlmv_tensor.dl_tensor.data is NULL: + if dl_tensor.data is NULL: usm_type = b"device" q = get_device_cached_queue((default_context, root_device,)) else: usm_type = c_dpmem._Memory.get_pointer_type( - dlmv_tensor.dl_tensor.data, + dl_tensor.data, default_context) if usm_type == b"unknown": raise BufferError( @@ -958,34 +787,29 @@ cpdef object from_dlpack_versioned_capsule(object py_caps): f"{root_device.filter_string}" ) alloc_device = c_dpmem._Memory.get_pointer_device( - dlmv_tensor.dl_tensor.data, + dl_tensor.data, default_context ) q = get_device_cached_queue((default_context, alloc_device,)) - if dlmv_tensor.dl_tensor.dtype.bits % 8: + if dl_tensor.dtype.bits % 8: raise BufferError( "Can not 
import DLPack tensor whose element's " "bitsize is not a multiple of 8" ) - if dlmv_tensor.dl_tensor.dtype.lanes != 1: + if dl_tensor.dtype.lanes != 1: raise BufferError( "Can not import DLPack tensor with lanes != 1" ) - if dlmv_tensor.version.major > DLPACK_MAJOR_VERSION: - raise BufferError( - "Can not import DLPack tensor with major version " - f"greater than {DLPACK_MAJOR_VERSION}" - ) offset_min = 0 - if dlmv_tensor.dl_tensor.strides is NULL: - for i in range(dlmv_tensor.dl_tensor.ndim): - sz = sz * dlmv_tensor.dl_tensor.shape[i] + if dl_tensor.strides is NULL: + for i in range(dl_tensor.ndim): + sz = sz * dl_tensor.shape[i] offset_max = sz - 1 else: offset_max = 0 - for i in range(dlmv_tensor.dl_tensor.ndim): - stride_i = dlmv_tensor.dl_tensor.strides[i] - shape_i = dlmv_tensor.dl_tensor.shape[i] + for i in range(dl_tensor.ndim): + stride_i = dl_tensor.strides[i] + shape_i = dl_tensor.shape[i] if shape_i > 1: shape_i -= 1 if stride_i > 0: @@ -996,33 +820,37 @@ cpdef object from_dlpack_versioned_capsule(object py_caps): if sz == 0: sz = 1 - element_bytesize = (dlmv_tensor.dl_tensor.dtype.bits // 8) + element_bytesize = (dl_tensor.dtype.bits // 8) sz = sz * element_bytesize - element_offset = dlmv_tensor.dl_tensor.byte_offset // element_bytesize + element_offset = dl_tensor.byte_offset // element_bytesize - # transfer dlmv_tensor ownership - dlmv_holder = _DLManagedTensorVersionedOwner._create(dlmv_tensor) - cpython.PyCapsule_SetName(py_caps, 'used_dltensor_versioned') + # transfer ownership + if not versioned: + dlm_holder = _DLManagedTensorOwner._create(dlm_tensor) + cpython.PyCapsule_SetName(py_caps, 'used_dltensor') + else: + dlmv_holder = _DLManagedTensorVersionedOwner._create(dlmv_tensor) + cpython.PyCapsule_SetName(py_caps, 'used_dltensor_versioned') - if dlmv_tensor.dl_tensor.data is NULL: + if dl_tensor.data is NULL: usm_mem = dpmem.MemoryUSMDevice(sz, q) else: - mem_ptr_delta = dlmv_tensor.dl_tensor.byte_offset - ( + mem_ptr_delta = 
dl_tensor.byte_offset - ( element_offset * element_bytesize ) - mem_ptr = dlmv_tensor.dl_tensor.data - alloc_sz = dlmv_tensor.dl_tensor.byte_offset + ( + mem_ptr = dl_tensor.data + alloc_sz = dl_tensor.byte_offset + ( (offset_max + 1) * element_bytesize) tmp = c_dpmem._Memory.create_from_usm_pointer_size_qref( mem_ptr, max(alloc_sz, element_bytesize), (q).get_queue_ref(), - memory_owner=dlmv_holder + memory_owner=dlmv_holder if versioned else dlm_holder ) if mem_ptr_delta == 0: usm_mem = tmp else: - alloc_sz = dlmv_tensor.dl_tensor.byte_offset + ( + alloc_sz = dl_tensor.byte_offset + ( (offset_max * element_bytesize + mem_ptr_delta)) usm_mem = c_dpmem._Memory.create_from_usm_pointer_size_qref( (mem_ptr + (element_bytesize - mem_ptr_delta)), @@ -1031,28 +859,28 @@ cpdef object from_dlpack_versioned_capsule(object py_caps): memory_owner=tmp ) py_shape = list() - for i in range(dlmv_tensor.dl_tensor.ndim): - py_shape.append(dlmv_tensor.dl_tensor.shape[i]) - if (dlmv_tensor.dl_tensor.strides is NULL): + for i in range(dl_tensor.ndim): + py_shape.append(dl_tensor.shape[i]) + if (dl_tensor.strides is NULL): py_strides = None else: py_strides = list() - for i in range(dlmv_tensor.dl_tensor.ndim): - py_strides.append(dlmv_tensor.dl_tensor.strides[i]) - if (dlmv_tensor.dl_tensor.dtype.code == kDLUInt): + for i in range(dl_tensor.ndim): + py_strides.append(dl_tensor.strides[i]) + if (dl_tensor.dtype.code == kDLUInt): ary_dt = np.dtype("u" + str(element_bytesize)) - elif (dlmv_tensor.dl_tensor.dtype.code == kDLInt): + elif (dl_tensor.dtype.code == kDLInt): ary_dt = np.dtype("i" + str(element_bytesize)) - elif (dlmv_tensor.dl_tensor.dtype.code == kDLFloat): + elif (dl_tensor.dtype.code == kDLFloat): ary_dt = np.dtype("f" + str(element_bytesize)) - elif (dlmv_tensor.dl_tensor.dtype.code == kDLComplex): + elif (dl_tensor.dtype.code == kDLComplex): ary_dt = np.dtype("c" + str(element_bytesize)) - elif (dlmv_tensor.dl_tensor.dtype.code == kDLBool): + elif (dl_tensor.dtype.code == 
kDLBool): ary_dt = np.dtype("?") else: raise BufferError( "Can not import DLPack tensor with type code {}.".format( - dlmv_tensor.dl_tensor.dtype.code + dl_tensor.dtype.code ) ) res_ary = usm_ndarray( @@ -1062,14 +890,17 @@ cpdef object from_dlpack_versioned_capsule(object py_caps): strides=py_strides, offset=element_offset ) - if (dlmv_tensor.flags & DLPACK_FLAG_BITMASK_READ_ONLY): + if readonly: res_ary.flags_ = (res_ary.flags_ & ~USM_ARRAY_WRITABLE) return res_ary - elif dlmv_tensor.dl_tensor.device.device_type == kDLCPU: - ro_flag = dlmv_tensor.flags & DLPACK_FLAG_BITMASK_READ_ONLY - ary_iface = _numpy_array_interface_from_dl_tensor(dlmv_tensor.dl_tensor, ro_flag) - dlmv_holder = _DLManagedTensorVersionedOwner._create(dlmv_tensor) - cpython.PyCapsule_SetName(py_caps, 'used_dltensor_versioned') + elif dl_tensor.device.device_type == kDLCPU: + ary_iface = _numpy_array_interface_from_dl_tensor(dl_tensor, readonly) + if not versioned: + dlm_holder = _DLManagedTensorOwner._create(dlm_tensor) + cpython.PyCapsule_SetName(py_caps, 'used_dltensor') + else: + dlmv_holder = _DLManagedTensorVersionedOwner._create(dlmv_tensor) + cpython.PyCapsule_SetName(py_caps, 'used_dltensor_versioned') return np.ctypeslib.as_array(_numpy_array_interface_wrapper(ary_iface, py_caps)) else: raise BufferError( @@ -1175,7 +1006,7 @@ def from_dlpack(x, /, *, device=None, copy=None): else: dl_device = (device_OneAPI, get_parent_device_ordinal_id(device)) dlpack_capsule = dlpack_attr(max_version=get_build_dlpack_version(), dl_device=dl_device, copy=copy) - return from_dlpack_versioned_capsule(dlpack_capsule) + return from_dlpack_capsule(dlpack_capsule) except TypeError: dlpack_capsule = dlpack_attr() return from_dlpack_capsule(dlpack_capsule) diff --git a/dpctl/tests/test_usm_ndarray_dlpack.py b/dpctl/tests/test_usm_ndarray_dlpack.py index a4994f01e6..527116b1d0 100644 --- a/dpctl/tests/test_usm_ndarray_dlpack.py +++ b/dpctl/tests/test_usm_ndarray_dlpack.py @@ -301,20 +301,20 @@ def 
test_versioned_dlpack_capsule(): max_supported_ver = _dlp.get_build_dlpack_version() cap = x.__dlpack__(max_version=max_supported_ver) - y = _dlp.from_dlpack_versioned_capsule(cap) + y = _dlp.from_dlpack_capsule(cap) del cap assert x._pointer == y._pointer x2 = dpt.asarray(dpt.reshape(x, (10, 10)), order="F") cap = x2.__dlpack__(max_version=max_supported_ver) - y = _dlp.from_dlpack_versioned_capsule(cap) + y = _dlp.from_dlpack_capsule(cap) del cap assert x2._pointer == y._pointer del x2 x3 = x[::-2] cap = x3.__dlpack__(max_version=max_supported_ver) - y = _dlp.from_dlpack_versioned_capsule(cap) + y = _dlp.from_dlpack_capsule(cap) assert x3._pointer == y._pointer del x3, y, x del cap @@ -323,13 +323,13 @@ def test_versioned_dlpack_capsule(): x = dpt.arange(100, dtype="i4") x.flags["W"] = False cap = x.__dlpack__(max_version=max_supported_ver) - y = _dlp.from_dlpack_versioned_capsule(cap) + y = _dlp.from_dlpack_capsule(cap) assert x._pointer == y._pointer assert not y.flags.writable # read-only array, and copy cap = x.__dlpack__(max_version=max_supported_ver, copy=True) - y = _dlp.from_dlpack_versioned_capsule(cap) + y = _dlp.from_dlpack_capsule(cap) assert x._pointer != y._pointer assert not y.flags.writable @@ -399,12 +399,12 @@ def test_used_dlpack_capsule(): max_supported_ver = _dlp.get_build_dlpack_version() cap = x.__dlpack__(max_version=max_supported_ver) - _dlp.from_dlpack_versioned_capsule(cap) + _dlp.from_dlpack_capsule(cap) with pytest.raises( ValueError, match="A DLPack tensor object can not be consumed multiple times", ): - _dlp.from_dlpack_versioned_capsule(cap) + _dlp.from_dlpack_capsule(cap) del cap @@ -421,7 +421,7 @@ def test_dlpack_size_0(): max_supported_ver = _dlp.get_build_dlpack_version() cap = x.__dlpack__(max_version=max_supported_ver) - y = _dlp.from_dlpack_versioned_capsule(cap) + y = _dlp.from_dlpack_capsule(cap) assert y._pointer == x._pointer @@ -459,14 +459,14 @@ def test_dlpack_kwargs(): x1 = dpt.arange(100, dtype="i4", sycl_queue=q1) 
max_supported_ver = _dlp.get_build_dlpack_version() cap = x1.__dlpack__(stream=q2, max_version=max_supported_ver, copy=False) - y = _dlp.from_dlpack_versioned_capsule(cap) + y = _dlp.from_dlpack_capsule(cap) assert y._pointer == x1._pointer del x1, y del cap x2 = dpt.arange(100, dtype="i4", sycl_queue=q1) cap = x2.__dlpack__(stream=q2, max_version=max_supported_ver, copy=True) - y = _dlp.from_dlpack_versioned_capsule(cap) + y = _dlp.from_dlpack_capsule(cap) assert y._pointer != x2._pointer del x2, y del cap From d326314923ee77543416a6b80897c735d9e8d9f4 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Fri, 2 Aug 2024 14:43:00 -0700 Subject: [PATCH 03/18] Fix Cython warning caused by cimporting from Numpy When using cimport with Numpy, a warning for a deprecated C-API is thrown NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION is now defined for targets in `build_dpctl_ext` to address this warning --- dpctl/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dpctl/CMakeLists.txt b/dpctl/CMakeLists.txt index d481f0fb41..6609dc7d60 100644 --- a/dpctl/CMakeLists.txt +++ b/dpctl/CMakeLists.txt @@ -125,6 +125,8 @@ function(build_dpctl_ext _trgt _src _dest) ) endif() endif() + # needed to prevent warnings from Cython caused by cimporting from Numpy + target_compile_definitions(${_trgt} PRIVATE NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION) target_link_libraries(${_trgt} PRIVATE Python::NumPy) if (DPCTL_GENERATE_COVERAGE) target_compile_definitions(${_trgt} PRIVATE CYTHON_TRACE=1 CYTHON_TRACE_NOGIL=1) From 1ec9a76a81f97e0fefd9649a4c4d0e260a8a5b2e Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Fri, 2 Aug 2024 20:20:04 -0700 Subject: [PATCH 04/18] Handle Numpy deprecated API without touching CMake Instead cdef from `numpy/npy_no_deprecated_api.h` --- dpctl/CMakeLists.txt | 2 -- dpctl/tensor/_dlpack.pxd | 2 ++ dpctl/tensor/_dlpack.pyx | 3 +++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/dpctl/CMakeLists.txt b/dpctl/CMakeLists.txt index 
6609dc7d60..d481f0fb41 100644 --- a/dpctl/CMakeLists.txt +++ b/dpctl/CMakeLists.txt @@ -125,8 +125,6 @@ function(build_dpctl_ext _trgt _src _dest) ) endif() endif() - # needed to prevent warnings from Cython caused by cimporting from Numpy - target_compile_definitions(${_trgt} PRIVATE NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION) target_link_libraries(${_trgt} PRIVATE Python::NumPy) if (DPCTL_GENERATE_COVERAGE) target_compile_definitions(${_trgt} PRIVATE CYTHON_TRACE=1 CYTHON_TRACE_NOGIL=1) diff --git a/dpctl/tensor/_dlpack.pxd b/dpctl/tensor/_dlpack.pxd index 13abfa3453..81ecf16967 100644 --- a/dpctl/tensor/_dlpack.pxd +++ b/dpctl/tensor/_dlpack.pxd @@ -18,6 +18,8 @@ # cython: language_level=3 # cython: linetrace=True +cdef extern from "numpy/npy_no_deprecated_api.h": + pass from numpy cimport ndarray from .._sycl_device cimport SyclDevice diff --git a/dpctl/tensor/_dlpack.pyx b/dpctl/tensor/_dlpack.pyx index defdb64394..bdef159014 100644 --- a/dpctl/tensor/_dlpack.pyx +++ b/dpctl/tensor/_dlpack.pyx @@ -18,6 +18,9 @@ # cython: language_level=3 # cython: linetrace=True +cdef extern from "numpy/npy_no_deprecated_api.h": + pass + cimport cpython from libc cimport stdlib from libc.stdint cimport int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t From 7b6a651e57f5fb5238e4710644a1d9e14c0a091d Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 5 Aug 2024 07:45:42 -0700 Subject: [PATCH 05/18] Introduce _is_host_cpu utility predicate used in __dlpack__ _is_host_cpu(dl_device) checks if the user requests export to the host CPU device. 
Recognized inputs are (1, 0) (_usmarray.DLDeviceType.kDLCPU, 0) ("kDLCPU", 0) Add test to exercise __dlpack__ with non-default dl_device keyword arguments --- dpctl/tensor/_usmarray.pyx | 27 ++++++++++++++++++-- dpctl/tests/test_usm_ndarray_dlpack.py | 35 ++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx index 7266efa3e6..6a9b775e80 100644 --- a/dpctl/tensor/_usmarray.pyx +++ b/dpctl/tensor/_usmarray.pyx @@ -87,10 +87,34 @@ cdef object _as_zero_dim_ndarray(object usm_ary): view.shape = tuple() return view + cdef int _copy_writable(int lhs_flags, int rhs_flags): "Copy the WRITABLE flag to lhs_flags from rhs_flags" return (lhs_flags & ~USM_ARRAY_WRITABLE) | (rhs_flags & USM_ARRAY_WRITABLE) + +cdef bint _is_host_cpu(object dl_device): + "Check if dl_device denotes (kDLCPU, 0)" + cdef object dl_type + cdef object dl_id + cdef Py_ssize_t n_elems = -1 + + try: + n_elems = len(dl_device) + except TypeError: + pass + + if n_elems != 2: + return False + + dl_type = dl_device[0] + dl_id = dl_device[1] + if isinstance(dl_type, str): + return (dl_type == "kDLCPU" and dl_id == 0) + + return (dl_type == DLDeviceType.kDLCPU) and (dl_id == 0) + + cdef class usm_ndarray: """ usm_ndarray(shape, dtype=None, strides=None, buffer="device", \ offset=0, order="C", buffer_ctor_kwargs=dict(), \ @@ -1148,8 +1172,7 @@ cdef class usm_ndarray: raise BufferError( "array cannot be placed on the requested device without a copy" ) - if dl_device[0] == (DLDeviceType.kDLCPU): - assert dl_device[1] == 0 + if _is_host_cpu(dl_device): if stream is not None: raise ValueError( "`stream` must be `None` when `dl_device` is of type `kDLCPU`" diff --git a/dpctl/tests/test_usm_ndarray_dlpack.py b/dpctl/tests/test_usm_ndarray_dlpack.py index 527116b1d0..e8d7885b1a 100644 --- a/dpctl/tests/test_usm_ndarray_dlpack.py +++ b/dpctl/tests/test_usm_ndarray_dlpack.py @@ -23,6 +23,7 @@ import dpctl import dpctl.tensor as dpt 
import dpctl.tensor._dlpack as _dlp +import dpctl.tensor._usmarray as dpt_arr device_oneAPI = 14 # DLDeviceType.kDLOneAPI @@ -470,3 +471,37 @@ def test_dlpack_kwargs(): assert y._pointer != x2._pointer del x2, y del cap + + +def _is_capsule(o): + t = type(o) + return t.__module__ == "builtins" and t.__name__ == "PyCapsule" + + +def test_dlpack_dl_device(): + try: + x = dpt.arange(100, dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + max_supported_ver = _dlp.get_build_dlpack_version() + cap1 = x.__dlpack__( + dl_device=x.__dlpack_device__(), max_version=max_supported_ver + ) + assert _is_capsule(cap1) + cap2 = x.__dlpack__(dl_device=(1, 0), max_version=max_supported_ver) + assert _is_capsule(cap2) + cap3 = x.__dlpack__( + dl_device=(dpt_arr.DLDeviceType.kDLCPU, 0), + max_version=max_supported_ver, + ) + assert _is_capsule(cap3) + cap4 = x.__dlpack__(dl_device=("kDLCPU", 0), max_version=max_supported_ver) + assert _is_capsule(cap4) + with pytest.raises(NotImplementedError): + # pass method instead of return of its __call__ invocation + x.__dlpack__( + dl_device=x.__dlpack_device__, max_version=max_supported_ver + ) + with pytest.raises(NotImplementedError): + # exercise check for length + x.__dlpack__(dl_device=(3,), max_version=max_supported_ver) From c3655ed8a0d2449bfa7d83de314bf0538fd5849e Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Sun, 4 Aug 2024 23:59:02 -0700 Subject: [PATCH 06/18] Fixes a typo when making NumPy array interface from a boolean array in `from_dlpack` --- dpctl/tensor/_dlpack.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpctl/tensor/_dlpack.pyx b/dpctl/tensor/_dlpack.pyx index bdef159014..b9ed3f3cee 100644 --- a/dpctl/tensor/_dlpack.pyx +++ b/dpctl/tensor/_dlpack.pyx @@ -659,7 +659,7 @@ cdef dict _numpy_array_interface_from_dl_tensor(DLTensor *dlt, bint ro_flag): elif (dlt.dtype.code == kDLComplex): ary_dt = "c" + str(itemsize) elif (dlt.dtype.code == kDLBool): - 
ary_dt = np.dtype("?") + ary_dt = "b" + str(itemsize) else: raise BufferError( "Can not import DLPack tensor with type code {}.".format( From cb80f559c84cf72d81435d62d195d9e6b2049f7a Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Mon, 5 Aug 2024 10:39:49 -0700 Subject: [PATCH 07/18] Adds validation for `dl_device` argument in `__dlpack__` --- dpctl/tensor/_usmarray.pyx | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx index 6a9b775e80..ae7f68e51f 100644 --- a/dpctl/tensor/_usmarray.pyx +++ b/dpctl/tensor/_usmarray.pyx @@ -1167,6 +1167,12 @@ cdef class usm_ndarray: if max_version[0] >= dpctl_dlpack_version[0]: # DLManagedTensorVersioned path if dl_device is not None: + if not isinstance(dl_device, tuple) or len(dl_device) != 2: + raise TypeError( + "`__dlpack__` expects `dl_device` to be a " + "2-tuple of `(device_type, device_id)`, instead " + f"got {type(dl_device)}" + ) if dl_device != self.__dlpack_device__(): if copy == False: raise BufferError( From 3c35c3b5a3770090217361fba9b8ffbac058bebb Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Mon, 5 Aug 2024 14:20:11 -0700 Subject: [PATCH 08/18] Change per PR review by @oleksandr-pavlyk and pass NULL strides to dl_tensor if NumPy array is C-contiguous --- dpctl/tensor/_dlpack.pyx | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/dpctl/tensor/_dlpack.pyx b/dpctl/tensor/_dlpack.pyx index b9ed3f3cee..7136761918 100644 --- a/dpctl/tensor/_dlpack.pyx +++ b/dpctl/tensor/_dlpack.pyx @@ -515,27 +515,43 @@ cpdef numpy_to_dlpack_versioned_capsule(ndarray npy_ary, bint copied): cdef int i = 0 cdef int device_id = -1 cdef Py_ssize_t byte_offset = 0 + cdef int itemsize = npy_ary.itemsize dlmv_tensor = stdlib.malloc( sizeof(DLManagedTensorVersioned)) if dlmv_tensor is NULL: raise MemoryError( - "to_dlpack_versioned_capsule: Could not allocate memory " + "numpy_to_dlpack_versioned_capsule: Could not allocate
memory " "for DLManagedTensorVersioned" ) - shape_strides_ptr = stdlib.malloc((sizeof(int64_t) * 2) * nd) + + is_c_contiguous = npy_ary.flags["C"] + shape = npy_ary.ctypes.shape_as(ctypes.c_int64) + strides = npy_ary.ctypes.strides_as(ctypes.c_int64) + if not is_c_contiguous: + if npy_ary.size != 1: + for i in range(nd): + if shape[i] != 1 and strides[i] % itemsize != 0: + stdlib.free(dlmv_tensor) + raise BufferError( + "numpy_to_dlpack_versioned_capsule: DLPack cannot encode " + "an array if strides are not a multiple of itemsize" + ) + shape_strides_ptr = stdlib.malloc((sizeof(int64_t) * 2) * nd) + else: + # no need to pass strides in this case + shape_strides_ptr = stdlib.malloc(sizeof(int64_t) * nd) if shape_strides_ptr is NULL: stdlib.free(dlmv_tensor) raise MemoryError( - "to_dlpack_versioned_capsule: Could not allocate memory " + "numpy_to_dlpack_versioned_capsule: Could not allocate memory " "for shape/strides" ) - # this can be a separate function for handling shapes and strides - shape = npy_ary.ctypes.shape_as(ctypes.c_int64) - strides = npy_ary.ctypes.strides_as(ctypes.c_int64) for i in range(nd): shape_strides_ptr[i] = shape[i] - shape_strides_ptr[nd + i] = strides[i] // npy_ary.itemsize + if not is_c_contiguous: + shape_strides_ptr[nd + i] = strides[i] // itemsize + writable_flag = npy_ary.flags["W"] ary_dt = npy_ary.dtype @@ -546,7 +562,7 @@ cpdef numpy_to_dlpack_versioned_capsule(ndarray npy_ary, bint copied): dl_tensor.ndim = nd dl_tensor.byte_offset = byte_offset dl_tensor.shape = &shape_strides_ptr[0] - dl_tensor.strides = &shape_strides_ptr[nd] + dl_tensor.strides = &shape_strides_ptr[nd] if not is_c_contiguous else NULL dl_tensor.device.device_type = kDLCPU dl_tensor.device.device_id = 0 dl_tensor.dtype.lanes = 1 From f781400ef72f6de26b3b8eb4ba7b51625cfe0e1b Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Mon, 5 Aug 2024 16:20:44 -0700 Subject: [PATCH 09/18] Adds tests for DLPack using `kDLCPU` dl_device --- 
dpctl/tests/test_usm_ndarray_dlpack.py | 201 ++++++++++++++++++++++++- 1 file changed, 196 insertions(+), 5 deletions(-) diff --git a/dpctl/tests/test_usm_ndarray_dlpack.py b/dpctl/tests/test_usm_ndarray_dlpack.py index e8d7885b1a..344080e2ae 100644 --- a/dpctl/tests/test_usm_ndarray_dlpack.py +++ b/dpctl/tests/test_usm_ndarray_dlpack.py @@ -17,15 +17,17 @@ import collections import ctypes +import numpy as np import pytest -from helper import skip_if_dtype_not_supported +from helper import get_queue_or_skip, skip_if_dtype_not_supported import dpctl import dpctl.tensor as dpt import dpctl.tensor._dlpack as _dlp import dpctl.tensor._usmarray as dpt_arr -device_oneAPI = 14 # DLDeviceType.kDLOneAPI +device_CPU = dpt_arr.DLDeviceType.kDLCPU +device_oneAPI = dpt_arr.DLDeviceType.kDLOneAPI _usm_types_list = ["shared", "device", "host"] @@ -491,17 +493,206 @@ def test_dlpack_dl_device(): cap2 = x.__dlpack__(dl_device=(1, 0), max_version=max_supported_ver) assert _is_capsule(cap2) cap3 = x.__dlpack__( - dl_device=(dpt_arr.DLDeviceType.kDLCPU, 0), + dl_device=(device_CPU, 0), max_version=max_supported_ver, ) assert _is_capsule(cap3) cap4 = x.__dlpack__(dl_device=("kDLCPU", 0), max_version=max_supported_ver) assert _is_capsule(cap4) - with pytest.raises(NotImplementedError): + with pytest.raises(TypeError): # pass method instead of return of its __call__ invocation x.__dlpack__( dl_device=x.__dlpack_device__, max_version=max_supported_ver ) - with pytest.raises(NotImplementedError): + with pytest.raises(TypeError): # exercise check for length x.__dlpack__(dl_device=(3,), max_version=max_supported_ver) + + +def test_from_dlpack_kdlcpu_interop_numpy(): + """ + Basic test that usm_ndarray can interoperate with NumPy ndarray + `__dlpack_device__`. 
+ """ + get_queue_or_skip() + + sh = 5 + dt = dpt.int32 + + X = dpt.empty(sh, dtype=dt) + dl_device_np = np.empty(()).__dlpack_device__() + + Y = dpt.from_dlpack(X, device=dl_device_np) + assert isinstance(Y, np.ndarray) + assert X.shape == Y.shape + assert X.dtype == Y.dtype + + V = dpt.from_dlpack(Y) + assert isinstance(V, np.ndarray) + assert Y.shape == V.shape + assert Y.dtype == V.dtype + + +@pytest.mark.parametrize("shape", [tuple(), (2,), (3, 0, 1), (2, 2, 2)]) +def test_from_dlpack_to_kdlcpu(shape, typestr): + q = get_queue_or_skip() + skip_if_dtype_not_supported(typestr, q.sycl_device) + + X = dpt.empty(shape, dtype=typestr, sycl_queue=q) + Y = dpt.from_dlpack(X, device=(device_CPU, 0)) + assert isinstance(Y, np.ndarray) + assert X.shape == Y.shape + assert X.dtype == Y.dtype + # NumPy does not treat size 0 arrays consistently + # w.r.t. strides, so skip these cases + if X.ndim and X.size != 0: + V = Y[::-1] + W = dpt.from_dlpack(V) + assert V.strides == W.strides + + +@pytest.mark.parametrize("mod", [2, 5]) +def test_from_dlpack_to_kdlcpu_strides(mod, typestr): + q = get_queue_or_skip() + skip_if_dtype_not_supported(typestr, q.sycl_device) + + X0 = dpt.empty(3 * mod, dtype=typestr, sycl_queue=q) + for start in range(mod): + X = X0[slice(-start - 1, None, -mod)] + Y = dpt.from_dlpack(X, device=(device_CPU, 0)) + assert X.shape == Y.shape + assert X.dtype == Y.dtype + if Y.ndim: + V = Y[::-1] + W = dpt.from_dlpack(V) + assert V.strides == W.strides + + +def test_dlpack_from_subdevice_to_kdlcpu(): + """ + Check that array allocated on a sub-device can be + imported via DLPack to kDLCPU device (as a NumPy array). 
+ """ + n = 64 + try: + dev = dpctl.SyclDevice() + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + try: + sdevs = dev.create_sub_devices(partition="next_partitionable") + except dpctl.SyclSubDeviceCreationError: + sdevs = None + try: + if sdevs is None: + sdevs = dev.create_sub_devices(partition=[1, 1]) + except dpctl.SyclSubDeviceCreationError: + pytest.skip("Default device can not be partitioned") + assert isinstance(sdevs, list) and len(sdevs) > 0 + try: + ctx = sdevs[0].sycl_platform.default_context + except dpctl.SyclContextCreationError: + pytest.skip("Platform's default_context is not available") + try: + q = dpctl.SyclQueue(ctx, sdevs[0]) + except dpctl.SyclQueueCreationError: + pytest.skip("Queue could not be created") + + ar = dpt.arange(n, dtype=dpt.int32, sycl_queue=q) + ar2 = dpt.from_dlpack(ar, dl_device=(device_CPU, 0)) + assert isinstance(ar2, np.ndarray) + + +def test_legacy_dlpack_capsule_from_numpy(): + """ + Check that NumPy's exported legacy DLPack capsule + will interoperate with from_dlpack_capsule, + especially with zero-copy. 
+ """ + x = np.arange(100, dtype="i4") + cap = x.__dlpack__() + y = _dlp.from_dlpack_capsule(cap) + del cap + assert x.ctypes.data == y.ctypes.data + + x = np.arange(100, dtype="u4").reshape((10, 10)).T + cap = x.__dlpack__() + y = _dlp.from_dlpack_capsule(cap) + del cap + assert x.ctypes.data == y.ctypes.data + del x + + x = np.arange(100, dtype="f4").reshape((10, 10), order="F") + cap = x.__dlpack__() + y = _dlp.from_dlpack_capsule(cap) + del cap + assert x.ctypes.data == y.ctypes.data + + x = np.arange(100, dtype="c8") + x1 = x[::-2] + cap = x1.__dlpack__() + y = _dlp.from_dlpack_capsule(cap) + assert x1.ctypes.data == y.ctypes.data + del x1, y, x + del cap + + x = np.ones(100, dtype="?") + x1 = x[::-2] + cap = x1.__dlpack__() + y = _dlp.from_dlpack_capsule(cap) + assert x1.ctypes.data == y.ctypes.data + del x1, y, x + del cap + + +def test_dlpack_capsule_readonly_array_to_kdlcpu(): + try: + x = dpt.arange(100, dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + + max_supported_ver = _dlp.get_build_dlpack_version() + # read-only array + x.flags["W"] = False + cap = x.__dlpack__(max_version=max_supported_ver, dl_device=(device_CPU, 0)) + y = _dlp.from_dlpack_capsule(cap) + assert dpt.all(x == dpt.asarray(y)) + assert not y.flags["W"] + + cap1 = _dlp.numpy_to_dlpack_versioned_capsule(y, not y.flags["W"]) + y1 = _dlp.from_dlpack_capsule(cap1) + assert not y1.flags["W"] + + +def test_used_dlpack_capsule_from_numpy(): + get_queue_or_skip() + + x_np = np.arange(100, dtype="i4") + + cap = x_np.__dlpack__() + _dlp.from_dlpack_capsule(cap) + with pytest.raises( + ValueError, + match="A DLPack tensor object can not be consumed multiple times", + ): + _dlp.from_dlpack_capsule(cap) + del cap + + x = dpt.asarray(x_np) + max_supported_ver = _dlp.get_build_dlpack_version() + cap = x.__dlpack__(max_version=max_supported_ver, dl_device=(device_CPU, 0)) + _dlp.from_dlpack_capsule(cap) + with pytest.raises( + ValueError, + match="A 
DLPack tensor object can not be consumed multiple times", + ): + _dlp.from_dlpack_capsule(cap) + del cap + + +def test_dlpack_size_0_on_kdlcpu(): + get_queue_or_skip() + x_np = np.ones(0, dtype="i4") + + cap = x_np.__dlpack__() + y = _dlp.from_dlpack_capsule(cap) + assert y.ctypes.data == x_np.ctypes.data From 1544e9b3b7b08742b922e78631e35311c1f3a128 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Mon, 5 Aug 2024 17:20:37 -0700 Subject: [PATCH 10/18] Clean up errors in `__dlpack__` --- dpctl/tensor/_usmarray.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx index ae7f68e51f..2984159261 100644 --- a/dpctl/tensor/_usmarray.pyx +++ b/dpctl/tensor/_usmarray.pyx @@ -1161,7 +1161,7 @@ cdef class usm_ndarray: raise TypeError( "`__dlpack__` expects `max_version` to be a " "2-tuple of integers `(major, minor)`, instead " - f"got {type(max_version)}" + f"got {max_version}" ) dpctl_dlpack_version = get_build_dlpack_version() if max_version[0] >= dpctl_dlpack_version[0]: @@ -1169,9 +1169,9 @@ cdef class usm_ndarray: if dl_device is not None: if not isinstance(dl_device, tuple) or len(dl_device) != 2: raise TypeError( - "`__dlpack__` expects `dl_device` to be a " - "2-tuple of `(device_type, device_id)`, instead " - f"got {type(dl_device)}" + "`__dlpack__` expects `dl_device` to be a 2-tuple " + "of `(device_type, device_id)`, instead " + f"got {dl_device}" ) if dl_device != self.__dlpack_device__(): if copy == False: From 70c67729ff2eed07edb3155814aaea9884a48776 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Mon, 5 Aug 2024 20:56:35 -0700 Subject: [PATCH 11/18] Re-order conditional strides assignment Avoids possible out-of-bounds access by short-circuiting the if statement --- dpctl/tensor/_dlpack.pyx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dpctl/tensor/_dlpack.pyx b/dpctl/tensor/_dlpack.pyx index 7136761918..80fc1e207c 100644 --- 
a/dpctl/tensor/_dlpack.pyx +++ b/dpctl/tensor/_dlpack.pyx @@ -562,7 +562,10 @@ cpdef numpy_to_dlpack_versioned_capsule(ndarray npy_ary, bint copied): dl_tensor.ndim = nd dl_tensor.byte_offset = byte_offset dl_tensor.shape = &shape_strides_ptr[0] - dl_tensor.strides = &shape_strides_ptr[nd] if not is_c_contiguous else NULL + if is_c_contiguous: + dl_tensor.strides = NULL + else: + dl_tensor.strides = &shape_strides_ptr[nd] dl_tensor.device.device_type = kDLCPU dl_tensor.device.device_id = 0 dl_tensor.dtype.lanes = 1 From fe71e9947356c63be9eed12ea3a129442f8808ef Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Tue, 6 Aug 2024 15:33:54 -0700 Subject: [PATCH 12/18] Prevent losing reference of DLPack holder when returning NumPy array Passing the used Python capsule to `_numpy_array_interface_wrapper` would cause premature deallocation of the newly created DLPack owner object, causing the dl_tensor to be prematurely deleted Now pass the new owner object instead --- dpctl/tensor/_dlpack.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dpctl/tensor/_dlpack.pyx b/dpctl/tensor/_dlpack.pyx index 80fc1e207c..dca6b5b2e3 100644 --- a/dpctl/tensor/_dlpack.pyx +++ b/dpctl/tensor/_dlpack.pyx @@ -713,9 +713,9 @@ class _numpy_array_interface_wrapper: converted to numpy. 
""" - def __init__(self, array_interface, pycapsule) -> None: + def __init__(self, array_interface, memory_owner) -> None: self.__array_interface__ = array_interface - self._pycapsule = pycapsule + self._memory_owner = memory_owner cpdef object from_dlpack_capsule(object py_caps): @@ -920,10 +920,11 @@ cpdef object from_dlpack_capsule(object py_caps): if not versioned: dlm_holder = _DLManagedTensorOwner._create(dlm_tensor) cpython.PyCapsule_SetName(py_caps, 'used_dltensor') + return np.ctypeslib.as_array(_numpy_array_interface_wrapper(ary_iface, dlm_holder)) else: dlmv_holder = _DLManagedTensorVersionedOwner._create(dlmv_tensor) cpython.PyCapsule_SetName(py_caps, 'used_dltensor_versioned') - return np.ctypeslib.as_array(_numpy_array_interface_wrapper(ary_iface, py_caps)) + return np.ctypeslib.as_array(_numpy_array_interface_wrapper(ary_iface, dlmv_holder)) else: raise BufferError( "The DLPack tensor resides on unsupported device." From 85d417fc41ec5491d8ed87325169d2bf07180107 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 7 Aug 2024 19:04:17 -0700 Subject: [PATCH 13/18] Expose `DLDeviceType` in `dpctl.tensor` --- dpctl/tensor/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index dff75b9c2c..c6fb8f21ea 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -93,7 +93,7 @@ from dpctl.tensor._reshape import reshape from dpctl.tensor._search_functions import where from dpctl.tensor._statistical_functions import mean, std, var -from dpctl.tensor._usmarray import usm_ndarray +from dpctl.tensor._usmarray import DLDeviceType, usm_ndarray from dpctl.tensor._utility_functions import all, any, diff from ._accumulation import cumulative_logsumexp, cumulative_prod, cumulative_sum @@ -376,4 +376,5 @@ "nextafter", "diff", "count_nonzero", + "DLDeviceType", ] From 5ae872a986cade261467722ee62adf0cef59a58d Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 7 Aug 
2024 19:04:41 -0700 Subject: [PATCH 14/18] Docstring for DLDeviceType listing the valid enumerators --- dpctl/tensor/_usmarray.pyx | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx index 2984159261..f9d8278864 100644 --- a/dpctl/tensor/_usmarray.pyx +++ b/dpctl/tensor/_usmarray.pyx @@ -54,6 +54,40 @@ include "_slicing.pxi" class DLDeviceType(IntEnum): + """ + An ``IntEnum`` for the types of DLDevices supported by the DLPack + protocol. + ``kDLCPU``: + CPU (host) device + ``kDLCUDA``: + CUDA GPU device + ``kDLCUDAHost``: + Pinned CUDA CPU memory by cudaMallocHost + ``kDLOpenCL``: + OpenCL device + ``kDLVulkan``: + Vulkan buffer + ``kDLMetal``: + Metal for Apple GPU + ``kDLVPI``: + Verilog simulator buffer + ``kDLROCM``: + ROCm GPU device + ``kDLROCMHost``: + Pinned ROCm CPU memory allocated by hipMallocHost + ``kDLExtDev``: + Reserved extension device type used to test new devices + ``kDLCUDAManaged``: + CUDA managed/unified memory allocated by cudaMallocManaged + ``kDLOneAPI``: + Unified shared memory allocated on a oneAPI non-partitioned device + ``kDLWebGPU``: + Device support for WebGPU standard + ``kDLHexagon``: + Qualcomm Hexagon DSP + ``kDLMAIA``: + Microsoft MAIA device + """ kDLCPU = c_dlpack.device_CPU kDLCUDA = c_dlpack.device_CUDA kDLCUDAHost = c_dlpack.device_CUDAHost From 353be0d8ef3b66703d3a2474bc9dc23e316e553e Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 7 Aug 2024 19:08:30 -0700 Subject: [PATCH 15/18] Add documentation for constants and `DLDeviceType` enums --- .../api_reference/dpctl/tensor.constants.rst | 35 +++++++++++++++++++ .../api_reference/dpctl/tensor.rst | 2 ++ dpctl/tensor/_usmarray.pyx | 2 ++ 3 files changed, 39 insertions(+) create mode 100644 docs/doc_sources/api_reference/dpctl/tensor.constants.rst diff --git a/docs/doc_sources/api_reference/dpctl/tensor.constants.rst b/docs/doc_sources/api_reference/dpctl/tensor.constants.rst new 
file mode 100644 index 0000000000..2cb9f770d2 --- /dev/null +++ b/docs/doc_sources/api_reference/dpctl/tensor.constants.rst @@ -0,0 +1,35 @@ +.. _dpctl_tensor_constants: + +Constants +======================== + +The following constants are defined in :py:mod:`dpctl.tensor`: + +.. currentmodule:: dpctl.tensor + +.. autodata:: DLDeviceType + +.. data:: e + + ``float``: + IEEE 754 floating-point representation of Euler's constant. + +.. data:: inf + + ``float``: + IEEE 754 floating-point representation of (positive) infinity. + +.. data:: nan + + ``float``: + IEEE 754 floating-point representation of Not a Number (NaN). + +.. data:: newaxis + + ``NoneType``: + Alias for ``None`` which is useful for indexing. + +.. data:: pi + + ``float``: + IEEE 754 floating-point representation of the mathematical constant π. diff --git a/docs/doc_sources/api_reference/dpctl/tensor.rst b/docs/doc_sources/api_reference/dpctl/tensor.rst index d2aaa6fbc4..10e1f65d9f 100644 --- a/docs/doc_sources/api_reference/dpctl/tensor.rst +++ b/docs/doc_sources/api_reference/dpctl/tensor.rst @@ -29,6 +29,7 @@ This module contains: * :ref:`sorting functions ` * :ref:`statistical functions ` * :ref:`utility functions ` +* :ref:`constants ` .. toctree:: @@ -48,3 +49,4 @@ This module contains: tensor.sorting_functions tensor.statistical_functions tensor.utility_functions + tensor.constants diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx index f9d8278864..b1a428cbb4 100644 --- a/dpctl/tensor/_usmarray.pyx +++ b/dpctl/tensor/_usmarray.pyx @@ -1278,6 +1278,8 @@ cdef class usm_ndarray: The tuple describes the non-partitioned device where the array has been allocated, or the non-partitioned parent device of the allocation device. + See ``DLDeviceType`` for a list of devices supported by the DLPack protocol. + Raises: DLPackCreationError: when the ``device_id`` could not be determined. 
From 4c360d3ded8570c7c2bf15f68a26883fdeb51d44 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 8 Aug 2024 20:37:51 -0700 Subject: [PATCH 16/18] Add _is_kdlcpu_device utility to _dlpack.pyx Simplifies check for device_type = kDLCPU and device_id = 0 --- dpctl/tensor/_dlpack.pyx | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/dpctl/tensor/_dlpack.pyx b/dpctl/tensor/_dlpack.pyx index dca6b5b2e3..7cd6c26e25 100644 --- a/dpctl/tensor/_dlpack.pyx +++ b/dpctl/tensor/_dlpack.pyx @@ -718,6 +718,11 @@ class _numpy_array_interface_wrapper: self._memory_owner = memory_owner +cdef bint _is_kdlcpu_device(DLDevice *dev): + "Check if DLTensor.DLDevice denotes (kDLCPU, 0)" + return (dev.device_type == kDLCPU) and (dev.device_id == 0) + + cpdef object from_dlpack_capsule(object py_caps): """ from_dlpack_capsule(py_caps) @@ -915,7 +920,7 @@ cpdef object from_dlpack_capsule(object py_caps): if readonly: res_ary.flags_ = (res_ary.flags_ & ~USM_ARRAY_WRITABLE) return res_ary - elif dl_tensor.device.device_type == kDLCPU: + elif _is_kdlcpu_device(&dl_tensor.device): ary_iface = _numpy_array_interface_from_dl_tensor(dl_tensor, readonly) if not versioned: dlm_holder = _DLManagedTensorOwner._create(dlm_tensor) From dd4c0c0fc8ff3625415d646aa177f6685d379caf Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Fri, 9 Aug 2024 09:14:09 -0700 Subject: [PATCH 17/18] Minor change to `_is_kdlcpu_device` Better fits Cython style by using array-index-access to explicitly dereference pointer rather than relying on Cython to implicitly dereference it in code generation --- dpctl/tensor/_dlpack.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpctl/tensor/_dlpack.pyx b/dpctl/tensor/_dlpack.pyx index 7cd6c26e25..4741854a84 100644 --- a/dpctl/tensor/_dlpack.pyx +++ b/dpctl/tensor/_dlpack.pyx @@ -720,7 +720,7 @@ class _numpy_array_interface_wrapper: cdef bint _is_kdlcpu_device(DLDevice *dev): "Check if DLTensor.DLDevice denotes (kDLCPU, 0)" - return 
(dev.device_type == kDLCPU) and (dev.device_id == 0) + return (dev[0].device_type == kDLCPU) and (dev[0].device_id == 0) cpdef object from_dlpack_capsule(object py_caps): From 2661f517848c9c2e651880886c4ec0bdc262cb68 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 7 Aug 2024 08:55:30 -0500 Subject: [PATCH 18/18] Support copy-via-host in from_dlpack For arr that supports DLPack, version (1, 0), or legacy, support ``` from_dlpack(arr, device=target_dev) ``` where target_dev is `(kDLCPU, 0)` for transfer to host, or a value recognized by device keywords in dpctl.tensor for other functions, or `(kDLOneAPI, dev_id)`. To support transfer via host, `arr` must support `__dlpack__(max_version=(1,0), dl_device=(1, 0))`. For array objects with legacy `__dlpack__` support only, supported inputs are those residing on kDLCPU device, or those from kDLOneAPI device only. --- This is a combination of 17 commits squashed into one: Combine two validation checks into one, improving coverage Only fall-back to __dlpack__() if requested device does not change Simplify branching, only fall-back to no-arg call to __dlpack__ if dl_device is None or same as reported for the input Changed from_dlpack to copy via host if needed This enables dpt.from_dlpack(numpy_array, device="opencl:cpu") Add a test to exercise copy via host Handle possibilities for TypeError and BufferError These may be hard to test Change exception raised by __dlpack__ if dl_device is unsupported It used to raise NotImplementedError, now raises BufferError Add case of dlpack test to expand coverage Removed comment, add NotImplementedError to the except clause To ensure same validation across branches, compute host_blob by roundtripping it through dlpack Test from_dlpack on numpy input with strides not multiple of elementsize Refined from_dlpack docstrings, reorganized impl of from_dlpack Used try/except/else/finally to avoid raising an exception when another one is in flight (confusing UX).
device keyword is only allowed to be (kDLCPU, 0) or (kDLOneAPI, num). Device keyword value is used to create output array, rather than device_id deduced from it. Adjusted test per change in implementation Expand applicability of fall-back behavior When `from_dlpack(arr, device=dev)` is called, for `arr` object that supports legacy DLPack interface (max_version, dl_device, copy are not supported), we now support arr being device on host, that is (kDLCPU, 0), and (kDLOneAPI, different_device_id). Support for this last case is being added in this commit, as per review comment. Add symmetric support for containers with legacy DLPack support For legacy containers, support device=(kDLCPU, 0) as well as oneAPI device. Add tests for importing generic legacy and generic modern containers Fix typos in comments Add test for legacy container holding numpy's array. --- dpctl/tensor/_dlpack.pyx | 178 +++++++++++++++++++++---- dpctl/tensor/_usmarray.pyx | 2 +- dpctl/tests/test_usm_ndarray_dlpack.py | 129 ++++++++++++++++++ 3 files changed, 281 insertions(+), 28 deletions(-) diff --git a/dpctl/tensor/_dlpack.pyx b/dpctl/tensor/_dlpack.pyx index 4741854a84..098003ead2 100644 --- a/dpctl/tensor/_dlpack.pyx +++ b/dpctl/tensor/_dlpack.pyx @@ -168,7 +168,7 @@ cdef void _managed_tensor_versioned_deleter(DLManagedTensorVersioned *dlmv_tenso stdlib.free(dlmv_tensor) -cdef object _get_default_context(c_dpctl.SyclDevice dev) except *: +cdef object _get_default_context(c_dpctl.SyclDevice dev): try: default_context = dev.sycl_platform.default_context except RuntimeError: @@ -178,7 +178,7 @@ cdef object _get_default_context(c_dpctl.SyclDevice dev) except *: return default_context -cdef int get_parent_device_ordinal_id(c_dpctl.SyclDevice dev) except *: +cdef int get_parent_device_ordinal_id(c_dpctl.SyclDevice dev) except -1: cdef DPCTLSyclDeviceRef pDRef = NULL cdef DPCTLSyclDeviceRef tDRef = NULL cdef c_dpctl.SyclDevice p_dev @@ -201,7 +201,7 @@ cdef int 
get_parent_device_ordinal_id(c_dpctl.SyclDevice dev) except *: cdef int get_array_dlpack_device_id( usm_ndarray usm_ary -) except *: +) except -1: """Finds ordinal number of the parent of device where array was allocated. """ @@ -935,6 +935,32 @@ cpdef object from_dlpack_capsule(object py_caps): "The DLPack tensor resides on unsupported device." ) +cdef usm_ndarray _to_usm_ary_from_host_blob(object host_blob, dev : Device): + q = dev.sycl_queue + np_ary = np.asarray(host_blob) + dt = np_ary.dtype + if dt.char in "dD" and q.sycl_device.has_aspect_fp64 is False: + Xusm_dtype = ( + "float32" if dt.char == "d" else "complex64" + ) + else: + Xusm_dtype = dt + usm_mem = dpmem.MemoryUSMDevice(np_ary.nbytes, queue=q) + usm_ary = usm_ndarray(np_ary.shape, dtype=Xusm_dtype, buffer=usm_mem) + usm_mem.copy_from_host(np.reshape(np_ary.view(dtype="u1"), -1)) + return usm_ary + + +# only cdef to make it private +cdef object _create_device(object device, object dl_device): + if isinstance(device, Device): + return device + elif isinstance(device, dpctl.SyclDevice): + return Device.create_device(device) + else: + root_device = dpctl.SyclDevice(str(dl_device[1])) + return Device.create_device(root_device) + def from_dlpack(x, /, *, device=None, copy=None): """ from_dlpack(x, /, *, device=None, copy=None) @@ -943,7 +969,7 @@ def from_dlpack(x, /, *, device=None, copy=None): object ``x`` that implements ``__dlpack__`` protocol. Args: - x (Python object): + x (object): A Python object representing an array that supports ``__dlpack__`` protocol. device (Optional[str, @@ -959,7 +985,8 @@ def from_dlpack(x, /, *, device=None, copy=None): returned by :attr:`dpctl.tensor.usm_ndarray.device`, or a 2-tuple matching the format of the output of the ``__dlpack_device__`` method, an integer enumerator representing the device type followed by - an integer representing the index of the device. + an integer representing the index of the device. 
The only supported + :enum:`dpctl.tensor.DLDeviceType` types are "kDLCPU" and "kDLOneAPI". Default: ``None``. copy (bool, optional) Boolean indicating whether or not to copy the input. @@ -1008,33 +1035,130 @@ def from_dlpack(x, /, *, device=None, copy=None): C = Container(dpt.linspace(0, 100, num=20, dtype="int16")) X = dpt.from_dlpack(C) + Y = dpt.from_dlpack(C, device=(dpt.DLDeviceType.kDLCPU, 0)) """ - if not hasattr(x, "__dlpack__"): - raise TypeError( - f"The argument of type {type(x)} does not implement " - "`__dlpack__` method." - ) - dlpack_attr = getattr(x, "__dlpack__") - if not callable(dlpack_attr): + dlpack_attr = getattr(x, "__dlpack__", None) + dlpack_dev_attr = getattr(x, "__dlpack_device__", None) + if not callable(dlpack_attr) or not callable(dlpack_dev_attr): raise TypeError( f"The argument of type {type(x)} does not implement " - "`__dlpack__` method." + "`__dlpack__` and `__dlpack_device__` methods." ) - try: - # device is converted to a dlpack_device if necessary - dl_device = None - if device: - if isinstance(device, tuple): - dl_device = device + # device is converted to a dlpack_device if necessary + dl_device = None + if device: + if isinstance(device, tuple): + dl_device = device + if len(dl_device) != 2: + raise ValueError( + "Argument `device` specified as a tuple must have length 2" + ) + else: + if not isinstance(device, dpctl.SyclDevice): + device = Device.create_device(device) + d = device.sycl_device else: - if not isinstance(device, dpctl.SyclDevice): - d = Device.create_device(device).sycl_device - dl_device = (device_OneAPI, get_parent_device_ordinal_id(d)) - else: - dl_device = (device_OneAPI, get_parent_device_ordinal_id(device)) - dlpack_capsule = dlpack_attr(max_version=get_build_dlpack_version(), dl_device=dl_device, copy=copy) - return from_dlpack_capsule(dlpack_capsule) + d = device + dl_device = (device_OneAPI, get_parent_device_ordinal_id(d)) + if dl_device is not None: + if (dl_device[0] not in [device_OneAPI, 
device_CPU]): + raise ValueError( + f"Argument `device`={device} is not supported." + ) + got_type_error = False + got_buffer_error = False + got_other_error = False + saved_exception = None + # First DLPack version supporting dl_device, and copy + requested_ver = (1, 0) + cpu_dev = (device_CPU, 0) + try: + # setting max_version to minimal version that supports dl_device/copy keywords + dlpack_capsule = dlpack_attr( + max_version=requested_ver, + dl_device=dl_device, + copy=copy + ) except TypeError: - dlpack_capsule = dlpack_attr() + # exporter does not support max_version keyword + got_type_error = True + except (BufferError, NotImplementedError): + # Either dl_device, or copy can be satisfied + got_buffer_error = True + except Exception as e: + got_other_error = True + saved_exception = e + else: + # execution did not raise exceptions return from_dlpack_capsule(dlpack_capsule) + finally: + if got_type_error: + # max_version/dl_device, copy keywords are not supported by __dlpack__ + x_dldev = dlpack_dev_attr() + if (dl_device is None) or (dl_device == x_dldev): + dlpack_capsule = dlpack_attr() + return from_dlpack_capsule(dlpack_capsule) + # must copy via host + if copy is False: + raise BufferError( + "Importing data via DLPack requires copying, but copy=False was provided" + ) + # when max_version/dl_device/copy are not supported + # we can only support importing to OneAPI devices + # from host, or from another oneAPI device + is_supported_x_dldev = ( + x_dldev == cpu_dev or + (x_dldev[0] == device_OneAPI) + ) + is_supported_dl_device = ( + dl_device == cpu_dev or + dl_device[0] == device_OneAPI + ) + if is_supported_x_dldev and is_supported_dl_device: + dlpack_capsule = dlpack_attr() + blob = from_dlpack_capsule(dlpack_capsule) + else: + raise BufferError(f"Can not import to requested device {dl_device}") + dev = _create_device(device, dl_device) + if x_dldev == cpu_dev and dl_device == cpu_dev: + # both source and destination are CPU + return blob + elif 
x_dldev == cpu_dev: + # source is CPU, destination is oneAPI + return _to_usm_ary_from_host_blob(blob, dev) + elif dl_device == cpu_dev: + # source is oneAPI, destination is CPU + cpu_caps = blob.__dlpack__( + max_version=get_build_dlpack_version(), + dl_device=cpu_dev + ) + return from_dlpack_capsule(cpu_caps) + else: + import dpctl.tensor as dpt + return dpt.asarray(blob, device=dev) + elif got_buffer_error: + # we are here, because dlpack_attr could not deal with requested dl_device, + # or copying was required + if copy is False: + raise BufferError( + "Importing data via DLPack requires copying, but copy=False was provided" + ) + # must copy via host + if dl_device[0] != device_OneAPI: + raise BufferError(f"Can not import to requested device {dl_device}") + x_dldev = dlpack_dev_attr() + if x_dldev == cpu_dev: + dlpack_capsule = dlpack_attr() + host_blob = from_dlpack_capsule(dlpack_capsule) + else: + dlpack_capsule = dlpack_attr( + max_version=requested_ver, + dl_device=cpu_dev, + copy=copy + ) + host_blob = from_dlpack_capsule(dlpack_capsule) + dev = _create_device(device, dl_device) + return _to_usm_ary_from_host_blob(host_blob, dev) + elif got_other_error: + raise saved_exception diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx index cbe164c6d3..e806dcc956 100644 --- a/dpctl/tensor/_usmarray.pyx +++ b/dpctl/tensor/_usmarray.pyx @@ -1242,7 +1242,7 @@ cdef class usm_ndarray: _arr.flags["W"] = self.flags["W"] return c_dlpack.numpy_to_dlpack_versioned_capsule(_arr, True) else: - raise NotImplementedError( + raise BufferError( f"targeting `dl_device` {dl_device} with `__dlpack__` is not " "yet implemented" ) diff --git a/dpctl/tests/test_usm_ndarray_dlpack.py b/dpctl/tests/test_usm_ndarray_dlpack.py index 344080e2ae..2f07abf12a 100644 --- a/dpctl/tests/test_usm_ndarray_dlpack.py +++ b/dpctl/tests/test_usm_ndarray_dlpack.py @@ -696,3 +696,132 @@ def test_dlpack_size_0_on_kdlcpu(): cap = x_np.__dlpack__() y = _dlp.from_dlpack_capsule(cap) 
assert y.ctypes.data == x_np.ctypes.data + + +def test_copy_via_host(): + get_queue_or_skip() + x = dpt.ones(1, dtype="i4") + x_np = np.ones(1, dtype="i4") + x_dl_dev = x.__dlpack_device__() + y = dpt.from_dlpack(x_np, device=x_dl_dev) + assert isinstance(y, dpt.usm_ndarray) + assert y.sycl_device == x.sycl_device + assert y.usm_type == "device" + + with pytest.raises(ValueError): + # incorrect length of tuple + dpt.from_dlpack(x_np, device=(1, 0, 0)) + with pytest.raises(ValueError): + # only kDLCPU and kDLOneAPI are supported + dpt.from_dlpack(x, device=(2, 0)) + + num_devs = dpctl.get_num_devices() + if num_devs > 1: + j = [i for i in range(num_devs) if i != x_dl_dev[1]][0] + z = dpt.from_dlpack(x, device=(x_dl_dev[0], j)) + assert isinstance(z, dpt.usm_ndarray) + assert z.usm_type == "device" + + +def test_copy_via_host_gh_1789(): + "Test based on review example from gh-1789" + get_queue_or_skip() + x_np = np.ones((10, 10), dtype="i4") + # strides are no longer multiple of itemsize + x_np.strides = (x_np.strides[0] - 1, x_np.strides[1]) + with pytest.raises(BufferError): + dpt.from_dlpack(x_np) + with pytest.raises(BufferError): + dpt.from_dlpack(x_np, device=(14, 0)) + + +class LegacyContainer: + "Helper class implementing legacy `__dlpack__` protocol" + + def __init__(self, array): + self._array = array + + def __dlpack__(self, stream=None): + return self._array.__dlpack__(stream=stream) + + def __dlpack_device__(self): + return self._array.__dlpack_device__() + + +class Container: + "Helper class implementing `__dlpack__` protocol with `max_version`, `dl_device`, and `copy` keywords" + + def __init__(self, array): + self._array = array + + def __dlpack__( + self, max_version=None, dl_device=None, copy=None, stream=None + ): + return self._array.__dlpack__( + max_version=max_version, + dl_device=dl_device, + copy=copy, + stream=stream, + ) + + def __dlpack_device__(self): + return self._array.__dlpack_device__() + + +def test_generic_container_legacy(): + get_queue_or_skip() + C =
LegacyContainer(dpt.linspace(0, 100, num=20, dtype="int16")) + + X = dpt.from_dlpack(C) + assert isinstance(X, dpt.usm_ndarray) + assert X._pointer == C._array._pointer + assert X.sycl_device == C._array.sycl_device + assert X.dtype == C._array.dtype + + Y = dpt.from_dlpack(C, device=(dpt.DLDeviceType.kDLCPU, 0)) + assert isinstance(Y, np.ndarray) + assert Y.dtype == X.dtype + + Z = dpt.from_dlpack(C, device=X.device) + assert isinstance(Z, dpt.usm_ndarray) + assert Z._pointer == X._pointer + assert Z.device == X.device + + +def test_generic_container_legacy_np(): + get_queue_or_skip() + C = LegacyContainer(np.linspace(0, 100, num=20, dtype="int16")) + + X = dpt.from_dlpack(C) + assert isinstance(X, np.ndarray) + assert X.ctypes.data == C._array.ctypes.data + assert X.dtype == C._array.dtype + + Y = dpt.from_dlpack(C, device=(dpt.DLDeviceType.kDLCPU, 0)) + assert isinstance(Y, np.ndarray) + assert Y.dtype == X.dtype + + dev = dpt.Device.create_device() + Z = dpt.from_dlpack(C, device=dev) + assert isinstance(Z, dpt.usm_ndarray) + assert Z.device == dev + + +def test_generic_container(): + get_queue_or_skip() + C = Container(dpt.linspace(0, 100, num=20, dtype="int16")) + + X = dpt.from_dlpack(C) + assert isinstance(X, dpt.usm_ndarray) + assert X._pointer == C._array._pointer + assert X.sycl_device == C._array.sycl_device + assert X.dtype == C._array.dtype + + Y = dpt.from_dlpack(C, device=(dpt.DLDeviceType.kDLCPU, 0)) + assert isinstance(Y, np.ndarray) + assert Y.dtype == X.dtype + + Z = dpt.from_dlpack(C, device=X.device) + assert isinstance(Z, dpt.usm_ndarray) + assert Z._pointer == X._pointer + assert Z.device == X.device