Skip to content

Commit 1869f06

Browse files
Specialize copy_from_numpy_into_usm_ndarray
for contiguous case. Tensor implementation module temporarily exports both _copy_numpy_into_usm_ndarray_legacy, and _copy_numpy_into_usm_ndarray functions to performance comparison.
1 parent f483deb commit 1869f06

File tree

4 files changed

+413
-10
lines changed

4 files changed

+413
-10
lines changed

dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp

Lines changed: 106 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,9 @@ class copy_cast_contig_kernel;
6161
template <typename srcT, typename dstT, typename IndexerT>
6262
class copy_cast_from_host_kernel;
6363

64+
template <typename srcT, typename dstT, typename IndexerT>
65+
class copy_cast_from_host_contig_kernel;
66+
6467
template <typename srcTy, typename dstTy> class Caster
6568
{
6669
public:
@@ -390,9 +393,9 @@ template <typename fnT, typename D, typename S> struct CopyAndCastContigFactory
390393
typedef sycl::event (*copy_and_cast_1d_fn_ptr_t)(
391394
sycl::queue &,
392395
size_t,
393-
const std::array<ssize_t, 1>,
394-
const std::array<ssize_t, 1>,
395-
const std::array<ssize_t, 1>,
396+
const std::array<ssize_t, 1> &,
397+
const std::array<ssize_t, 1> &,
398+
const std::array<ssize_t, 1> &,
396399
const char *,
397400
ssize_t,
398401
char *,
@@ -406,9 +409,9 @@ typedef sycl::event (*copy_and_cast_1d_fn_ptr_t)(
406409
typedef sycl::event (*copy_and_cast_2d_fn_ptr_t)(
407410
sycl::queue &,
408411
size_t,
409-
const std::array<ssize_t, 2>,
410-
const std::array<ssize_t, 2>,
411-
const std::array<ssize_t, 2>,
412+
const std::array<ssize_t, 2> &,
413+
const std::array<ssize_t, 2> &,
414+
const std::array<ssize_t, 2> &,
412415
const char *,
413416
ssize_t,
414417
char *,
@@ -448,9 +451,9 @@ template <typename dstTy, typename srcTy, int nd>
448451
sycl::event
449452
copy_and_cast_nd_specialized_impl(sycl::queue &q,
450453
size_t nelems,
451-
const std::array<ssize_t, nd> shape,
452-
const std::array<ssize_t, nd> src_strides,
453-
const std::array<ssize_t, nd> dst_strides,
454+
const std::array<ssize_t, nd> &shape,
455+
const std::array<ssize_t, nd> &src_strides,
456+
const std::array<ssize_t, nd> &dst_strides,
454457
const char *src_p,
455458
ssize_t src_offset,
456459
char *dst_p,
@@ -657,6 +660,100 @@ struct CopyAndCastFromHostFactory
657660
}
658661
};
659662

663+
typedef void (*copy_and_cast_from_host_contig_blocking_fn_ptr_t)(
664+
sycl::queue &,
665+
size_t, /* nelems */
666+
const char *, /* src_pointer */
667+
ssize_t, /* src_offset */
668+
char *, /* dst_pointer */
669+
ssize_t, /* dst_offset */
670+
const std::vector<sycl::event> &);
671+
672+
/*!
673+
* @brief Function to copy from NumPy's ndarray with elements of type `srcTy`
674+
* into usm_ndarray with elements of type `srcTy` for contiguous arrays.
675+
*
676+
* Function to cast and copy elements from numpy.ndarray specified by typeless
677+
* `host_src_p` and the `src_offset` given in the number of array elements.
678+
* Kernel dependencies are given by two vectors of
679+
* events: `depends` and `additional_depends`. The function execution is
680+
* complete at the return.
681+
*
682+
* @param q The queue where the routine should be executed.
683+
* @param nelems Number of elements to cast and copy.
684+
* @param src_stride The stride of source array in elements
685+
* @param dst_stride The stride of destimation array in elements
686+
* @param host_src_p Host (not USM allocated) pointer associated with the
687+
* source array.
688+
* @param src_offset Offset to the beginning of iteration in number of elements
689+
* of the source array from `host_src_p`.
690+
* @param dst_p USM pointer associated with the destination array.
691+
* @param dst_offset Offset to the beginning of iteration in number of elements
692+
* of the destination array from `dst_p`.
693+
* @param depends List of events to wait for before starting computations, if
694+
* any.
695+
*
696+
* @ingroup CopyAndCastKernels
697+
*/
698+
template <typename dstTy, typename srcTy>
699+
void copy_and_cast_from_host_contig_impl(
700+
sycl::queue &q,
701+
size_t nelems,
702+
const char *host_src_p,
703+
ssize_t src_offset,
704+
char *dst_p,
705+
ssize_t dst_offset,
706+
const std::vector<sycl::event> &depends)
707+
{
708+
dpctl::tensor::type_utils::validate_type_for_device<dstTy>(q);
709+
dpctl::tensor::type_utils::validate_type_for_device<srcTy>(q);
710+
711+
sycl::buffer<srcTy, 1> npy_buf(
712+
reinterpret_cast<const srcTy *>(host_src_p) + src_offset,
713+
sycl::range<1>(nelems), {sycl::property::buffer::use_host_ptr{}});
714+
715+
sycl::event copy_and_cast_from_host_ev = q.submit([&](sycl::handler &cgh) {
716+
cgh.depends_on(depends);
717+
718+
sycl::accessor npy_acc(npy_buf, cgh, sycl::read_only);
719+
720+
using IndexerT = TwoOffsets_CombinedIndexer<NoOpIndexer, NoOpIndexer>;
721+
constexpr NoOpIndexer src_indexer{};
722+
constexpr NoOpIndexer dst_indexer{};
723+
constexpr TwoOffsets_CombinedIndexer indexer{src_indexer, dst_indexer};
724+
725+
dstTy *dst_tp = reinterpret_cast<dstTy *>(dst_p) + dst_offset;
726+
727+
cgh.parallel_for<
728+
copy_cast_from_host_contig_kernel<srcTy, dstTy, IndexerT>>(
729+
sycl::range<1>(nelems),
730+
GenericCopyFromHostFunctor<decltype(npy_acc), dstTy,
731+
Caster<srcTy, dstTy>, IndexerT>(
732+
npy_acc, dst_tp, indexer));
733+
});
734+
735+
// perform explicit synchronization. Implicit synchronization would be
736+
// performed by sycl::buffer destructor.
737+
copy_and_cast_from_host_ev.wait();
738+
739+
return;
740+
}
741+
742+
/*!
743+
* @brief Factory to get function pointer of type `fnT` for given NumPy array
744+
* source data type `S` and destination data type `D`.
745+
* @defgroup CopyAndCastKernels
746+
*/
747+
template <typename fnT, typename D, typename S>
748+
struct CopyAndCastFromHostContigFactory
749+
{
750+
fnT get()
751+
{
752+
fnT f = copy_and_cast_from_host_contig_impl<D, S>;
753+
return f;
754+
}
755+
};
756+
660757
// =============== Copying for reshape ================== //
661758

662759
template <typename Ty, typename SrcIndexerT, typename DstIndexerT>

0 commit comments

Comments
 (0)