 // ===----------------------------------------------------------------------===//

 #include "dpctl4pybind11.hpp"
+#include <complex>
 #include <cstdint>
 #include <pybind11/numpy.h>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 #include <sycl/sycl.hpp>
+#include <utility>
 #include <vector>

 #include "elementwise_functions.hpp"
@@ -179,28 +181,31 @@ typedef sycl::event (*divide_by_scalar_fn_ptr_t)(
     const ssize_t *,
     const char *,
     py::ssize_t,
-    std::int64_t,
+    const char *,
     char *,
     py::ssize_t,
     const std::vector<sycl::event> &);

-template <typename T>
+template <typename T, typename scalarT>
 sycl::event divide_by_scalar(sycl::queue &exec_q,
                              size_t nelems,
                              int nd,
                              const ssize_t *shape_and_strides,
                              const char *arg_p,
                              py::ssize_t arg_offset,
-                             std::int64_t scalar,
+                             const char *scalar_ptr,
                              char *res_p,
                              py::ssize_t res_offset,
                              const std::vector<sycl::event> &depends = {})
 {
+    const scalarT sc_v = *reinterpret_cast<const scalarT *>(scalar_ptr);
+
     sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
         cgh.depends_on(depends);

-        using BinOpT = dpctl::tensor::kernels::true_divide::TrueDivideFunctor<
-            T, std::int64_t, T>;
+        using BinOpT =
+            dpctl::tensor::kernels::true_divide::TrueDivideFunctor<T, scalarT,
+                                                                   T>;

         auto op = BinOpT();

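Note: `BinOpT` is now instantiated as `TrueDivideFunctor<T, scalarT, T>` instead of hard-coding `std::int64_t` as the second argument type, so dividing a complex array by a real scalar stays in the complex domain. A minimal sketch of a functor with the same three-parameter shape (hypothetical `DivFunctor`, not the actual dpctl kernel type):

```cpp
#include <complex>
#include <iostream>

// Hypothetical stand-in for TrueDivideFunctor<argT1, argT2, resT>:
// two (possibly different) argument types and an explicit result type.
template <typename argT1, typename argT2, typename resT>
struct DivFunctor
{
    resT operator()(const argT1 &a, const argT2 &b) const { return a / b; }
};

int main()
{
    // complex element divided by a real (double) scalar; result stays complex
    DivFunctor<std::complex<double>, double, std::complex<double>> op{};
    std::cout << op(std::complex<double>(1.0, 2.0), 2.0) << "\n"; // (0.5,1)
}
```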
@@ -220,15 +225,15 @@ sycl::event divide_by_scalar(sycl::queue &exec_q,

             const auto &arg_i = two_offsets_.get_first_offset();
             const auto &res_i = two_offsets_.get_second_offset();
-            res_tp[res_i] = op(arg_tp[arg_i], scalar);
+            res_tp[res_i] = op(arg_tp[arg_i], sc_v);
         });
     });
     return comp_ev;
 }

 std::pair<sycl::event, sycl::event>
 py_divide_by_scalar(const dpctl::tensor::usm_ndarray &src,
-                    const std::int64_t scalar,
+                    double scalar,
                     const dpctl::tensor::usm_ndarray &dst,
                     sycl::queue &exec_q,
                     const std::vector<sycl::event> &depends = {})
@@ -293,18 +298,41 @@ py_divide_by_scalar(const dpctl::tensor::usm_ndarray &src,
     constexpr int float16_typeid = static_cast<int>(td_ns::typenum_t::HALF);
     constexpr int float32_typeid = static_cast<int>(td_ns::typenum_t::FLOAT);
     constexpr int float64_typeid = static_cast<int>(td_ns::typenum_t::DOUBLE);
+    constexpr int complex64_typeid = static_cast<int>(td_ns::typenum_t::CFLOAT);
+    constexpr int complex128_typeid =
+        static_cast<int>(td_ns::typenum_t::CDOUBLE);
+
+    // statically pre-allocated memory for scalar
+    alignas(double) char scalar_alloc[sizeof(double)] = {0};

     divide_by_scalar_fn_ptr_t fn;
     switch (src_typeid) {
     case float16_typeid:
-        fn = divide_by_scalar<sycl::half>;
-        break;
+    {
+        fn = divide_by_scalar<sycl::half, sycl::half>;
+        std::ignore =
+            new (scalar_alloc) sycl::half(static_cast<sycl::half>(scalar));
+    } break;
     case float32_typeid:
-        fn = divide_by_scalar<float>;
-        break;
+    {
+        fn = divide_by_scalar<float, float>;
+        std::ignore = new (scalar_alloc) float(scalar);
+    } break;
     case float64_typeid:
-        fn = divide_by_scalar<double>;
-        break;
+    {
+        fn = divide_by_scalar<double, double>;
+        std::ignore = new (scalar_alloc) double(scalar);
+    } break;
+    case complex64_typeid:
+    {
+        fn = divide_by_scalar<std::complex<float>, float>;
+        std::ignore = new (scalar_alloc) float(scalar);
+    } break;
+    case complex128_typeid:
+    {
+        fn = divide_by_scalar<std::complex<double>, double>;
+        std::ignore = new (scalar_alloc) double(scalar);
+    } break;
     default:
         throw std::runtime_error("Implementation is missing for typeid=" +
                                  std::to_string(src_typeid));
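The switch above type-erases the host-side `double`: the value is converted to the scalar type the selected kernel expects, placement-new'ed into the small `alignas(double)` stack buffer, and read back inside `divide_by_scalar` through `reinterpret_cast<const scalarT *>`, so one `const char *` slot in the function-pointer signature serves all scalar types. Assigning the placement-new result to `std::ignore` simply discards the returned pointer. A self-contained sketch of the write/read pattern (hypothetical names `erased_div` and `buf`; plain C++, no SYCL):

```cpp
#include <complex>
#include <iostream>
#include <new>

// Read side: recover the typed scalar from the type-erased pointer,
// as divide_by_scalar<T, scalarT> does with scalar_ptr.
template <typename T, typename scalarT>
T erased_div(const T &numerator, const char *scalar_ptr)
{
    const scalarT sc_v = *reinterpret_cast<const scalarT *>(scalar_ptr);
    return numerator / sc_v;
}

int main()
{
    // Write side: one aligned stack buffer holds whichever scalar type
    // the dispatched kernel expects, as in py_divide_by_scalar.
    alignas(double) char buf[sizeof(double)] = {0};

    new (buf) float(4.0f);
    std::cout << erased_div<float, float>(10.0f, buf) << "\n"; // 2.5

    new (buf) double(4.0);
    std::cout << erased_div<std::complex<double>, double>(
                     std::complex<double>(2.0, 6.0), buf)
              << "\n"; // (0.5,1.5)
}
```

Since `sc_v` is copied out of the buffer on the host before the kernel is submitted, the stack buffer only needs to outlive the call through `fn`.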
@@ -331,6 +359,16 @@ py_divide_by_scalar(const dpctl::tensor::usm_ndarray &src,
         simplified_shape, simplified_src_strides, simplified_dst_strides,
         src_offset, dst_offset);

+    if (nd == 0) {
+        // handle 0d array as 1d array with 1 element
+        constexpr py::ssize_t one{1};
+        simplified_shape.push_back(one);
+        simplified_src_strides.push_back(one);
+        simplified_dst_strides.push_back(one);
+        src_offset = 0;
+        dst_offset = 0;
+    }
+
     using dpctl::tensor::offset_utils::device_allocate_and_pack;
     const auto &ptr_sz_event_triple_ = device_allocate_and_pack<py::ssize_t>(
         exec_q, host_tasks, simplified_shape, simplified_src_strides,
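The new `nd == 0` branch treats a zero-dimensional array as a one-element 1-d array, so the packed shape/stride buffer is never empty and a generic strided offset computation still resolves to element 0. A tiny illustration of that offset arithmetic (hypothetical `strided_offset`; the actual dpctl indexer is not reproduced here):

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

// offset = base_offset + sum_i index[i] * stride[i]
std::ptrdiff_t strided_offset(std::ptrdiff_t base,
                              const std::vector<std::ptrdiff_t> &index,
                              const std::vector<std::ptrdiff_t> &strides)
{
    std::ptrdiff_t off = base;
    for (std::size_t i = 0; i < index.size(); ++i) {
        off += index[i] * strides[i];
    }
    return off;
}

int main()
{
    // 0d case after normalization: shape {1}, strides {1}, offset 0;
    // the only valid index is {0}, which resolves to offset 0.
    std::cout << strided_offset(0, {0}, {1}) << "\n"; // 0
}
```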
@@ -349,8 +387,9 @@ py_divide_by_scalar(const dpctl::tensor::usm_ndarray &src,
         throw std::runtime_error("Unable to allocate device memory");
     }

-    sycl::event div_ev = fn(exec_q, src_nelems, nd, shape_strides, src_data,
-                            src_offset, scalar, dst_data, dst_offset, all_deps);
+    sycl::event div_ev =
+        fn(exec_q, src_nelems, nd, shape_strides, src_data, src_offset,
+           scalar_alloc, dst_data, dst_offset, all_deps);

     // async free of shape_strides temporary
     auto ctx = exec_q.get_context();