@@ -132,8 +132,8 @@ class CopyAsCContigFunctor
132
132
133
133
template <typename T,
134
134
typename IndexerT,
135
- std::uint32_t n_vecs,
136
135
std::uint32_t vec_sz,
136
+ std::uint32_t n_vecs,
137
137
bool enable_sg_load,
138
138
typename KernelName>
139
139
sycl::event submit_c_contiguous_copy (sycl::queue &exec_q,
@@ -143,6 +143,10 @@ sycl::event submit_c_contiguous_copy(sycl::queue &exec_q,
143
143
const IndexerT &src_indexer,
144
144
const std::vector<sycl::event> &depends)
145
145
{
146
+ static_assert (vec_sz > 0 );
147
+ static_assert (n_vecs > 0 );
148
+ static_assert (vec_sz * n_vecs < (std::uint32_t (1 ) << 8 ));
149
+
146
150
constexpr std::size_t preferred_lws = 256 ;
147
151
148
152
const auto &kernel_id = sycl::get_kernel_id<KernelName>();
@@ -206,8 +210,8 @@ as_c_contiguous_array_generic_impl(sycl::queue &exec_q,
206
210
using IndexerT = dpctl::tensor::offset_utils::StridedIndexer;
207
211
const IndexerT src_indexer (nd, ssize_t (0 ), shape_and_strides);
208
212
209
- constexpr std::uint32_t n_vecs = 2 ;
210
- constexpr std::uint32_t vec_sz = 4 ;
213
+ constexpr std::uint32_t vec_sz = 4u ;
214
+ constexpr std::uint32_t n_vecs = 2u ;
211
215
212
216
using dpctl::tensor::kernels::alignment_utils::
213
217
disabled_sg_loadstore_wrapper_krn;
@@ -219,7 +223,7 @@ as_c_contiguous_array_generic_impl(sycl::queue &exec_q,
219
223
constexpr bool enable_sg_load = true ;
220
224
using KernelName =
221
225
as_contig_krn<T, IndexerT, vec_sz, n_vecs, enable_sg_load>;
222
- copy_ev = submit_c_contiguous_copy<T, IndexerT, n_vecs, vec_sz ,
226
+ copy_ev = submit_c_contiguous_copy<T, IndexerT, vec_sz, n_vecs ,
223
227
enable_sg_load, KernelName>(
224
228
exec_q, nelems, src_tp, dst_tp, src_indexer, depends);
225
229
}
@@ -228,7 +232,7 @@ as_c_contiguous_array_generic_impl(sycl::queue &exec_q,
228
232
using InnerKernelName =
229
233
as_contig_krn<T, IndexerT, vec_sz, n_vecs, disable_sg_load>;
230
234
using KernelName = disabled_sg_loadstore_wrapper_krn<InnerKernelName>;
231
- copy_ev = submit_c_contiguous_copy<T, IndexerT, n_vecs, vec_sz ,
235
+ copy_ev = submit_c_contiguous_copy<T, IndexerT, vec_sz, n_vecs ,
232
236
disable_sg_load, KernelName>(
233
237
exec_q, nelems, src_tp, dst_tp, src_indexer, depends);
234
238
}
0 commit comments