@@ -1067,8 +1067,8 @@ sycl::event gemm_impl(sycl::queue &exec_q,
1067
1067
if (m == 1 ) {
1068
1068
constexpr size_t m_groups = 1 ;
1069
1069
size_t delta_k (4 );
1070
- size_t n_wi (64 );
1071
- size_t delta_n (16 );
1070
+ size_t n_wi (4 );
1071
+ size_t delta_n (4 );
1072
1072
1073
1073
gemm_detail::scale_gemm_k_parameters<resTy, m_groups>(
1074
1074
local_mem_size, reserved_slm_size, delta_k,
@@ -1103,8 +1103,8 @@ sycl::event gemm_impl(sycl::queue &exec_q,
1103
1103
else if (k > n && k > m) {
1104
1104
constexpr size_t m_groups = 2 ;
1105
1105
size_t delta_k (4 );
1106
- size_t n_wi (64 );
1107
- size_t delta_n (16 );
1106
+ size_t n_wi (4 );
1107
+ size_t delta_n (4 );
1108
1108
1109
1109
gemm_detail::scale_gemm_k_parameters<resTy, m_groups>(
1110
1110
local_mem_size, reserved_slm_size, delta_k,
@@ -1233,8 +1233,8 @@ sycl::event gemm_contig_impl(sycl::queue &exec_q,
1233
1233
if (m == 1 ) {
1234
1234
constexpr size_t m_groups = 1 ;
1235
1235
size_t delta_k (4 );
1236
- size_t n_wi (64 );
1237
- size_t delta_n (16 );
1236
+ size_t n_wi (4 );
1237
+ size_t delta_n (4 );
1238
1238
1239
1239
gemm_detail::scale_gemm_k_parameters<resTy, m_groups>(
1240
1240
local_mem_size, reserved_slm_size, delta_k,
@@ -1269,8 +1269,8 @@ sycl::event gemm_contig_impl(sycl::queue &exec_q,
1269
1269
else if (k > n && k > m) {
1270
1270
constexpr size_t m_groups = 2 ;
1271
1271
size_t delta_k (4 );
1272
- size_t n_wi (64 );
1273
- size_t delta_n (16 );
1272
+ size_t n_wi (4 );
1273
+ size_t delta_n (4 );
1274
1274
1275
1275
gemm_detail::scale_gemm_k_parameters<resTy, m_groups>(
1276
1276
local_mem_size, reserved_slm_size, delta_k,
@@ -1963,8 +1963,8 @@ sycl::event gemm_tree_impl(sycl::queue &exec_q,
1963
1963
// items in a column, so no need for allocating
1964
1964
// temp memory if only one group is needed
1965
1965
size_t delta_k (4 );
1966
- size_t n_wi (64 );
1967
- size_t delta_n (16 );
1966
+ size_t n_wi (4 );
1967
+ size_t delta_n (4 );
1968
1968
1969
1969
using dpctl::tensor::type_utils::is_complex;
1970
1970
if constexpr (!is_complex<resTy>::value) {
@@ -3394,8 +3394,8 @@ sycl::event gemm_contig_tree_impl(sycl::queue &exec_q,
3394
3394
// items in a column, so no need for allocating
3395
3395
// temp memory if only one group is needed
3396
3396
size_t delta_k (4 );
3397
- size_t n_wi (64 );
3398
- size_t delta_n (16 );
3397
+ size_t n_wi (4 );
3398
+ size_t delta_n (4 );
3399
3399
3400
3400
using dpctl::tensor::type_utils::is_complex;
3401
3401
if constexpr (!is_complex<resTy>::value) {
@@ -5462,8 +5462,8 @@ sycl::event gemm_batch_impl(sycl::queue &exec_q,
5462
5462
if (m == 1 ) {
5463
5463
constexpr int m_groups = 1 ;
5464
5464
size_t delta_k (4 );
5465
- size_t n_wi (32 );
5466
- size_t delta_n (16 );
5465
+ size_t n_wi (4 );
5466
+ size_t delta_n (4 );
5467
5467
5468
5468
gemm_detail::scale_gemm_k_parameters<resTy, m_groups>(
5469
5469
local_mem_size, reserved_slm_size, delta_k,
@@ -5503,8 +5503,8 @@ sycl::event gemm_batch_impl(sycl::queue &exec_q,
5503
5503
else if (k > n && k > m) {
5504
5504
constexpr size_t m_groups = 2 ;
5505
5505
size_t delta_k (4 );
5506
- size_t n_wi (32 );
5507
- size_t delta_n (16 );
5506
+ size_t n_wi (4 );
5507
+ size_t delta_n (4 );
5508
5508
5509
5509
gemm_detail::scale_gemm_k_parameters<resTy, m_groups>(
5510
5510
local_mem_size, reserved_slm_size, delta_k,
@@ -5664,8 +5664,8 @@ sycl::event gemm_batch_contig_impl(sycl::queue &exec_q,
5664
5664
if (m == 1 ) {
5665
5665
constexpr int m_groups = 1 ;
5666
5666
size_t delta_k (4 );
5667
- size_t n_wi (32 );
5668
- size_t delta_n (16 );
5667
+ size_t n_wi (4 );
5668
+ size_t delta_n (4 );
5669
5669
5670
5670
gemm_detail::scale_gemm_k_parameters<resTy, m_groups>(
5671
5671
local_mem_size, reserved_slm_size, delta_k,
@@ -5705,8 +5705,8 @@ sycl::event gemm_batch_contig_impl(sycl::queue &exec_q,
5705
5705
else if (k > n && k > m) {
5706
5706
constexpr size_t m_groups = 2 ;
5707
5707
size_t delta_k (4 );
5708
- size_t n_wi (32 );
5709
- size_t delta_n (16 );
5708
+ size_t n_wi (4 );
5709
+ size_t delta_n (4 );
5710
5710
5711
5711
gemm_detail::scale_gemm_k_parameters<resTy, m_groups>(
5712
5712
local_mem_size, reserved_slm_size, delta_k,
@@ -6484,7 +6484,7 @@ gemm_batch_tree_impl(sycl::queue &exec_q,
6484
6484
6485
6485
if ((k > n && k > m) || m == 1 ) {
6486
6486
size_t delta_k (4 );
6487
- size_t n_wi (32 );
6487
+ size_t n_wi (4 );
6488
6488
size_t delta_n (4 );
6489
6489
6490
6490
using dpctl::tensor::type_utils::is_complex;
@@ -8187,7 +8187,7 @@ gemm_batch_contig_tree_impl(sycl::queue &exec_q,
8187
8187
8188
8188
if ((k > n && k > m) || m == 1 ) {
8189
8189
size_t delta_k (4 );
8190
- size_t n_wi (32 );
8190
+ size_t n_wi (4 );
8191
8191
size_t delta_n (4 );
8192
8192
8193
8193
using dpctl::tensor::type_utils::is_complex;
0 commit comments