@@ -5126,11 +5126,11 @@ static void ggml_cpy_f32_q8_0_cuda(
5126
5126
if (first_incomplete && last_incomplete) {
5127
5127
GGML_ASSERT (i_blck_0 + ne00 < QK8_0); // otherwise there would be a race condition
5128
5128
GGML_ASSERT (pad == false );
5129
- cpy_f32_q8_0<true , true , false ><<<block_nums, block_dims, 0 , stream>>>
5129
+ cpy_f32_q8_0<true , true , true ><<<block_nums, block_dims, 0 , stream>>>
5130
5130
(cx, cdst, i_blck_0, ne00, ne01, ne02, nb00, nb01, nb02, nb11, nb12);
5131
5131
} else if (first_incomplete && !last_incomplete) {
5132
5132
GGML_ASSERT (pad == false );
5133
- cpy_f32_q8_0<true , false , false ><<<block_nums, block_dims, 0 , stream>>>
5133
+ cpy_f32_q8_0<true , false , true ><<<block_nums, block_dims, 0 , stream>>>
5134
5134
(cx, cdst, i_blck_0, ne00, ne01, ne02, nb00, nb01, nb02, nb11, nb12);
5135
5135
} else if (!first_incomplete && last_incomplete && pad) {
5136
5136
cpy_f32_q8_0<false , true , false ><<<block_nums, block_dims, 0 , stream>>>
@@ -5139,7 +5139,7 @@ static void ggml_cpy_f32_q8_0_cuda(
5139
5139
cpy_f32_q8_0<false , true , true ><<<block_nums, block_dims, 0 , stream>>>
5140
5140
(cx, cdst, i_blck_0, ne00, ne01, ne02, nb00, nb01, nb02, nb11, nb12);
5141
5141
} else if (!first_incomplete && !last_incomplete && pad) {
5142
- cpy_f32_q8_0<false , false , true ><<<block_nums, block_dims, 0 , stream>>>
5142
+ cpy_f32_q8_0<false , false , false ><<<block_nums, block_dims, 0 , stream>>>
5143
5143
(cx, cdst, i_blck_0, ne00, ne01, ne02, nb00, nb01, nb02, nb11, nb12);
5144
5144
} else if (!first_incomplete && !last_incomplete && !pad) {
5145
5145
cpy_f32_q8_0<false , false , true ><<<block_nums, block_dims, 0 , stream>>>
0 commit comments