@@ -8033,7 +8033,7 @@ static void ggml_compute_forward_mul_mat_f32(
 #if defined(GGML_USE_CUBLAS)
         const float alpha = 1.0f;
         const float beta = 0.0f;
-        const int x_ne = ne01 * ne10;
+        const int x_ne = ne01 * ne00;
         const int y_ne = ne11 * ne10;
         const int d_ne = ne11 * ne01;
 
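This x_ne change (repeated in the f16 and quantized paths below) does not alter the allocation size: ggml only allows mul_mat when the inner dimensions match (ne00 == ne10), so ne01 * ne00 equals ne01 * ne10; it simply sizes the src0 staging buffer from src0's own dimensions. A tiny self-contained check of that equivalence, using hypothetical shapes:

#include <assert.h>

int main(void) {
    // hypothetical operands: src0 is ne00 x ne01, src1 is ne10 x ne11;
    // ggml_can_mul_mat requires ne00 == ne10
    const int ne00 = 4096, ne01 = 4096;
    const int ne10 = ne00, ne11 = 512;
    assert(ne01 * ne00 == ne01 * ne10); // x_ne is unchanged by the edit
    assert(ne11 * ne10 == ne11 * ne00); // same reasoning for y_ne
    return 0;
}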
@@ -8235,25 +8235,27 @@ static void ggml_compute_forward_mul_mat_f16_f32(
         }
 
 #if defined(GGML_USE_CUBLAS)
-        ggml_fp16_t * const wdata = params->wdata;
-
         const float alpha = 1.0f;
         const float beta = 0.0f;
-        const int x_ne = ne01 * ne10;
+        const int x_ne = ne01 * ne00;
         const int y_ne = ne11 * ne10;
         const int d_ne = ne11 * ne01;
 
         size_t x_size, y_size, d_size;
-        float * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
-        float * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
-        float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
+        ggml_fp16_t * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
+        ggml_fp16_t * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
+        float       * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
 #else
         float * const wdata = params->wdata;
 #endif
         for (int64_t i03 = 0; i03 < ne03; i03++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
 #if defined(GGML_USE_CUBLAS)
+                // copy src0 while converting src1
+                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i03, i02, g_cudaStream));
+
                 // with cuBlAS, instead of converting src0 to fp32, we convert src1 to fp16
+                ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + (ne11 * ne10) * (i03 * ne02 + i02);
                 {
                     size_t id = 0;
                     for (int64_t i01 = 0; i01 < ne11; ++i01) {
@@ -8275,11 +8277,9 @@ static void ggml_compute_forward_mul_mat_f16_f32(
 
 #if defined(GGML_USE_CUBLAS)
                 const ggml_fp16_t * y = (ggml_fp16_t *) wdata;
-
                 float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
 
                 // copy data to device
-                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i03, i02, g_cudaStream));
                 CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(ggml_fp16_t) * y_ne, cudaMemcpyHostToDevice, g_cudaStream));
 
                 // compute
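Together, the two hunks above restructure the cuBLAS f16 x f32 path: the host-to-device copy of src0 is now queued before the CPU converts src1 from fp32 to fp16, so the transfer overlaps with the conversion, and each (i02, i03) slice gets its own offset into params->wdata instead of reusing a single scratch region. A minimal sketch of that overlap pattern, with illustrative names (a plain cudaMemcpyAsync stands in for ggml_cuda_h2d_tensor_2d):

#include <cuda_runtime.h>
#include "ggml.h" // ggml_fp16_t, ggml_fp32_to_fp16

// Queue the src0 copy first, convert src1 on the CPU while that copy is in
// flight, then queue the converted src1 behind it on the same stream.
static void copy_src0_while_converting_src1(
        void * d_src0, const void * h_src0, size_t src0_bytes,
        ggml_fp16_t * d_src1, ggml_fp16_t * h_src1_f16,
        const float * h_src1_f32, int64_t n_src1, cudaStream_t stream) {
    // GPU: start moving src0 (already fp16 in this case)
    cudaMemcpyAsync(d_src0, h_src0, src0_bytes, cudaMemcpyHostToDevice, stream);

    // CPU: convert src1 fp32 -> fp16 while the transfer is in flight
    for (int64_t i = 0; i < n_src1; ++i) {
        h_src1_f16[i] = ggml_fp32_to_fp16(h_src1_f32[i]);
    }

    // GPU: ship the converted src1; stream order places it after the src0 copy
    cudaMemcpyAsync(d_src1, h_src1_f16, (size_t) n_src1 * sizeof(ggml_fp16_t),
                    cudaMemcpyHostToDevice, stream);
}

The copy only overlaps host work reliably when the host buffer is pinned; with pageable memory cudaMemcpyAsync may synchronize with the host.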
@@ -8498,39 +8498,19 @@ static void ggml_compute_forward_mul_mat_q_f32(
 #if defined(GGML_USE_CUBLAS)
         const float alpha = 1.0f;
         const float beta = 0.0f;
-        const int x_ne = ne01 * ne10;
+        const int x_ne = ne01 * ne00;
         const int y_ne = ne11 * ne10;
         const int d_ne = ne11 * ne01;
 
         size_t x_size, y_size, d_size, q_size;
-        float * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
-        float * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
-        float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
-        float * d_Q = ggml_cuda_pool_malloc(GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], &q_size);
+        float * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
+        float * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
+        float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
+        void  * d_Q = ggml_cuda_pool_malloc(GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], &q_size);
 
-        void (*dequantize_row_q_cuda)(const void * x, float * y, int k, cudaStream_t stream) = NULL;
-        if (type == GGML_TYPE_Q4_0) {
-            dequantize_row_q_cuda = dequantize_row_q4_0_cuda;
-        }
-        else if (type == GGML_TYPE_Q4_1) {
-            dequantize_row_q_cuda = dequantize_row_q4_1_cuda;
-        }
-        else if (type == GGML_TYPE_Q4_2) {
-            dequantize_row_q_cuda = dequantize_row_q4_2_cuda;
-        }
-        else if (type == GGML_TYPE_Q5_0) {
-            dequantize_row_q_cuda = dequantize_row_q5_0_cuda;
-        }
-        else if (type == GGML_TYPE_Q5_1) {
-            dequantize_row_q_cuda = dequantize_row_q5_1_cuda;
-        }
-        else if (type == GGML_TYPE_Q8_0) {
-            dequantize_row_q_cuda = dequantize_row_q8_0_cuda;
-        }
-        else {
-            GGML_ASSERT(false);
-        }
-#elif !defined(GGML_USE_CLBLAST)
+        const dequantize_row_q_cuda_t dequantize_row_q_cuda = ggml_get_dequantize_row_q_cuda(type);
+        GGML_ASSERT(dequantize_row_q_cuda != NULL);
+#else
         float * const wdata = params->wdata;
         dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
 #endif
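The open-coded if/else ladder mapping each quantized type to its CUDA dequantization kernel is replaced by one lookup via ggml_get_dequantize_row_q_cuda() and the dequantize_row_q_cuda_t typedef, both presumably added on the CUDA side rather than in this hunk. A plausible sketch of that helper, assuming the dequantize_row_*_cuda kernels are declared in ggml-cuda.h:

#include <cuda_runtime.h>
#include "ggml.h"      // enum ggml_type
#include "ggml-cuda.h" // assumed home of the dequantize_row_*_cuda declarations

typedef void (*dequantize_row_q_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);

// Map a ggml type to its CUDA dequantization kernel; NULL means unsupported,
// which the caller turns into GGML_ASSERT(dequantize_row_q_cuda != NULL).
static dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(enum ggml_type type) {
    switch (type) {
        case GGML_TYPE_Q4_0: return dequantize_row_q4_0_cuda;
        case GGML_TYPE_Q4_1: return dequantize_row_q4_1_cuda;
        case GGML_TYPE_Q4_2: return dequantize_row_q4_2_cuda;
        case GGML_TYPE_Q5_0: return dequantize_row_q5_0_cuda;
        case GGML_TYPE_Q5_1: return dequantize_row_q5_1_cuda;
        case GGML_TYPE_Q8_0: return dequantize_row_q8_0_cuda;
        default:             return NULL;
    }
}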
@@ -8543,10 +8523,11 @@ static void ggml_compute_forward_mul_mat_q_f32(
 
 #if defined(GGML_USE_CUBLAS)
                 // copy and dequantize on device
-                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, src0, i03, i02, g_cudaStream));
+                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, src0, i03, i02, g_cudaStream2));
 
-                dequantize_row_q_cuda(d_Q, d_X, ne01 * ne00, g_cudaStream);
+                dequantize_row_q_cuda(d_Q, d_X, x_ne, g_cudaStream2);
                 CUDA_CHECK(cudaGetLastError());
+                CUDA_CHECK(cudaEventRecord(g_cudaEvent, g_cudaStream2));
 #elif defined(GGML_USE_CLBLAST)
                 const void * x = (char *) src0->data + i03*nb03 + i02*nb02;
 #else
@@ -8560,11 +8541,13 @@ static void ggml_compute_forward_mul_mat_q_f32(
                 const float * x = wdata;
 #endif
 
-
 #if defined(GGML_USE_CUBLAS)
                 // copy data to device
                 CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Y, src1, i03, i02, g_cudaStream));
 
+                // wait for dequantization
+                CUDA_CHECK(cudaStreamWaitEvent(g_cudaStream, g_cudaEvent, 0));
+
                 // compute
                 CUBLAS_CHECK(
                     cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
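The quantized path now splits work across two streams: the src0 upload and the dequantization kernel run on g_cudaStream2 with an event recorded behind them, the src1 upload runs concurrently on g_cudaStream, and cudaStreamWaitEvent makes everything queued afterwards on g_cudaStream (the cublasSgemm) wait until dequantization has finished. A compact sketch of the same pattern with generic names standing in for ggml's globals:

#include <cuda_runtime.h>

// Prepare the X operand (copy + dequantize) on stream2 while Y uploads on
// stream; the event makes later work on `stream` wait until X is ready.
static void overlap_dequantize_and_upload(
        cudaStream_t stream, cudaStream_t stream2, cudaEvent_t event,
        void * d_Q, const void * h_Q, size_t q_bytes,
        float * d_Y, const float * h_Y, size_t y_bytes) {
    cudaMemcpyAsync(d_Q, h_Q, q_bytes, cudaMemcpyHostToDevice, stream2);
    // ... launch the dequantization kernel on stream2 here ...
    cudaEventRecord(event, stream2);        // mark "X ready" after the kernel

    cudaMemcpyAsync(d_Y, h_Y, y_bytes, cudaMemcpyHostToDevice, stream);
    cudaStreamWaitEvent(stream, event, 0);  // only work queued after this waits
    // ... the GEMM would be enqueued on `stream` next ...
}

Because cudaStreamWaitEvent only orders work submitted after it, the src1 copy still overlaps with the dequantization.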
@@ -11588,7 +11571,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                         if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                             node->n_tasks = 1; // TODO: this actually is doing nothing
                                                //       the threads are still spinning
-                            cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
+                            cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*MAX(ggml_nelements(node->src1), ggml_nelements(node->src0));
                             //printf("src0: ne0 = %d, ne1 = %d, ne = %d\n", node->src0->ne[0], node->src0->ne[1], node->src0->ne[0]*node->src0->ne[1]);
                             //printf("src1: ne0 = %d, ne1 = %d, ne = %d\n", node->src1->ne[0], node->src1->ne[1], node->src1->ne[0]*node->src1->ne[1]);
                             //printf("cur = %zu\n", cur);
@@ -11600,6 +11583,11 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 #endif
                     } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
                         cur = 0;
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
+                        if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
+                            node->n_tasks = 1;
+                        }
+#endif
                     } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
                         if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {