Skip to content

Commit 7fc50c0

Browse files
authored
cuBLAS: use host pinned memory and dequantize while copying (#1207)
* cuBLAS: dequantize simultaneously while copying memory
* cuBLAS: use host pinned memory
* cuBLAS: improve ggml_compute_forward_mul_mat_f16_f32 with pinned memory
* cuBLAS: also pin kv cache
* fix rebase
1 parent b1ee8f5 commit 7fc50c0

File tree

6 files changed

+110
-54
lines changed

6 files changed

+110
-54
lines changed

Makefile

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ ifdef LLAMA_OPENBLAS
106106
endif
107107
ifdef LLAMA_CUBLAS
108108
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
109+
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
109110
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
110111
OBJS += ggml-cuda.o
111112
NVCC = nvcc
@@ -164,10 +165,10 @@ $(info )
164165
# Build library
165166
#
166167

167-
ggml.o: ggml.c ggml.h
168+
ggml.o: ggml.c ggml.h ggml-cuda.h
168169
$(CC) $(CFLAGS) -c $< -o $@
169170

170-
llama.o: llama.cpp ggml.h llama.h llama_util.h
171+
llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama_util.h
171172
$(CXX) $(CXXFLAGS) -c $< -o $@
172173

173174
common.o: examples/common.cpp examples/common.h

ggml-cuda.cu

Lines changed: 39 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,25 @@ void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t st
227227
dequantize_block_q8_0<<<nb, 1, 0, stream>>>(vx, y);
228228
}
229229

230+
// Map a quantized ggml tensor type to the CUDA kernel launcher that
// dequantizes one row of that type into fp32.
// Returns nullptr when the type has no CUDA dequantizer, letting callers
// detect unsupported types explicitly instead of crashing.
dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(ggml_type type) {
    switch (type) {
        case GGML_TYPE_Q4_0: return dequantize_row_q4_0_cuda;
        case GGML_TYPE_Q4_1: return dequantize_row_q4_1_cuda;
        case GGML_TYPE_Q4_2: return dequantize_row_q4_2_cuda;
        case GGML_TYPE_Q5_0: return dequantize_row_q5_0_cuda;
        case GGML_TYPE_Q5_1: return dequantize_row_q5_1_cuda;
        case GGML_TYPE_Q8_0: return dequantize_row_q8_0_cuda;
        default:             return nullptr;
    }
}
248+
230249
// buffer pool for cuda
231250
#define MAX_CUDA_BUFFERS 16
232251

@@ -286,18 +305,22 @@ void ggml_cuda_pool_free(void * ptr, size_t size) {
286305
CUDA_CHECK(cudaFree(ptr));
287306
}
288307

289-
cublasHandle_t g_cublasH = NULL;
290-
cudaStream_t g_cudaStream = NULL;
308+
cublasHandle_t g_cublasH = nullptr;
309+
cudaStream_t g_cudaStream = nullptr;
310+
cudaStream_t g_cudaStream2 = nullptr;
311+
cudaEvent_t g_cudaEvent = nullptr;
291312

292-
void ggml_init_cublas(void) {
293-
if (g_cublasH == NULL) {
313+
void ggml_init_cublas() {
314+
if (g_cublasH == nullptr) {
294315
// create cublas handle, bind a stream
295316
CUBLAS_CHECK(cublasCreate(&g_cublasH));
296-
297317
CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream, cudaStreamNonBlocking));
298-
299318
CUBLAS_CHECK(cublasSetStream(g_cublasH, g_cudaStream));
300319

320+
// create additional stream and event for synchronization
321+
CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream2, cudaStreamNonBlocking));
322+
CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvent, cudaEventDisableTiming));
323+
301324
// configure logging to stdout
302325
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, NULL));
303326
}
@@ -330,3 +353,13 @@ cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src,
330353
return cudaSuccess;
331354
}
332355
}
356+
357+
// Allocate `size` bytes of page-locked (pinned) host memory.
// Pinned memory lets cudaMemcpyAsync run truly asynchronously and at
// higher bandwidth than pageable memory.
// Aborts via CUDA_CHECK on failure; release with ggml_cuda_host_free.
void * ggml_cuda_host_malloc(size_t size) {
    void * ptr = nullptr;
    CUDA_CHECK(cudaMallocHost(&ptr, size));
    return ptr;
}
362+
363+
// Release pinned host memory previously obtained from ggml_cuda_host_malloc.
// Aborts via CUDA_CHECK if the pointer was not allocated with cudaMallocHost.
void ggml_cuda_host_free(void * ptr) {
    CUDA_CHECK(cudaFreeHost(ptr));
}

ggml-cuda.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,14 @@ extern "C" {
2626
} while (0)
2727

2828
extern cublasHandle_t g_cublasH;
29-
extern cudaStream_t g_cudaStream;
29+
extern cudaStream_t g_cudaStream;
30+
extern cudaStream_t g_cudaStream2;
31+
extern cudaEvent_t g_cudaEvent;
3032

3133
void ggml_init_cublas(void);
34+
void * ggml_cuda_host_malloc(size_t size);
35+
void ggml_cuda_host_free(void * ptr);
36+
3237
void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size);
3338
void ggml_cuda_pool_free(void * ptr, size_t size);
3439

@@ -41,6 +46,9 @@ void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t st
4146

4247
cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream);
4348

49+
typedef void (*dequantize_row_q_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
50+
dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(enum ggml_type type);
51+
4452
#ifdef __cplusplus
4553
}
4654
#endif

ggml.c

Lines changed: 29 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -8033,7 +8033,7 @@ static void ggml_compute_forward_mul_mat_f32(
80338033
#if defined(GGML_USE_CUBLAS)
80348034
const float alpha = 1.0f;
80358035
const float beta = 0.0f;
8036-
const int x_ne = ne01 * ne10;
8036+
const int x_ne = ne01 * ne00;
80378037
const int y_ne = ne11 * ne10;
80388038
const int d_ne = ne11 * ne01;
80398039

@@ -8235,25 +8235,27 @@ static void ggml_compute_forward_mul_mat_f16_f32(
82358235
}
82368236

82378237
#if defined(GGML_USE_CUBLAS)
8238-
ggml_fp16_t * const wdata = params->wdata;
8239-
82408238
const float alpha = 1.0f;
82418239
const float beta = 0.0f;
8242-
const int x_ne = ne01 * ne10;
8240+
const int x_ne = ne01 * ne00;
82438241
const int y_ne = ne11 * ne10;
82448242
const int d_ne = ne11 * ne01;
82458243

82468244
size_t x_size, y_size, d_size;
8247-
float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
8248-
float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
8249-
float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
8245+
ggml_fp16_t * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
8246+
ggml_fp16_t * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
8247+
float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
82508248
#else
82518249
float * const wdata = params->wdata;
82528250
#endif
82538251
for (int64_t i03 = 0; i03 < ne03; i03++) {
82548252
for (int64_t i02 = 0; i02 < ne02; i02++) {
82558253
#if defined(GGML_USE_CUBLAS)
8254+
// copy src0 while converting src1
8255+
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i03, i02, g_cudaStream));
8256+
82568257
// with cuBLAS, instead of converting src0 to fp32, we convert src1 to fp16
8258+
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + (ne11 * ne10) * (i03 * ne02 + i02);
82578259
{
82588260
size_t id = 0;
82598261
for (int64_t i01 = 0; i01 < ne11; ++i01) {
@@ -8275,11 +8277,9 @@ static void ggml_compute_forward_mul_mat_f16_f32(
82758277

82768278
#if defined(GGML_USE_CUBLAS)
82778279
const ggml_fp16_t * y = (ggml_fp16_t *) wdata;
8278-
82798280
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
82808281

82818282
// copy data to device
8282-
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i03, i02, g_cudaStream));
82838283
CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(ggml_fp16_t) * y_ne, cudaMemcpyHostToDevice, g_cudaStream));
82848284

82858285
// compute
@@ -8498,39 +8498,19 @@ static void ggml_compute_forward_mul_mat_q_f32(
84988498
#if defined(GGML_USE_CUBLAS)
84998499
const float alpha = 1.0f;
85008500
const float beta = 0.0f;
8501-
const int x_ne = ne01 * ne10;
8501+
const int x_ne = ne01 * ne00;
85028502
const int y_ne = ne11 * ne10;
85038503
const int d_ne = ne11 * ne01;
85048504

85058505
size_t x_size, y_size, d_size, q_size;
8506-
float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
8507-
float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
8508-
float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
8509-
float *d_Q = ggml_cuda_pool_malloc(GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], &q_size);
8506+
float * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
8507+
float * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
8508+
float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
8509+
void * d_Q = ggml_cuda_pool_malloc(GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], &q_size);
85108510

8511-
void (*dequantize_row_q_cuda)(const void * x, float * y, int k, cudaStream_t stream) = NULL;
8512-
if (type == GGML_TYPE_Q4_0) {
8513-
dequantize_row_q_cuda = dequantize_row_q4_0_cuda;
8514-
}
8515-
else if (type == GGML_TYPE_Q4_1) {
8516-
dequantize_row_q_cuda = dequantize_row_q4_1_cuda;
8517-
}
8518-
else if (type == GGML_TYPE_Q4_2) {
8519-
dequantize_row_q_cuda = dequantize_row_q4_2_cuda;
8520-
}
8521-
else if (type == GGML_TYPE_Q5_0) {
8522-
dequantize_row_q_cuda = dequantize_row_q5_0_cuda;
8523-
}
8524-
else if (type == GGML_TYPE_Q5_1) {
8525-
dequantize_row_q_cuda = dequantize_row_q5_1_cuda;
8526-
}
8527-
else if (type == GGML_TYPE_Q8_0) {
8528-
dequantize_row_q_cuda = dequantize_row_q8_0_cuda;
8529-
}
8530-
else {
8531-
GGML_ASSERT(false);
8532-
}
8533-
#elif !defined(GGML_USE_CLBLAST)
8511+
const dequantize_row_q_cuda_t dequantize_row_q_cuda = ggml_get_dequantize_row_q_cuda(type);
8512+
GGML_ASSERT(dequantize_row_q_cuda != NULL);
8513+
#else
85348514
float * const wdata = params->wdata;
85358515
dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
85368516
#endif
@@ -8543,10 +8523,11 @@ static void ggml_compute_forward_mul_mat_q_f32(
85438523

85448524
#if defined(GGML_USE_CUBLAS)
85458525
// copy and dequantize on device
8546-
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, src0, i03, i02, g_cudaStream));
8526+
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, src0, i03, i02, g_cudaStream2));
85478527

8548-
dequantize_row_q_cuda(d_Q, d_X, ne01 * ne00, g_cudaStream);
8528+
dequantize_row_q_cuda(d_Q, d_X, x_ne, g_cudaStream2);
85498529
CUDA_CHECK(cudaGetLastError());
8530+
CUDA_CHECK(cudaEventRecord(g_cudaEvent, g_cudaStream2));
85508531
#elif defined(GGML_USE_CLBLAST)
85518532
const void* x = (char *) src0->data + i03*nb03 + i02*nb02;
85528533
#else
@@ -8560,11 +8541,13 @@ static void ggml_compute_forward_mul_mat_q_f32(
85608541
const float * x = wdata;
85618542
#endif
85628543

8563-
85648544
#if defined(GGML_USE_CUBLAS)
85658545
// copy data to device
85668546
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Y, src1, i03, i02, g_cudaStream));
85678547

8548+
// wait for dequantization
8549+
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStream, g_cudaEvent, 0));
8550+
85688551
// compute
85698552
CUBLAS_CHECK(
85708553
cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
@@ -11588,7 +11571,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
1158811571
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
1158911572
node->n_tasks = 1; // TODO: this actually is doing nothing
1159011573
// the threads are still spinning
11591-
cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
11574+
cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*MAX(ggml_nelements(node->src1), ggml_nelements(node->src0));
1159211575
//printf("src0: ne0 = %d, ne1 = %d, ne = %d\n", node->src0->ne[0], node->src0->ne[1], node->src0->ne[0]*node->src0->ne[1]);
1159311576
//printf("src1: ne0 = %d, ne1 = %d, ne = %d\n", node->src1->ne[0], node->src1->ne[1], node->src1->ne[0]*node->src1->ne[1]);
1159411577
//printf("cur = %zu\n", cur);
@@ -11600,6 +11583,11 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
1160011583
#endif
1160111584
} else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
1160211585
cur = 0;
11586+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
11587+
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
11588+
node->n_tasks = 1;
11589+
}
11590+
#endif
1160311591
} else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
1160411592
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
1160511593
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {

llama.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ struct llama_kv_cache {
136136

137137
struct ggml_context * ctx = NULL;
138138

139-
llama_buffer buf;
139+
llama_ctx_buffer buf;
140140

141141
int n; // number of tokens currently in the cache
142142

@@ -167,7 +167,7 @@ struct llama_model {
167167
struct llama_kv_cache kv_self;
168168

169169
// the model memory buffer
170-
llama_buffer buf;
170+
llama_ctx_buffer buf;
171171

172172
// model memory mapped file
173173
std::unique_ptr<llama_mmap> mapping;
@@ -228,8 +228,8 @@ struct llama_context {
228228

229229
// memory buffers used to evaluate the model
230230
// TODO: move in llama_state
231-
llama_buffer buf_compute;
232-
llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
231+
llama_ctx_buffer buf_compute;
232+
llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
233233

234234
int buf_last = 0;
235235
size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };

llama_util.h

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -405,4 +405,30 @@ struct llama_buffer {
405405
delete[] addr;
406406
}
407407
};
408+
409+
#ifdef GGML_USE_CUBLAS
410+
#include "ggml-cuda.h"
411+
struct llama_ctx_buffer {
412+
uint8_t * addr = NULL;
413+
size_t size = 0;
414+
415+
void resize(size_t size) {
416+
if (addr) {
417+
ggml_cuda_host_free(addr);
418+
}
419+
addr = (uint8_t *) ggml_cuda_host_malloc(size);
420+
this->size = size;
421+
}
422+
423+
~llama_ctx_buffer() {
424+
if (addr) {
425+
ggml_cuda_host_free(addr);
426+
}
427+
}
428+
};
429+
#else
430+
typedef llama_buffer llama_ctx_buffer;
431+
#endif
432+
433+
408434
#endif

0 commit comments

Comments
 (0)