Skip to content

Commit a15ef8f

Browse files
CUDA: fix partial offloading for ne0 % 256 != 0 (#8572)
1 parent 705b7ec commit a15ef8f

File tree

4 files changed

+29
-15
lines changed

4 files changed

+29
-15
lines changed

ggml/include/ggml-backend.h

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -29,21 +29,23 @@ extern "C" {
2929
enum ggml_backend_buffer_usage {
3030
GGML_BACKEND_BUFFER_USAGE_ANY = 0,
3131
GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
32+
GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2,
3233
};
3334

34-
GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
35-
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
36-
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
37-
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
38-
GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
39-
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
40-
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
41-
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
42-
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
43-
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
44-
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
45-
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
46-
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
35+
GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
36+
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
37+
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
38+
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
39+
GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
40+
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
41+
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
42+
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
43+
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
44+
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
45+
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
46+
GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage (ggml_backend_buffer_t buffer);
47+
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
48+
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
4749

4850
//
4951
// Backend

ggml/src/ggml-alloc.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -776,6 +776,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
776776
fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
777777
return false;
778778
}
779+
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
779780
}
780781
}
781782

ggml/src/ggml-backend.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,10 @@ void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backe
134134
}
135135
}
136136

137+
enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer_t buffer) {
138+
return buffer->usage;
139+
}
140+
137141
ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
138142
return buffer->buft;
139143
}

ggml/src/ggml-cuda.cu

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -464,12 +464,12 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
464464
return;
465465
}
466466

467-
if (ggml_is_quantized(tensor->type)) {
467+
if (ggml_is_quantized(tensor->type) && tensor->view_src == nullptr && ggml_backend_buffer_get_usage(buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
468468
// initialize padding to 0 to avoid possible NaN values
469469
size_t original_size = ggml_nbytes(tensor);
470470
size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
471471

472-
if (padded_size > original_size && tensor->view_src == nullptr) {
472+
if (padded_size > original_size) {
473473
ggml_cuda_set_device(ctx->device);
474474
CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
475475
}
@@ -1485,6 +1485,13 @@ static void ggml_cuda_op_mul_mat(
14851485
dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), ggml_nbytes(src0));
14861486
}
14871487

1488+
// If src0 is in a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared:
1489+
if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
1490+
const int64_t nbytes_data = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
1491+
const int64_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
1492+
CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data , 0, nbytes_padding, stream));
1493+
}
1494+
14881495
if (src1_on_device && src1_is_contiguous) {
14891496
dev[id].src1_ddf = (float *) src1->data;
14901497
} else {

0 commit comments

Comments (0)