
Commit 20a68a7

LostRuins and ggerganov authored
ggml : add ggml_row_size() (fixes llama out of space) (#4461)
* Fixes "Not enough space in the context's memory pool" encountered on certain models, which seems to be caused by some imprecision related to the automatic casting of floating point values
* do not cast to size_t, instead just use doubles
* ggml : add ggml_row_size(), deprecate ggml_type_sizef()
* ggml : fix row size compute to avoid overflows
* tests : fix sizey -> sizez

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
1 parent 55e87c3 commit 20a68a7

File tree

4 files changed: +27 additions, -18 deletions

examples/benchmark/benchmark-matmult.cpp

Lines changed: 7 additions & 7 deletions
@@ -129,13 +129,13 @@ int main(int argc, char ** argv) {
     const ggml_type qtype = GGML_TYPE_Q4_1;

     size_t ctx_size = 0;
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
-    ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
-    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
-    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez);
+    ctx_size += ggml_row_size(qtype, sizex*sizey);
+    ctx_size += ggml_row_size(qtype, sizex*sizey);
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
     ctx_size += 1024*1024*16;

     printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));

ggml.c

Lines changed: 7 additions & 2 deletions
@@ -2011,8 +2011,13 @@ size_t ggml_type_size(enum ggml_type type) {
     return type_traits[type].type_size;
 }

-float ggml_type_sizef(enum ggml_type type) {
-    return ((float)(type_traits[type].type_size))/type_traits[type].blck_size;
+size_t ggml_row_size(enum ggml_type type, int64_t ne) {
+    assert(ne % ggml_blck_size(type) == 0);
+    return ggml_type_size(type)*ne/ggml_blck_size(type);
+}
+
+double ggml_type_sizef(enum ggml_type type) {
+    return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
 }

 const char * ggml_type_name(enum ggml_type type) {
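
A short usage sketch for the new helper (illustrative, not from the commit), assuming ggml.h is on the include path and the library is linked; note that the assert above requires ne to be a multiple of the type's block size:

    #include <cstdio>
    #include <cstdint>
    #include "ggml.h"

    int main() {
        const int64_t n_embd = 4096;   // hypothetical row length, a multiple of 32

        // For non-quantized types the block size is 1, so this is simply
        // n_embd * sizeof(float).
        const size_t f32_row = ggml_row_size(GGML_TYPE_F32, n_embd);

        // For quantized types, ne must be a multiple of the block size (enforced
        // by the assert above); the result is an exact byte count instead of a
        // rounded floating-point product.
        const size_t q41_row = ggml_row_size(GGML_TYPE_Q4_1, n_embd);

        printf("F32 row: %zu bytes, Q4_1 row: %zu bytes\n", f32_row, q41_row);
        return 0;
    }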

ggml.h

Lines changed: 7 additions & 3 deletions
@@ -641,9 +641,13 @@ extern "C" {
     GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
     GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);

-    GGML_API int    ggml_blck_size (enum ggml_type type);
-    GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
-    GGML_API float  ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
+    GGML_API int    ggml_blck_size(enum ggml_type type);
+    GGML_API size_t ggml_type_size(enum ggml_type type);             // size in bytes for all elements in a block
+    GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
+
+    GGML_DEPRECATED(
+    GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
+    "use ggml_row_size() instead");

     GGML_API const char * ggml_type_name(enum ggml_type type);
     GGML_API const char * ggml_op_name  (enum ggml_op   op);
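
With ggml_type_sizef() now wrapped in GGML_DEPRECATED, existing callers still compile but get a warning pointing at ggml_row_size(). A hypothetical migration sketch (not from the commit), assuming the updated ggml.h: callers that only need a byte count should switch to ggml_row_size(), and anything that really wants the fractional bytes-per-element can derive it from the two remaining accessors instead of the deprecated function.

    #include <cstdio>
    #include "ggml.h"

    int main() {
        const enum ggml_type t = GGML_TYPE_Q4_1;

        // Exact per-block quantities from the stable API...
        const size_t block_bytes = ggml_type_size(t);   // bytes per block
        const int    block_elems = ggml_blck_size(t);   // elements per block

        // ...and the fractional value only where a display or rough estimate
        // needs it.
        const double bytes_per_elem = (double) block_bytes / block_elems;

        // For actual allocations, prefer the exact integer result.
        printf("%s: %.4f bytes/element, 1 Mi elements -> %zu bytes\n",
               ggml_type_name(t), bytes_per_elem,
               ggml_row_size(t, 1024*1024));
        return 0;
    }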

llama.cpp

Lines changed: 6 additions & 6 deletions
@@ -1555,7 +1555,7 @@ static bool llama_kv_cache_init(
     cache.cells.clear();
     cache.cells.resize(n_ctx);

-    cache.buf.resize(n_elements*(ggml_type_sizef(ktype) + ggml_type_sizef(vtype)) + 2u*n_layer*ggml_tensor_overhead());
+    cache.buf.resize(ggml_row_size(ktype, n_elements) + ggml_row_size(vtype, n_elements) + 2u*n_layer*ggml_tensor_overhead());
     memset(cache.buf.data, 0, cache.buf.size);

     struct ggml_init_params params;
@@ -3822,8 +3822,8 @@ static void llm_build_k_shift(
         ggml_rope_custom_inplace(ctx,
                 ggml_view_3d(ctx, kv.k_l[il],
                     n_embd_head, n_head_kv, n_ctx,
-                    ggml_type_sizef(kv.k_l[il]->type)*n_embd_head,
-                    ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa,
+                    ggml_row_size(kv.k_l[il]->type, n_embd_head),
+                    ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
                     0),
                 K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow);
@@ -3852,7 +3852,7 @@ static void llm_build_kv_store(
     cb(v_cur_t, "v_cur_t", il);

     struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa,
-            (ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa)*kv_head);
+            (ggml_row_size(kv.k_l[il]->type, n_embd_gqa))*kv_head);
     cb(k_cache_view, "k_cache_view", il);

     struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa,
@@ -4011,8 +4011,8 @@ static struct ggml_tensor * llm_build_kqv(
     struct ggml_tensor * k =
         ggml_view_3d(ctx, kv.k_l[il],
                 n_embd_head, n_kv, n_head_kv,
-                ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa,
-                ggml_type_sizef(kv.k_l[il]->type)*n_embd_head,
+                ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
+                ggml_row_size(kv.k_l[il]->type, n_embd_head),
                 0);
     cb(k, "k", il);
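
For reference, the ggml_row_size() calls in the hunks above translate directly into byte strides and offsets over the per-layer K cache. An illustrative sketch with made-up geometry (the variable names mirror the diff, the concrete numbers are hypothetical), assuming ggml.h is available and the library is linked:

    #include <cstdio>
    #include <cstdint>
    #include "ggml.h"

    int main() {
        // Hypothetical per-layer cache geometry.
        const int64_t n_embd_head = 128;
        const int64_t n_head_kv   = 8;
        const int64_t n_embd_gqa  = n_embd_head*n_head_kv;
        const enum ggml_type ktype = GGML_TYPE_F16;   // hypothetical cache type

        // Byte strides for a [n_embd_head, n_head_kv, n_ctx] view over kv.k_l[il]:
        // nb1 steps from one head to the next, nb2 from one cache cell to the next.
        const size_t nb1 = ggml_row_size(ktype, n_embd_head);
        const size_t nb2 = ggml_row_size(ktype, n_embd_gqa);

        // Byte offset of the first cell a batch writes, as in llm_build_kv_store.
        const int64_t kv_head = 100;                  // hypothetical write position
        const size_t  offset  = ggml_row_size(ktype, n_embd_gqa)*kv_head;

        printf("nb1 = %zu bytes, nb2 = %zu bytes, offset = %zu bytes\n",
               nb1, nb2, offset);
        return 0;
    }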
