Commit c620f4d

KV cache quantized to q8_0
1 parent 111163e commit c620f4d
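
This commit replaces the boolean f16/f32 switch for the KV cache (`memory_f16` in `gpt_params`, `f16_kv` in `llama_context_params`) with an explicit `ggml_type kv_type` field defaulting to `GGML_TYPE_Q8_0`, so the cache is quantized to q8_0 unless overridden. The type is exposed on the command line as `-kvt`/`--kv-type` in the common options and in llama-bench, and `--memory-f32` is kept as a shorthand for `--kv-type f32`.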

File tree

15 files changed: +1024 −370 lines

common/common.cpp

Lines changed: 26 additions & 5 deletions
@@ -198,8 +198,30 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.rope_freq_scale = 1.0f/std::stof(argv[i]);
+        } else if (arg == "--kv-type" || arg == "-kvt") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+
+            std::string type_name(argv[i]);
+            for (char & c : type_name) {
+                c = std::tolower(c);
+            }
+
+            if (type_name == "q8_0") {
+                params.kv_type = GGML_TYPE_Q8_0;
+            } else if (type_name == "f16") {
+                params.kv_type = GGML_TYPE_F16;
+            } else if (type_name == "f32") {
+                params.kv_type = GGML_TYPE_F32;
+            } else {
+                fprintf(stderr, "error: unknown KV type: %s. Known types: Q8_0, F16, F32.\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
         } else if (arg == "--memory-f32") {
-            params.memory_f16 = false;
+            params.kv_type = GGML_TYPE_F32;
         } else if (arg == "--top-p") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -652,8 +674,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --rope-freq-scale N   RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
     printf("  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
     printf("  --no-penalize-nl      do not penalize newline token\n");
-    printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
-    printf("                        not recommended: doubles context memory required and no measurable increase in quality\n");
+    printf("  -kvt, --kv-type       the type to use for the KV cache (default: q8_0; alternatives: f16, f32)\n");
     printf("  --temp N              temperature (default: %.1f)\n", (double)params.temp);
     printf("  --perplexity          compute perplexity over each ctx window of the prompt\n");
     printf("  --hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n");
@@ -735,7 +756,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
     lparams.low_vram   = params.low_vram;
     lparams.mul_mat_q  = params.mul_mat_q;
     lparams.seed       = params.seed;
-    lparams.f16_kv     = params.memory_f16;
+    lparams.kv_type    = params.kv_type;
     lparams.use_mmap   = params.use_mmap;
     lparams.use_mlock  = params.use_mlock;
     lparams.logits_all = params.perplexity;
@@ -1201,6 +1222,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
     fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
     fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
     fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
+    fprintf(stream, "kv_type: %s # default: q8_0\n", ggml_type_name(params.kv_type));
     fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
 
     fprintf(stream, "logit_bias:\n");
@@ -1215,7 +1237,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
     fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
     fprintf(stream, "low_vram: %s # default: false\n", params.low_vram ? "true" : "false");
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
-    fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat);
     fprintf(stream, "mirostat_ent: %f # default: 5.0\n", params.mirostat_tau);
     fprintf(stream, "mirostat_lr: %f # default: 0.1\n", params.mirostat_eta);

common/common.h

Lines changed: 2 additions & 1 deletion
@@ -94,9 +94,10 @@ struct gpt_params {
     bool hellaswag         = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
     size_t hellaswag_tasks = 400;   // number of tasks to use when computing the HellaSwag score
 
+    ggml_type kv_type = GGML_TYPE_Q8_0; // the type to use for the KV cache
+
     bool low_vram          = false; // if true, reduce VRAM usage at the cost of performance
     bool mul_mat_q         = true;  // if true, use mul_mat_q kernels instead of cuBLAS
-    bool memory_f16        = true;  // use f16 instead of f32 for memory kv
     bool random_prompt     = false; // do not randomize prompt if none provided
     bool use_color         = false; // use color to distinguish generations and inputs
     bool interactive       = false; // interactive mode
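
The name-to-type mapping introduced in common.cpp is duplicated in llama-bench.cpp below. A shared helper could factor it out; the following is a hypothetical sketch, not something this commit adds:

```cpp
// Hypothetical helper (not in this commit): map a user-supplied name to a
// ggml_type, case-insensitively, mirroring the parsing in common.cpp.
// Returns GGML_TYPE_COUNT as an "unknown type" sentinel for the caller.
#include <algorithm>
#include <cctype>
#include <string>

#include "ggml.h"

static ggml_type kv_type_from_name(std::string name) {
    std::transform(name.begin(), name.end(), name.begin(),
                   [](unsigned char c) { return (char) std::tolower(c); });
    if (name == "q8_0") { return GGML_TYPE_Q8_0; }
    if (name == "f16")  { return GGML_TYPE_F16;  }
    if (name == "f32")  { return GGML_TYPE_F32;  }
    return GGML_TYPE_COUNT; // unknown; caller reports the error
}
```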

examples/llama-bench/llama-bench.cpp

Lines changed: 49 additions & 19 deletions
@@ -127,7 +127,7 @@ struct cmd_params {
     std::vector<int> n_prompt;
     std::vector<int> n_gen;
     std::vector<int> n_batch;
-    std::vector<bool> f32_kv;
+    std::vector<ggml_type> kv_type;
     std::vector<int> n_threads;
     std::vector<int> n_gpu_layers;
     std::vector<int> main_gpu;
@@ -144,7 +144,7 @@ static const cmd_params cmd_params_defaults = {
     /* n_prompt     */ {512},
     /* n_gen        */ {128},
     /* n_batch      */ {512},
-    /* f32_kv       */ {false},
+    /* kv_type      */ {GGML_TYPE_Q8_0},
     /* n_threads    */ {get_num_physical_cores()},
     /* n_gpu_layers */ {99},
     /* main_gpu     */ {0},
@@ -165,7 +165,16 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -p, --n-prompt <n>              (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
     printf("  -n, --n-gen <n>                 (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
     printf("  -b, --batch-size <n>            (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
-    printf("  --memory-f32 <0|1>              (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
+
+    std::string kv_type_default;
+    for (unsigned int i = 0; i < cmd_params_defaults.kv_type.size(); ++i) {
+        if (i > 0) {
+            kv_type_default += ",";
+        }
+        kv_type_default += ggml_type_name(cmd_params_defaults.kv_type[i]);
+    }
+    printf("  -kvt, --kv-type <q8_0|f16|f32>  (default: %s)\n", kv_type_default.c_str());
+
     printf("  -t, --threads <n>               (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
     printf("  -ngl N, --n-gpu-layers <n>      (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
     printf("  -mg i, --main-gpu <n>           (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
@@ -177,7 +186,6 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -v, --verbose                   (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
     printf("\n");
     printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
-
 }
 
 static cmd_params parse_cmd_params(int argc, char ** argv) {
@@ -228,13 +236,32 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = split<int>(argv[i], split_delim);
             params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
-        } else if (arg == "--memory-f32") {
+        } else if (arg == "-kvt" || arg == "--kv-type") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            auto p = split<int>(argv[i], split_delim);
-            params.f32_kv.insert(params.f32_kv.end(), p.begin(), p.end());
+            auto p = split<std::string>(argv[i], split_delim);
+
+            std::vector<ggml_type> kvt;
+            for (const std::string & type_name : p) {
+                if (type_name == "q8_0") {
+                    kvt.push_back(GGML_TYPE_Q8_0);
+                } else if (type_name == "f16") {
+                    kvt.push_back(GGML_TYPE_F16);
+                } else if (type_name == "f32") {
+                    kvt.push_back(GGML_TYPE_F32);
+                } else {
+                    invalid_param = true;
+                    break;
+                }
+            }
+            if (invalid_param) {
+                fprintf(stderr, "error: unknown KV type: %s. Known types: q8_0, f16, f32.\n", argv[i]);
+                break;
+            }
+
+            params.kv_type.insert(params.kv_type.end(), kvt.begin(), kvt.end());
         } else if (arg == "-t" || arg == "--threads") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -332,7 +359,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.n_prompt.empty())     { params.n_prompt = cmd_params_defaults.n_prompt; }
     if (params.n_gen.empty())        { params.n_gen = cmd_params_defaults.n_gen; }
     if (params.n_batch.empty())      { params.n_batch = cmd_params_defaults.n_batch; }
-    if (params.f32_kv.empty())       { params.f32_kv = cmd_params_defaults.f32_kv; }
+    if (params.kv_type.empty())      { params.kv_type = cmd_params_defaults.kv_type; }
     if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
     if (params.main_gpu.empty())     { params.main_gpu = cmd_params_defaults.main_gpu; }
     if (params.mul_mat_q.empty())    { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
@@ -348,7 +375,7 @@ struct cmd_params_instance {
     int n_prompt;
     int n_gen;
     int n_batch;
-    bool f32_kv;
+    ggml_type kv_type;
     int n_threads;
     int n_gpu_layers;
     int main_gpu;
@@ -360,7 +387,7 @@ struct cmd_params_instance {
         llama_context_params lparams = llama_context_default_params();
         lparams.n_ctx = n_prompt + n_gen;
         lparams.n_batch = n_batch;
-        lparams.f16_kv = !f32_kv;
+        lparams.kv_type = kv_type;
         lparams.n_gpu_layers = n_gpu_layers;
         lparams.main_gpu = main_gpu;
         lparams.mul_mat_q = mul_mat_q;
@@ -376,7 +403,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_params & params, int n_gen, int n_prompt) {
 
     for (const auto & m : params.model)
     for (const auto & nb : params.n_batch)
-    for (const auto & fk : params.f32_kv)
+    for (const auto & kvt : params.kv_type)
     for (const auto & nl : params.n_gpu_layers)
     for (const auto & mg : params.main_gpu)
     for (const auto & mmq : params.mul_mat_q)
@@ -388,7 +415,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_params & params, int n_gen, int n_prompt) {
             /* .n_prompt     = */ n_prompt,
             /* .n_gen        = */ n_gen,
             /* .n_batch      = */ nb,
-            /* .f32_kv       = */ fk,
+            /* .kv_type      = */ kvt,
             /* .n_threads    = */ nt,
             /* .n_gpu_layers = */ nl,
             /* .main_gpu     = */ mg,
@@ -439,7 +466,7 @@ struct test {
     uint64_t model_n_params;
     int n_batch;
     int n_threads;
-    bool f32_kv;
+    ggml_type kv_type;
     int n_gpu_layers;
     int main_gpu;
     bool mul_mat_q;
@@ -459,7 +486,7 @@ struct test {
         model_n_params = llama_model_n_params(lmodel);
         n_batch = inst.n_batch;
         n_threads = inst.n_threads;
-        f32_kv = inst.f32_kv;
+        kv_type = inst.kv_type;
         n_gpu_layers = inst.n_gpu_layers;
         main_gpu = inst.main_gpu;
         mul_mat_q = inst.mul_mat_q;
@@ -523,7 +550,7 @@ struct test {
         "cuda", "opencl", "metal", "gpu_blas", "blas",
         "cpu_info", "gpu_info",
         "model_filename", "model_type", "model_size", "model_n_params",
-        "n_batch", "n_threads", "f16_kv",
+        "n_batch", "n_threads", "kv_type",
         "n_gpu_layers", "main_gpu", "mul_mat_q", "low_vram", "tensor_split",
         "n_prompt", "n_gen", "test_time",
         "avg_ns", "stddev_ns",
@@ -543,7 +570,7 @@ struct test {
             return INT;
         }
         if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" ||
-            field == "f16_kv" || field == "mul_mat_q" || field == "low_vram") {
+            field == "mul_mat_q" || field == "low_vram") {
             return BOOL;
         }
         if (field == "avg_ts" || field == "stddev_ts") {
@@ -573,7 +600,7 @@ struct test {
             std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
             cpu_info, gpu_info,
             model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
-            std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
+            std::to_string(n_batch), std::to_string(n_threads), std::string(ggml_type_name(kv_type)),
             std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), std::to_string(low_vram), tensor_split_str,
             std::to_string(n_prompt), std::to_string(n_gen), test_time,
             std::to_string(avg_ns()), std::to_string(stdev_ns()),
@@ -757,8 +784,8 @@ struct markdown_printer : public printer {
         if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
             fields.push_back("n_batch");
         }
-        if (params.f32_kv.size() > 1 || params.f32_kv != cmd_params_defaults.f32_kv) {
-            fields.push_back("f16_kv");
+        if (params.kv_type.size() > 1 || params.kv_type != cmd_params_defaults.kv_type) {
+            fields.push_back("kv_type");
         }
         if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
             fields.push_back("main_gpu");
@@ -826,6 +853,9 @@ struct markdown_printer : public printer {
             } else if (field == "t/s") {
                 snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
                 value = buf;
+            } else if (field == "kv_type") {
+                snprintf(buf, sizeof(buf), "%s", ggml_type_name(t.kv_type));
+                value = buf;
             } else if (vmap.find(field) != vmap.end()) {
                 value = vmap.at(field);
             } else {
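
Because `-kvt` goes through the same comma-delimited split as the other llama-bench parameters, a single run can sweep several cache types, e.g. `llama-bench -m model.gguf -kvt q8_0,f16,f32` (model path illustrative); the `kv_type` column is then printed whenever more than one value is given or the value differs from the default. Note that, unlike the common-options parser, llama-bench does not lowercase the argument first, so the lowercase spellings are required here.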

examples/main/README.md

Lines changed: 2 additions & 2 deletions
@@ -276,9 +276,9 @@ These options help improve the performance and memory usage of the LLaMA models.
 
 - `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root.
 
-### Memory Float 32
+### KV Cache Type
 
-- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement and cached prompt file size but does not appear to increase generation quality in a measurable way. Not recommended.
+- `-kvt, --kv-type`: The data type to use for the KV cache. Defaults to q8_0; the alternatives, f16 and f32, increase memory consumption for only marginal quality differences.
 
 ### Batch Size
 
examples/main/main.cpp

Lines changed: 7 additions & 10 deletions
@@ -36,15 +36,15 @@
 static llama_context ** g_ctx;
 static llama_model ** g_model;
 static gpt_params * g_params;
-static std::vector<llama_token> * g_input_tokens;
+static std::vector<llama_token> * g_embd_inp;
 static std::ostringstream * g_output_ss;
 static std::vector<llama_token> * g_output_tokens;
 static bool is_interacting = false;
 
 
 static void write_logfile(
     const llama_context * ctx, const gpt_params & params, const llama_model * model,
-    const std::vector<llama_token> & input_tokens, const std::string & output,
+    const std::vector<llama_token> & embd_inp, const std::string & output,
     const std::vector<llama_token> & output_tokens
 ) {
     if (params.logdir.empty()) {
@@ -71,7 +71,7 @@ static void write_logfile(
     fprintf(logfile, "binary: main\n");
     char model_desc[128];
     llama_model_desc(model, model_desc, sizeof(model_desc));
-    dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc);
+    dump_non_result_info_yaml(logfile, params, ctx, timestamp, embd_inp, model_desc);
 
     fprintf(logfile, "\n");
     fprintf(logfile, "######################\n");
@@ -95,7 +95,7 @@ static void sigint_handler(int signo) {
         console::cleanup();
         printf("\n");
         llama_print_timings(*g_ctx);
-        write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
+        write_logfile(*g_ctx, *g_params, *g_model, *g_embd_inp, g_output_ss->str(), *g_output_tokens);
         _exit(130);
     }
 }
@@ -238,7 +238,7 @@ int main(int argc, char ** argv) {
     const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
     LOG("add_bos: %d\n", add_bos);
 
-    std::vector<llama_token> embd_inp;
+    std::vector<llama_token> embd_inp; g_embd_inp = &embd_inp;
 
     if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
         LOG("tokenize the prompt\n");
@@ -465,7 +465,6 @@ int main(int argc, char ** argv) {
     int n_session_consumed = 0;
     int n_past_guidance = 0;
 
-    std::vector<int> input_tokens;  g_input_tokens  = &input_tokens;
     std::vector<int> output_tokens; g_output_tokens = &output_tokens;
     std::ostringstream output_ss;   g_output_ss     = &output_ss;
 
@@ -661,9 +660,7 @@ int main(int argc, char ** argv) {
             const std::string token_str = llama_token_to_piece(ctx, id);
             printf("%s", token_str.c_str());
 
-            if (embd.size() > 1) {
-                input_tokens.push_back(id);
-            } else {
+            if (embd.size() == 1) {
                 output_tokens.push_back(id);
                 output_ss << token_str;
             }
@@ -843,7 +840,7 @@ int main(int argc, char ** argv) {
     }
 
     llama_print_timings(ctx);
-    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
+    write_logfile(ctx, params, model, embd_inp, output_ss.str(), output_tokens);
 
     if (ctx_guidance) { llama_free(ctx_guidance); }
     llama_free(ctx);
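
The main.cpp changes are independent of the KV cache type: the logfile now records the full tokenized prompt (`embd_inp`) instead of the separately maintained `input_tokens` vector, and the inverted `embd.size() == 1` check keeps collecting generated tokens exactly as before while dropping the input-token branch.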

examples/quantize-stats/quantize-stats.cpp

Lines changed: 1 addition & 1 deletion
@@ -312,7 +312,7 @@ int main(int argc, char ** argv) {
 
     lparams.n_ctx      = 256;
     lparams.seed       = 1;
-    lparams.f16_kv     = false;
+    lparams.kv_type    = GGML_TYPE_F32;
     lparams.use_mlock  = false;
 
     model = llama_load_model_from_file(params.model.c_str(), lparams);

examples/save-load-state/save-load-state.cpp

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@ int main(int argc, char ** argv) {
 
     lparams.n_ctx     = params.n_ctx;
     lparams.seed      = params.seed;
-    lparams.f16_kv    = params.memory_f16;
+    lparams.kv_type   = params.kv_type;
     lparams.use_mmap  = params.use_mmap;
     lparams.use_mlock = params.use_mlock;
 
