@@ -54,6 +54,13 @@ static std::vector<T> split(const std::string & str, char delim) {
     return values;
 }
 
+template <typename T, typename F>
+static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
+    std::vector<std::string> str_values;
+    std::transform(values.begin(), values.end(), std::back_inserter(str_values), f);
+    return str_values;
+}
+
 template <typename T>
 static T avg(const std::vector<T> & v) {
     if (v.empty()) {
@@ -127,7 +134,8 @@ struct cmd_params {
     std::vector<int> n_prompt;
     std::vector<int> n_gen;
     std::vector<int> n_batch;
-    std::vector<bool> f32_kv;
+    std::vector<ggml_type> type_k;
+    std::vector<ggml_type> type_v;
     std::vector<int> n_threads;
     std::vector<int> n_gpu_layers;
     std::vector<int> main_gpu;
@@ -143,7 +151,8 @@ static const cmd_params cmd_params_defaults = {
     /* n_prompt     */ {512},
     /* n_gen        */ {128},
     /* n_batch      */ {512},
-    /* f32_kv       */ {false},
+    /* type_k       */ {GGML_TYPE_F16},
+    /* type_v       */ {GGML_TYPE_F16},
     /* n_threads    */ {get_num_physical_cores()},
     /* n_gpu_layers */ {99},
     /* main_gpu     */ {0},
@@ -163,7 +172,8 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -p, --n-prompt <n>                (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
     printf("  -n, --n-gen <n>                   (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
     printf("  -b, --batch-size <n>              (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
-    printf("  --memory-f32 <0|1>                (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
+    printf("  -ctk <t>, --cache-type-k <t>      (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
+    printf("  -ctv <t>, --cache-type-v <t>      (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
     printf("  -t, --threads <n>                 (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
     printf("  -ngl, --n-gpu-layers <n>          (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
     printf("  -mg, --main-gpu <i>               (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
@@ -174,9 +184,32 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -v, --verbose                     (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
     printf("\n");
     printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
+}
 
+static ggml_type ggml_type_from_name(const std::string & s) {
+    if (s == "f16") {
+        return GGML_TYPE_F16;
+    }
+    if (s == "q8_0") {
+        return GGML_TYPE_Q8_0;
+    }
+    if (s == "q4_0") {
+        return GGML_TYPE_Q4_0;
+    }
+    if (s == "q4_1") {
+        return GGML_TYPE_Q4_1;
+    }
+    if (s == "q5_0") {
+        return GGML_TYPE_Q5_0;
+    }
+    if (s == "q5_1") {
+        return GGML_TYPE_Q5_1;
+    }
+
+    return GGML_TYPE_COUNT;
 }
 
+
 static cmd_params parse_cmd_params(int argc, char ** argv) {
     cmd_params params;
     std::string arg;
@@ -225,13 +258,38 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = split<int>(argv[i], split_delim);
             params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
-        } else if (arg == "--memory-f32") {
+        } else if (arg == "-ctk" || arg == "--cache-type-k") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            auto p = split<int>(argv[i], split_delim);
-            params.f32_kv.insert(params.f32_kv.end(), p.begin(), p.end());
+            auto p = split<std::string>(argv[i], split_delim);
+            std::vector<ggml_type> types;
+            for (const auto & t : p) {
+                ggml_type gt = ggml_type_from_name(t);
+                if (gt == GGML_TYPE_COUNT) {
+                    invalid_param = true;
+                    break;
+                }
+                types.push_back(gt);
+            }
+            params.type_k.insert(params.type_k.end(), types.begin(), types.end());
+        } else if (arg == "-ctv" || arg == "--cache-type-v") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<std::string>(argv[i], split_delim);
+            std::vector<ggml_type> types;
+            for (const auto & t : p) {
+                ggml_type gt = ggml_type_from_name(t);
+                if (gt == GGML_TYPE_COUNT) {
+                    invalid_param = true;
+                    break;
+                }
+                types.push_back(gt);
+            }
+            params.type_v.insert(params.type_v.end(), types.begin(), types.end());
         } else if (arg == "-t" || arg == "--threads") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -322,7 +380,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.n_prompt.empty())     { params.n_prompt = cmd_params_defaults.n_prompt; }
     if (params.n_gen.empty())        { params.n_gen = cmd_params_defaults.n_gen; }
     if (params.n_batch.empty())      { params.n_batch = cmd_params_defaults.n_batch; }
-    if (params.f32_kv.empty())       { params.f32_kv = cmd_params_defaults.f32_kv; }
+    if (params.type_k.empty())       { params.type_k = cmd_params_defaults.type_k; }
+    if (params.type_v.empty())       { params.type_v = cmd_params_defaults.type_v; }
     if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
     if (params.main_gpu.empty())     { params.main_gpu = cmd_params_defaults.main_gpu; }
     if (params.mul_mat_q.empty())    { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
@@ -337,7 +396,8 @@ struct cmd_params_instance {
     int n_prompt;
     int n_gen;
     int n_batch;
-    bool f32_kv;
+    ggml_type type_k;
+    ggml_type type_v;
     int n_threads;
     int n_gpu_layers;
     int main_gpu;
@@ -366,7 +426,8 @@ struct cmd_params_instance {
 
         cparams.n_ctx = n_prompt + n_gen;
         cparams.n_batch = n_batch;
-        cparams.f16_kv = !f32_kv;
+        cparams.type_k = type_k;
+        cparams.type_v = type_v;
         cparams.mul_mat_q = mul_mat_q;
 
         return cparams;
@@ -381,15 +442,17 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
     for (const auto & mg : params.main_gpu)
     for (const auto & ts : params.tensor_split)
     for (const auto & nb : params.n_batch)
-    for (const auto & fk : params.f32_kv)
+    for (const auto & tk : params.type_k)
+    for (const auto & tv : params.type_v)
     for (const auto & mmq : params.mul_mat_q)
     for (const auto & nt : params.n_threads) {
         cmd_params_instance instance = {
             /* .model        = */ m,
             /* .n_prompt     = */ n_prompt,
             /* .n_gen        = */ n_gen,
             /* .n_batch      = */ nb,
-            /* .f32_kv       = */ fk,
+            /* .type_k       = */ tk,
+            /* .type_v       = */ tv,
             /* .n_threads    = */ nt,
             /* .n_gpu_layers = */ nl,
             /* .main_gpu     = */ mg,
@@ -411,7 +474,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & mg : params.main_gpu)
     for (const auto & ts : params.tensor_split)
     for (const auto & nb : params.n_batch)
-    for (const auto & fk : params.f32_kv)
+    for (const auto & tk : params.type_k)
+    for (const auto & tv : params.type_v)
     for (const auto & mmq : params.mul_mat_q)
     for (const auto & nt : params.n_threads) {
         for (const auto & n_prompt : params.n_prompt) {
@@ -423,7 +487,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .n_prompt     = */ n_prompt,
                 /* .n_gen        = */ 0,
                 /* .n_batch      = */ nb,
-                /* .f32_kv       = */ fk,
+                /* .type_k       = */ tk,
+                /* .type_v       = */ tv,
                 /* .n_threads    = */ nt,
                 /* .n_gpu_layers = */ nl,
                 /* .main_gpu     = */ mg,
@@ -442,7 +507,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .n_prompt     = */ 0,
                 /* .n_gen        = */ n_gen,
                 /* .n_batch      = */ nb,
-                /* .f32_kv       = */ fk,
+                /* .type_k       = */ tk,
+                /* .type_v       = */ tv,
                 /* .n_threads    = */ nt,
                 /* .n_gpu_layers = */ nl,
                 /* .main_gpu     = */ mg,
@@ -490,7 +556,8 @@ struct test {
     uint64_t model_n_params;
     int n_batch;
    int n_threads;
-    bool f32_kv;
+    ggml_type type_k;
+    ggml_type type_v;
     int n_gpu_layers;
     int main_gpu;
     bool mul_mat_q;
@@ -509,7 +576,8 @@ struct test {
         model_n_params = llama_model_n_params(lmodel);
         n_batch = inst.n_batch;
         n_threads = inst.n_threads;
-        f32_kv = inst.f32_kv;
+        type_k = inst.type_k;
+        type_v = inst.type_v;
         n_gpu_layers = inst.n_gpu_layers;
         main_gpu = inst.main_gpu;
         mul_mat_q = inst.mul_mat_q;
@@ -572,7 +640,7 @@ struct test {
             "cuda", "opencl", "metal", "gpu_blas", "blas",
             "cpu_info", "gpu_info",
             "model_filename", "model_type", "model_size", "model_n_params",
-            "n_batch", "n_threads", "f16_kv",
+            "n_batch", "n_threads", "type_k", "type_v",
             "n_gpu_layers", "main_gpu", "mul_mat_q", "tensor_split",
             "n_prompt", "n_gen", "test_time",
             "avg_ns", "stddev_ns",
@@ -622,7 +690,7 @@ struct test {
             std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
             cpu_info, gpu_info,
             model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
-            std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
+            std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
             std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), tensor_split_str,
             std::to_string(n_prompt), std::to_string(n_gen), test_time,
             std::to_string(avg_ns()), std::to_string(stdev_ns()),
@@ -806,8 +874,11 @@ struct markdown_printer : public printer {
         if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
             fields.push_back("n_batch");
         }
-        if (params.f32_kv.size() > 1 || params.f32_kv != cmd_params_defaults.f32_kv) {
-            fields.push_back("f16_kv");
+        if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) {
+            fields.push_back("type_k");
+        }
+        if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) {
+            fields.push_back("type_v");
         }
         if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
             fields.push_back("main_gpu");
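
For context, the sketch below is a minimal, self-contained illustration of the name-to-type round trip that the patch relies on: ggml_type_from_name() turns the -ctk/-ctv arguments into ggml_type values (with GGML_TYPE_COUNT as the "invalid" sentinel), while transform_to_str() together with ggml_type_name() turns them back into strings for the usage/default output. The ggml_type enum and ggml_type_name() are stubbed here with a reduced value set purely so the snippet compiles on its own; in the real tree both come from the ggml headers, so treat this as an illustration rather than the project's code.

// Standalone sketch (not part of the patch). ggml_type and ggml_type_name()
// are stubbed with a reduced set of values so this compiles by itself.
#include <algorithm>
#include <cstdio>
#include <iterator>
#include <string>
#include <vector>

enum ggml_type { GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_COUNT };

static const char * ggml_type_name(ggml_type t) {
    switch (t) {
        case GGML_TYPE_F16:  return "f16";
        case GGML_TYPE_Q8_0: return "q8_0";
        case GGML_TYPE_Q4_0: return "q4_0";
        default:             return "unknown";
    }
}

// Same shape as the helper added in the patch: map a user-supplied name to a
// ggml_type, returning GGML_TYPE_COUNT when the name is not recognized.
static ggml_type ggml_type_from_name(const std::string & s) {
    if (s == "f16")  { return GGML_TYPE_F16;  }
    if (s == "q8_0") { return GGML_TYPE_Q8_0; }
    if (s == "q4_0") { return GGML_TYPE_Q4_0; }
    return GGML_TYPE_COUNT;
}

// Same shape as transform_to_str from the patch: stringify a vector of values
// through an arbitrary projection, e.g. ggml_type_name.
template <typename T, typename F>
static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
    std::vector<std::string> str_values;
    std::transform(values.begin(), values.end(), std::back_inserter(str_values), f);
    return str_values;
}

int main() {
    // Parse a comma-split list the way "-ctk f16,q8_0" is handled above.
    const std::vector<std::string> names = {"f16", "q8_0"};
    std::vector<ggml_type> type_k;
    for (const std::string & name : names) {
        const ggml_type gt = ggml_type_from_name(name);
        if (gt == GGML_TYPE_COUNT) {
            fprintf(stderr, "invalid cache type: %s\n", name.c_str());
            return 1;
        }
        type_k.push_back(gt);
    }
    // Print the values back, as print_usage() does for the defaults.
    for (const std::string & s : transform_to_str(type_k, ggml_type_name)) {
        printf("%s\n", s.c_str());
    }
    return 0;
}

With the patch applied, comma-separated values behave like the other list-valued flags: an invocation along the lines of "-ctk f16,q8_0 -ctv f16,q8_0" (shown for illustration; the benchmark binary name is not part of this diff) adds one entry per type to params.type_k and params.type_v, which get_cmd_params_instances() then sweeps as part of the parameter grid.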