@@ -173,6 +173,7 @@ struct cmd_params {
173
173
std::vector<bool > no_kv_offload;
174
174
std::vector<std::vector<float >> tensor_split;
175
175
std::vector<bool > use_mmap;
176
+ std::vector<bool > embeddings;
176
177
int reps;
177
178
bool verbose;
178
179
output_formats output_format;
@@ -192,6 +193,7 @@ static const cmd_params cmd_params_defaults = {
192
193
/* no_kv_offload */ {false },
193
194
/* tensor_split */ {std::vector<float >(llama_max_devices (), 0 .0f )},
194
195
/* use_mmap */ {true },
196
+ /* embeddings */ {false },
195
197
/* reps */ 5 ,
196
198
/* verbose */ false ,
197
199
/* output_format */ MARKDOWN
@@ -214,6 +216,7 @@ static void print_usage(int /* argc */, char ** argv) {
214
216
printf (" -mg, --main-gpu <i> (default: %s)\n " , join (cmd_params_defaults.main_gpu , " ," ).c_str ());
215
217
printf (" -nkvo, --no-kv-offload <0|1> (default: %s)\n " , join (cmd_params_defaults.no_kv_offload , " ," ).c_str ());
216
218
printf (" -mmp, --mmap <0|1> (default: %s)\n " , join (cmd_params_defaults.use_mmap , " ," ).c_str ());
219
+ printf (" -embd, --embeddings <0|1> (default: %s)\n " , join (cmd_params_defaults.embeddings , " ," ).c_str ());
217
220
printf (" -ts, --tensor_split <ts0/ts1/..> (default: 0)\n " );
218
221
printf (" -r, --repetitions <n> (default: %d)\n " , cmd_params_defaults.reps );
219
222
printf (" -o, --output <csv|json|md|sql> (default: %s)\n " , output_format_str (cmd_params_defaults.output_format ));
@@ -382,6 +385,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
382
385
}
383
386
auto p = split<bool >(argv[i], split_delim);
384
387
params.use_mmap .insert (params.use_mmap .end (), p.begin (), p.end ());
388
+ } else if (arg == " -embd" || arg == " --embeddings" ) {
389
+ if (++i >= argc) {
390
+ invalid_param = true ;
391
+ break ;
392
+ }
393
+ auto p = split<bool >(argv[i], split_delim);
394
+ params.embeddings .insert (params.embeddings .end (), p.begin (), p.end ());
385
395
} else if (arg == " -ts" || arg == " --tensor-split" ) {
386
396
if (++i >= argc) {
387
397
invalid_param = true ;
@@ -453,6 +463,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
453
463
if (params.no_kv_offload .empty ()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload ; }
454
464
if (params.tensor_split .empty ()) { params.tensor_split = cmd_params_defaults.tensor_split ; }
455
465
if (params.use_mmap .empty ()) { params.use_mmap = cmd_params_defaults.use_mmap ; }
466
+ if (params.embeddings .empty ()) { params.embeddings = cmd_params_defaults.embeddings ; }
456
467
if (params.n_threads .empty ()) { params.n_threads = cmd_params_defaults.n_threads ; }
457
468
458
469
return params;
@@ -472,6 +483,7 @@ struct cmd_params_instance {
472
483
bool no_kv_offload;
473
484
std::vector<float > tensor_split;
474
485
bool use_mmap;
486
+ bool embeddings;
475
487
476
488
llama_model_params to_llama_mparams () const {
477
489
llama_model_params mparams = llama_model_default_params ();
@@ -502,6 +514,7 @@ struct cmd_params_instance {
502
514
cparams.type_k = type_k;
503
515
cparams.type_v = type_v;
504
516
cparams.offload_kqv = !no_kv_offload;
517
+ cparams.embeddings = embeddings;
505
518
506
519
return cparams;
507
520
}
@@ -517,6 +530,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
517
530
for (const auto & mg : params.main_gpu )
518
531
for (const auto & ts : params.tensor_split )
519
532
for (const auto & mmp : params.use_mmap )
533
+ for (const auto & embd : params.embeddings )
520
534
for (const auto & nb : params.n_batch )
521
535
for (const auto & tk : params.type_k )
522
536
for (const auto & tv : params.type_v )
@@ -540,6 +554,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
540
554
/* .no_kv_offload= */ nkvo,
541
555
/* .tensor_split = */ ts,
542
556
/* .use_mmap = */ mmp,
557
+ /* .embeddings = */ embd,
543
558
};
544
559
instances.push_back (instance);
545
560
}
@@ -562,6 +577,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
562
577
/* .no_kv_offload= */ nkvo,
563
578
/* .tensor_split = */ ts,
564
579
/* .use_mmap = */ mmp,
580
+ /* .embeddings = */ embd,
565
581
};
566
582
instances.push_back (instance);
567
583
}
@@ -597,6 +613,7 @@ struct test {
597
613
bool no_kv_offload;
598
614
std::vector<float > tensor_split;
599
615
bool use_mmap;
616
+ bool embeddings;
600
617
int n_prompt;
601
618
int n_gen;
602
619
std::string test_time;
@@ -619,6 +636,7 @@ struct test {
619
636
no_kv_offload = inst.no_kv_offload ;
620
637
tensor_split = inst.tensor_split ;
621
638
use_mmap = inst.use_mmap ;
639
+ embeddings = inst.embeddings ;
622
640
n_prompt = inst.n_prompt ;
623
641
n_gen = inst.n_gen ;
624
642
// RFC 3339 date-time format
@@ -690,7 +708,7 @@ struct test {
690
708
" n_batch" , " n_threads" , " type_k" , " type_v" ,
691
709
" n_gpu_layers" , " split_mode" ,
692
710
" main_gpu" , " no_kv_offload" ,
693
- " tensor_split" , " use_mmap" ,
711
+ " tensor_split" , " use_mmap" , " embeddings " ,
694
712
" n_prompt" , " n_gen" , " test_time" ,
695
713
" avg_ns" , " stddev_ns" ,
696
714
" avg_ts" , " stddev_ts"
@@ -710,7 +728,7 @@ struct test {
710
728
}
711
729
if (field == " cuda" || field == " opencl" || field == " vulkan" || field == " kompute" || field == " metal" ||
712
730
field == " gpu_blas" || field == " blas" || field == " sycl" ||field == " f16_kv" || field == " no_kv_offload" ||
713
- field == " use_mmap" ) {
731
+ field == " use_mmap" || field == " embeddings " ) {
714
732
return BOOL;
715
733
}
716
734
if (field == " avg_ts" || field == " stddev_ts" ) {
@@ -744,7 +762,7 @@ struct test {
744
762
std::to_string (n_batch), std::to_string (n_threads), ggml_type_name (type_k), ggml_type_name (type_v),
745
763
std::to_string (n_gpu_layers), split_mode_str (split_mode),
746
764
std::to_string (main_gpu), std::to_string (no_kv_offload),
747
- tensor_split_str, std::to_string (use_mmap),
765
+ tensor_split_str, std::to_string (use_mmap), std::to_string (embeddings),
748
766
std::to_string (n_prompt), std::to_string (n_gen), test_time,
749
767
std::to_string (avg_ns ()), std::to_string (stdev_ns ()),
750
768
std::to_string (avg_ts ()), std::to_string (stdev_ts ())
@@ -914,6 +932,9 @@ struct markdown_printer : public printer {
914
932
if (field == " use_mmap" ) {
915
933
return " mmap" ;
916
934
}
935
+ if (field == " embeddings" ) {
936
+ return " embd" ;
937
+ }
917
938
if (field == " tensor_split" ) {
918
939
return " ts" ;
919
940
}
@@ -957,6 +978,9 @@ struct markdown_printer : public printer {
957
978
if (params.use_mmap .size () > 1 || params.use_mmap != cmd_params_defaults.use_mmap ) {
958
979
fields.emplace_back (" use_mmap" );
959
980
}
981
+ if (params.embeddings .size () > 1 || params.embeddings != cmd_params_defaults.embeddings ) {
982
+ fields.emplace_back (" embeddings" );
983
+ }
960
984
fields.emplace_back (" test" );
961
985
fields.emplace_back (" t/s" );
962
986
0 commit comments