@@ -225,6 +225,9 @@ struct cmd_params {
225
225
std::vector<ggml_type> type_k;
226
226
std::vector<ggml_type> type_v;
227
227
std::vector<int > n_threads;
228
+ std::vector<std::string> cpu_mask;
229
+ std::vector<bool > cpu_strict;
230
+ std::vector<int > poll;
228
231
std::vector<int > n_gpu_layers;
229
232
std::vector<std::string> rpc_servers;
230
233
std::vector<llama_split_mode> split_mode;
@@ -235,8 +238,8 @@ struct cmd_params {
235
238
std::vector<bool > use_mmap;
236
239
std::vector<bool > embeddings;
237
240
ggml_numa_strategy numa;
238
- cpu_params cpuparams;
239
241
int reps;
242
+ int prio;
240
243
bool verbose;
241
244
output_formats output_format;
242
245
output_formats output_format_stderr;
@@ -252,6 +255,9 @@ static const cmd_params cmd_params_defaults = {
252
255
/* type_k */ {GGML_TYPE_F16},
253
256
/* type_v */ {GGML_TYPE_F16},
254
257
/* n_threads */ {cpu_get_num_math ()},
258
+ /* cpu_mask */ {" 0x0" },
259
+ /* cpu_strict */ {false },
260
+ /* poll */ {50 },
255
261
/* n_gpu_layers */ {99 },
256
262
/* rpc_servers */ {" " },
257
263
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
@@ -262,8 +268,8 @@ static const cmd_params cmd_params_defaults = {
262
268
/* use_mmap */ {true },
263
269
/* embeddings */ {false },
264
270
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
265
- /* cpuparams */ {},
266
271
/* reps */ 5 ,
272
+ /* prio */ 0 ,
267
273
/* verbose */ false ,
268
274
/* output_format */ MARKDOWN,
269
275
/* output_format_stderr */ NONE,
@@ -283,6 +289,9 @@ static void print_usage(int /* argc */, char ** argv) {
283
289
printf (" -ctk, --cache-type-k <t> (default: %s)\n " , join (transform_to_str (cmd_params_defaults.type_k , ggml_type_name), " ," ).c_str ());
284
290
printf (" -ctv, --cache-type-v <t> (default: %s)\n " , join (transform_to_str (cmd_params_defaults.type_v , ggml_type_name), " ," ).c_str ());
285
291
printf (" -t, --threads <n> (default: %s)\n " , join (cmd_params_defaults.n_threads , " ," ).c_str ());
292
+ printf (" -C, --cpu-mask <hex,hex> (default: %s)\n " , join (cmd_params_defaults.cpu_mask , " ," ).c_str ());
293
+ printf (" --cpu-strict <0|1> (default: %s)\n " , join (cmd_params_defaults.cpu_strict , " ," ).c_str ());
294
+ printf (" --poll <0...100> (default: %s)\n " , join (cmd_params_defaults.poll , " ," ).c_str ());
286
295
printf (" -ngl, --n-gpu-layers <n> (default: %s)\n " , join (cmd_params_defaults.n_gpu_layers , " ," ).c_str ());
287
296
printf (" -rpc, --rpc <rpc_servers> (default: %s)\n " , join (cmd_params_defaults.rpc_servers , " ," ).c_str ());
288
297
printf (" -sm, --split-mode <none|layer|row> (default: %s)\n " , join (transform_to_str (cmd_params_defaults.split_mode , split_mode_str), " ," ).c_str ());
@@ -291,13 +300,10 @@ static void print_usage(int /* argc */, char ** argv) {
291
300
printf (" -fa, --flash-attn <0|1> (default: %s)\n " , join (cmd_params_defaults.flash_attn , " ," ).c_str ());
292
301
printf (" -mmp, --mmap <0|1> (default: %s)\n " , join (cmd_params_defaults.use_mmap , " ," ).c_str ());
293
302
printf (" --numa <distribute|isolate|numactl> (default: disabled)\n " );
294
- printf (" -C, --cpu-mask <hex> (default: 0x0)\n " );
295
- printf (" --cpu-strict <0|1> (default: %d)\n " , cmd_params_defaults.cpuparams .strict_cpu );
296
- printf (" --priority <0|1|2|3> (default: %d)\n " , cmd_params_defaults.cpuparams .priority );
297
- printf (" --poll <0|1> (default: %d)\n " , cmd_params_defaults.cpuparams .poll );
298
303
printf (" -embd, --embeddings <0|1> (default: %s)\n " , join (cmd_params_defaults.embeddings , " ," ).c_str ());
299
304
printf (" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n " );
300
305
printf (" -r, --repetitions <n> (default: %d)\n " , cmd_params_defaults.reps );
306
+ printf (" --prio <0|1|2|3> (default: %d)\n " , cmd_params_defaults.prio );
301
307
printf (" -o, --output <csv|json|md|sql> (default: %s)\n " , output_format_str (cmd_params_defaults.output_format ));
302
308
printf (" -oe, --output-err <csv|json|md|sql> (default: %s)\n " , output_format_str (cmd_params_defaults.output_format_stderr ));
303
309
printf (" -v, --verbose (default: %s)\n " , cmd_params_defaults.verbose ? " 1" : " 0" );
@@ -344,6 +350,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
344
350
params.output_format_stderr = cmd_params_defaults.output_format_stderr ;
345
351
params.reps = cmd_params_defaults.reps ;
346
352
params.numa = cmd_params_defaults.numa ;
353
+ params.prio = cmd_params_defaults.prio ;
347
354
348
355
for (int i = 1 ; i < argc; i++) {
349
356
arg = argv[i];
@@ -439,6 +446,33 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
439
446
}
440
447
auto p = string_split<int >(argv[i], split_delim);
441
448
params.n_threads .insert (params.n_threads .end (), p.begin (), p.end ());
449
+ } else if (arg == " -C" || arg == " --cpu-mask" ) {
450
+ if (++i >= argc) {
451
+ invalid_param = true ;
452
+ break ;
453
+ }
454
+ auto p = string_split<std::string>(argv[i], split_delim);
455
+ params.cpu_mask .insert (params.cpu_mask .end (), p.begin (), p.end ());
456
+ } else if (arg == " --cpu-strict" ) {
457
+ if (++i >= argc) {
458
+ invalid_param = true ;
459
+ break ;
460
+ }
461
+ auto p = string_split<bool >(argv[i], split_delim);
462
+ params.cpu_strict .insert (params.cpu_strict .end (), p.begin (), p.end ());
463
+ } else if (arg == " --poll" ) {
464
+ if (++i >= argc) {
465
+ invalid_param = true ;
466
+ break ;
467
+ }
468
+ auto p = string_split<int >(argv[i], split_delim);
469
+ params.poll .insert (params.poll .end (), p.begin (), p.end ());
470
+ } else if (arg == " --prio" ) {
471
+ if (++i >= argc) {
472
+ invalid_param = true ;
473
+ break ;
474
+ }
475
+ params.prio = std::stoi (argv[i]);
442
476
} else if (arg == " -ngl" || arg == " --n-gpu-layers" ) {
443
477
if (++i >= argc) {
444
478
invalid_param = true ;
@@ -498,32 +532,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
498
532
else if (value == " numactl" ) { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
499
533
else { invalid_param = true ; break ; }
500
534
}
501
- } else if (arg == " -C" || arg == " --cpu-mask" ) {
502
- if (++i >= argc) {
503
- invalid_param = true ;
504
- break ;
505
- }
506
- std::string mask = argv[i];
507
- params.cpuparams .mask_valid = true ;
508
- invalid_param = !parse_cpu_mask (mask, params.cpuparams .cpumask );
509
- } else if (arg == " --prio" ) {
510
- if (++i >= argc) {
511
- invalid_param = true ;
512
- break ;
513
- }
514
- params.cpuparams .priority = std::stoul (argv[i]);
515
- } else if (arg == " --cpu-strict" ) {
516
- if (++i >= argc) {
517
- invalid_param = true ;
518
- break ;
519
- }
520
- params.cpuparams .strict_cpu = std::stoul (argv[i]);
521
- } else if (arg == " --poll" ) {
522
- if (++i >= argc) {
523
- invalid_param = true ;
524
- break ;
525
- }
526
- params.cpuparams .poll = std::stoul (argv[i]);
527
535
} else if (arg == " -fa" || arg == " --flash-attn" ) {
528
536
if (++i >= argc) {
529
537
invalid_param = true ;
@@ -617,6 +625,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
617
625
if (params.use_mmap .empty ()) { params.use_mmap = cmd_params_defaults.use_mmap ; }
618
626
if (params.embeddings .empty ()) { params.embeddings = cmd_params_defaults.embeddings ; }
619
627
if (params.n_threads .empty ()) { params.n_threads = cmd_params_defaults.n_threads ; }
628
+ if (params.cpu_mask .empty ()) { params.cpu_mask = cmd_params_defaults.cpu_mask ; }
629
+ if (params.cpu_strict .empty ()) { params.cpu_strict = cmd_params_defaults.cpu_strict ; }
630
+ if (params.poll .empty ()) { params.poll = cmd_params_defaults.poll ; }
620
631
621
632
return params;
622
633
}
@@ -630,6 +641,9 @@ struct cmd_params_instance {
630
641
ggml_type type_k;
631
642
ggml_type type_v;
632
643
int n_threads;
644
+ std::string cpu_mask;
645
+ bool cpu_strict;
646
+ int poll;
633
647
int n_gpu_layers;
634
648
std::string rpc_servers;
635
649
llama_split_mode split_mode;
@@ -699,7 +713,10 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
699
713
for (const auto & tv : params.type_v )
700
714
for (const auto & nkvo : params.no_kv_offload )
701
715
for (const auto & fa : params.flash_attn )
702
- for (const auto & nt : params.n_threads ) {
716
+ for (const auto & nt : params.n_threads )
717
+ for (const auto & cm : params.cpu_mask )
718
+ for (const auto & cs : params.cpu_strict )
719
+ for (const auto & pl : params.poll ) {
703
720
for (const auto & n_prompt : params.n_prompt ) {
704
721
if (n_prompt == 0 ) {
705
722
continue ;
@@ -713,6 +730,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
713
730
/* .type_k = */ tk,
714
731
/* .type_v = */ tv,
715
732
/* .n_threads = */ nt,
733
+ /* .cpu_mask = */ cm,
734
+ /* .cpu_strict = */ cs,
735
+ /* .poll = */ pl,
716
736
/* .n_gpu_layers = */ nl,
717
737
/* .rpc_servers = */ rpc,
718
738
/* .split_mode = */ sm,
@@ -739,6 +759,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
739
759
/* .type_k = */ tk,
740
760
/* .type_v = */ tv,
741
761
/* .n_threads = */ nt,
762
+ /* .cpu_mask = */ cm,
763
+ /* .cpu_strict = */ cs,
764
+ /* .poll = */ pl,
742
765
/* .n_gpu_layers = */ nl,
743
766
/* .rpc_servers = */ rpc,
744
767
/* .split_mode = */ sm,
@@ -765,6 +788,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
765
788
/* .type_k = */ tk,
766
789
/* .type_v = */ tv,
767
790
/* .n_threads = */ nt,
791
+ /* .cpu_mask = */ cm,
792
+ /* .cpu_strict = */ cs,
793
+ /* .poll = */ pl,
768
794
/* .n_gpu_layers = */ nl,
769
795
/* .rpc_servers = */ rpc,
770
796
/* .split_mode = */ sm,
@@ -801,6 +827,9 @@ struct test {
801
827
int n_batch;
802
828
int n_ubatch;
803
829
int n_threads;
830
+ std::string cpu_mask;
831
+ bool cpu_strict;
832
+ int poll;
804
833
bool has_rpc;
805
834
ggml_type type_k;
806
835
ggml_type type_v;
@@ -827,6 +856,9 @@ struct test {
827
856
n_batch = inst.n_batch ;
828
857
n_ubatch = inst.n_ubatch ;
829
858
n_threads = inst.n_threads ;
859
+ cpu_mask = inst.cpu_mask ;
860
+ cpu_strict = inst.cpu_strict ;
861
+ poll = inst.poll ;
830
862
has_rpc = !inst.rpc_servers .empty ();
831
863
type_k = inst.type_k ;
832
864
type_v = inst.type_v ;
@@ -904,13 +936,14 @@ struct test {
904
936
" cpu_info" , " gpu_info" ,
905
937
" model_filename" , " model_type" , " model_size" , " model_n_params" ,
906
938
" n_batch" , " n_ubatch" ,
907
- " n_threads" , " type_k" , " type_v" ,
939
+ " n_threads" , " cpu_mask" , " cpu_strict" , " poll" ,
940
+ " type_k" , " type_v" ,
908
941
" n_gpu_layers" , " split_mode" ,
909
942
" main_gpu" , " no_kv_offload" , " flash_attn" ,
910
943
" tensor_split" , " use_mmap" , " embeddings" ,
911
944
" n_prompt" , " n_gen" , " test_time" ,
912
945
" avg_ns" , " stddev_ns" ,
913
- " avg_ts" , " stddev_ts"
946
+ " avg_ts" , " stddev_ts" ,
914
947
};
915
948
return fields;
916
949
}
@@ -919,7 +952,7 @@ struct test {
919
952
920
953
static field_type get_field_type (const std::string & field) {
921
954
if (field == " build_number" || field == " n_batch" || field == " n_ubatch" ||
922
- field == " n_threads" ||
955
+ field == " n_threads" || field == " poll " ||
923
956
field == " model_size" || field == " model_n_params" ||
924
957
field == " n_gpu_layers" || field == " main_gpu" ||
925
958
field == " n_prompt" || field == " n_gen" ||
@@ -928,6 +961,7 @@ struct test {
928
961
}
929
962
if (field == " cuda" || field == " vulkan" || field == " kompute" || field == " metal" ||
930
963
field == " gpu_blas" || field == " blas" || field == " sycl" ||field == " f16_kv" || field == " no_kv_offload" ||
964
+ field == " cpu_strict" ||
931
965
field == " flash_attn" || field == " use_mmap" || field == " embeddings" ) {
932
966
return BOOL;
933
967
}
@@ -960,7 +994,8 @@ struct test {
960
994
cpu_info, gpu_info,
961
995
model_filename, model_type, std::to_string (model_size), std::to_string (model_n_params),
962
996
std::to_string (n_batch), std::to_string (n_ubatch),
963
- std::to_string (n_threads), ggml_type_name (type_k), ggml_type_name (type_v),
997
+ std::to_string (n_threads), cpu_mask, std::to_string (cpu_strict), std::to_string (poll),
998
+ ggml_type_name (type_k), ggml_type_name (type_v),
964
999
std::to_string (n_gpu_layers), split_mode_str (split_mode),
965
1000
std::to_string (main_gpu), std::to_string (no_kv_offload), std::to_string (flash_attn),
966
1001
tensor_split_str, std::to_string (use_mmap), std::to_string (embeddings),
@@ -1099,7 +1134,7 @@ struct markdown_printer : public printer {
1099
1134
return -30 ;
1100
1135
}
1101
1136
if (field == " t/s" ) {
1102
- return 16 ;
1137
+ return 20 ;
1103
1138
}
1104
1139
if (field == " size" || field == " params" ) {
1105
1140
return 10 ;
@@ -1181,6 +1216,15 @@ struct markdown_printer : public printer {
1181
1216
if (params.n_threads .size () > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
1182
1217
fields.emplace_back (" n_threads" );
1183
1218
}
1219
+ if (params.cpu_mask .size () > 1 || params.cpu_mask != cmd_params_defaults.cpu_mask ) {
1220
+ fields.emplace_back (" cpu_mask" );
1221
+ }
1222
+ if (params.cpu_strict .size () > 1 || params.cpu_strict != cmd_params_defaults.cpu_strict ) {
1223
+ fields.emplace_back (" cpu_strict" );
1224
+ }
1225
+ if (params.poll .size () > 1 || params.poll != cmd_params_defaults.poll ) {
1226
+ fields.emplace_back (" poll" );
1227
+ }
1184
1228
if (params.n_batch .size () > 1 || params.n_batch != cmd_params_defaults.n_batch ) {
1185
1229
fields.emplace_back (" n_batch" );
1186
1230
}
@@ -1434,8 +1478,6 @@ int main(int argc, char ** argv) {
1434
1478
llama_model * lmodel = nullptr ;
1435
1479
const cmd_params_instance * prev_inst = nullptr ;
1436
1480
1437
- postprocess_cpu_params (params.cpuparams );
1438
-
1439
1481
for (const auto & inst : params_instances) {
1440
1482
// keep the same model between tests when possible
1441
1483
if (!lmodel || !prev_inst || !inst.equal_mparams (*prev_inst)) {
@@ -1463,12 +1505,13 @@ int main(int argc, char ** argv) {
1463
1505
llama_kv_cache_clear (ctx);
1464
1506
1465
1507
struct ggml_threadpool_params tpp = ggml_threadpool_params_default (t.n_threads );
1466
- tpp.strict_cpu = params.cpuparams .strict_cpu ;
1467
- tpp.prio = params.cpuparams .priority ;
1468
- tpp.poll = params.cpuparams .poll ;
1469
- if (params.cpuparams .mask_valid ) {
1470
- std::memcpy (&tpp.cpumask [0 ], ¶ms.cpuparams .cpumask [0 ], GGML_MAX_N_THREADS);
1508
+ if (!parse_cpu_mask (t.cpu_mask , tpp.cpumask )) {
1509
+ LOG_TEE (" %s: failed to parse cpu-mask: %s\n " , __func__, t.cpu_mask .c_str ());
1510
+ exit (1 );
1471
1511
}
1512
+ tpp.strict_cpu = t.cpu_strict ;
1513
+ tpp.poll = t.poll ;
1514
+ tpp.prio = params.prio ;
1472
1515
1473
1516
struct ggml_compute_threadpool * threadpool = ggml_create_threadpool (&tpp);
1474
1517
if (!threadpool) {
0 commit comments