Skip to content

Commit 8d5ab9a

Browse files
max-krasnyansky
authored and
fmz
committed
llama-bench: turn threadpool params into vectors, add output headers, etc
1 parent 658f16c commit 8d5ab9a

File tree

1 file changed

+88
-45
lines changed

1 file changed

+88
-45
lines changed

examples/llama-bench/llama-bench.cpp

Lines changed: 88 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,9 @@ struct cmd_params {
225225
std::vector<ggml_type> type_k;
226226
std::vector<ggml_type> type_v;
227227
std::vector<int> n_threads;
228+
std::vector<std::string> cpu_mask;
229+
std::vector<bool> cpu_strict;
230+
std::vector<int> poll;
228231
std::vector<int> n_gpu_layers;
229232
std::vector<std::string> rpc_servers;
230233
std::vector<llama_split_mode> split_mode;
@@ -235,8 +238,8 @@ struct cmd_params {
235238
std::vector<bool> use_mmap;
236239
std::vector<bool> embeddings;
237240
ggml_numa_strategy numa;
238-
cpu_params cpuparams;
239241
int reps;
242+
int prio;
240243
bool verbose;
241244
output_formats output_format;
242245
output_formats output_format_stderr;
@@ -252,6 +255,9 @@ static const cmd_params cmd_params_defaults = {
252255
/* type_k */ {GGML_TYPE_F16},
253256
/* type_v */ {GGML_TYPE_F16},
254257
/* n_threads */ {cpu_get_num_math()},
258+
/* cpu_mask */ {"0x0"},
259+
/* cpu_strict */ {false},
260+
/* poll */ {50},
255261
/* n_gpu_layers */ {99},
256262
/* rpc_servers */ {""},
257263
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
@@ -262,8 +268,8 @@ static const cmd_params cmd_params_defaults = {
262268
/* use_mmap */ {true},
263269
/* embeddings */ {false},
264270
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
265-
/* cpuparams */ {},
266271
/* reps */ 5,
272+
/* prio */ 0,
267273
/* verbose */ false,
268274
/* output_format */ MARKDOWN,
269275
/* output_format_stderr */ NONE,
@@ -283,6 +289,9 @@ static void print_usage(int /* argc */, char ** argv) {
283289
printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
284290
printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
285291
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
292+
printf(" -C, --cpu-mask <hex,hex> (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str());
293+
printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
294+
printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
286295
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
287296
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
288297
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
@@ -291,13 +300,10 @@ static void print_usage(int /* argc */, char ** argv) {
291300
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
292301
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
293302
printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
294-
printf(" -C, --cpu-mask <hex> (default: 0x0)\n");
295-
printf(" --cpu-strict <0|1> (default: %d)\n", cmd_params_defaults.cpuparams.strict_cpu);
296-
printf(" --priority <0|1|2|3> (default: %d)\n", cmd_params_defaults.cpuparams.priority);
297-
printf(" --poll <0|1> (default: %d)\n", cmd_params_defaults.cpuparams.poll);
298303
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
299304
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
300305
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
306+
printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio);
301307
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
302308
printf(" -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
303309
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
@@ -344,6 +350,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
344350
params.output_format_stderr = cmd_params_defaults.output_format_stderr;
345351
params.reps = cmd_params_defaults.reps;
346352
params.numa = cmd_params_defaults.numa;
353+
params.prio = cmd_params_defaults.prio;
347354

348355
for (int i = 1; i < argc; i++) {
349356
arg = argv[i];
@@ -439,6 +446,33 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
439446
}
440447
auto p = string_split<int>(argv[i], split_delim);
441448
params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
449+
} else if (arg == "-C" || arg == "--cpu-mask") {
450+
if (++i >= argc) {
451+
invalid_param = true;
452+
break;
453+
}
454+
auto p = string_split<std::string>(argv[i], split_delim);
455+
params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end());
456+
} else if (arg == "--cpu-strict") {
457+
if (++i >= argc) {
458+
invalid_param = true;
459+
break;
460+
}
461+
auto p = string_split<bool>(argv[i], split_delim);
462+
params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end());
463+
} else if (arg == "--poll") {
464+
if (++i >= argc) {
465+
invalid_param = true;
466+
break;
467+
}
468+
auto p = string_split<int>(argv[i], split_delim);
469+
params.poll.insert(params.poll.end(), p.begin(), p.end());
470+
} else if (arg == "--prio") {
471+
if (++i >= argc) {
472+
invalid_param = true;
473+
break;
474+
}
475+
params.prio = std::stoi(argv[i]);
442476
} else if (arg == "-ngl" || arg == "--n-gpu-layers") {
443477
if (++i >= argc) {
444478
invalid_param = true;
@@ -498,32 +532,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
498532
else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
499533
else { invalid_param = true; break; }
500534
}
501-
} else if (arg == "-C" || arg == "--cpu-mask") {
502-
if (++i >= argc) {
503-
invalid_param = true;
504-
break;
505-
}
506-
std::string mask = argv[i];
507-
params.cpuparams.mask_valid = true;
508-
invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask);
509-
} else if (arg == "--prio") {
510-
if (++i >= argc) {
511-
invalid_param = true;
512-
break;
513-
}
514-
params.cpuparams.priority = std::stoul(argv[i]);
515-
} else if (arg == "--cpu-strict") {
516-
if (++i >= argc) {
517-
invalid_param = true;
518-
break;
519-
}
520-
params.cpuparams.strict_cpu = std::stoul(argv[i]);
521-
} else if (arg == "--poll") {
522-
if (++i >= argc) {
523-
invalid_param = true;
524-
break;
525-
}
526-
params.cpuparams.poll = std::stoul(argv[i]);
527535
} else if (arg == "-fa" || arg == "--flash-attn") {
528536
if (++i >= argc) {
529537
invalid_param = true;
@@ -617,6 +625,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
617625
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
618626
if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
619627
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
628+
if (params.cpu_mask.empty()) { params.cpu_mask = cmd_params_defaults.cpu_mask; }
629+
if (params.cpu_strict.empty()) { params.cpu_strict = cmd_params_defaults.cpu_strict; }
630+
if (params.poll.empty()) { params.poll = cmd_params_defaults.poll; }
620631

621632
return params;
622633
}
@@ -630,6 +641,9 @@ struct cmd_params_instance {
630641
ggml_type type_k;
631642
ggml_type type_v;
632643
int n_threads;
644+
std::string cpu_mask;
645+
bool cpu_strict;
646+
int poll;
633647
int n_gpu_layers;
634648
std::string rpc_servers;
635649
llama_split_mode split_mode;
@@ -699,7 +713,10 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
699713
for (const auto & tv : params.type_v)
700714
for (const auto & nkvo : params.no_kv_offload)
701715
for (const auto & fa : params.flash_attn)
702-
for (const auto & nt : params.n_threads) {
716+
for (const auto & nt : params.n_threads)
717+
for (const auto & cm : params.cpu_mask)
718+
for (const auto & cs : params.cpu_strict)
719+
for (const auto & pl : params.poll) {
703720
for (const auto & n_prompt : params.n_prompt) {
704721
if (n_prompt == 0) {
705722
continue;
@@ -713,6 +730,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
713730
/* .type_k = */ tk,
714731
/* .type_v = */ tv,
715732
/* .n_threads = */ nt,
733+
/* .cpu_mask = */ cm,
734+
/* .cpu_strict = */ cs,
735+
/* .poll = */ pl,
716736
/* .n_gpu_layers = */ nl,
717737
/* .rpc_servers = */ rpc,
718738
/* .split_mode = */ sm,
@@ -739,6 +759,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
739759
/* .type_k = */ tk,
740760
/* .type_v = */ tv,
741761
/* .n_threads = */ nt,
762+
/* .cpu_mask = */ cm,
763+
/* .cpu_strict = */ cs,
764+
/* .poll = */ pl,
742765
/* .n_gpu_layers = */ nl,
743766
/* .rpc_servers = */ rpc,
744767
/* .split_mode = */ sm,
@@ -765,6 +788,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
765788
/* .type_k = */ tk,
766789
/* .type_v = */ tv,
767790
/* .n_threads = */ nt,
791+
/* .cpu_mask = */ cm,
792+
/* .cpu_strict = */ cs,
793+
/* .poll = */ pl,
768794
/* .n_gpu_layers = */ nl,
769795
/* .rpc_servers = */ rpc,
770796
/* .split_mode = */ sm,
@@ -801,6 +827,9 @@ struct test {
801827
int n_batch;
802828
int n_ubatch;
803829
int n_threads;
830+
std::string cpu_mask;
831+
bool cpu_strict;
832+
int poll;
804833
bool has_rpc;
805834
ggml_type type_k;
806835
ggml_type type_v;
@@ -827,6 +856,9 @@ struct test {
827856
n_batch = inst.n_batch;
828857
n_ubatch = inst.n_ubatch;
829858
n_threads = inst.n_threads;
859+
cpu_mask = inst.cpu_mask;
860+
cpu_strict = inst.cpu_strict;
861+
poll = inst.poll;
830862
has_rpc = !inst.rpc_servers.empty();
831863
type_k = inst.type_k;
832864
type_v = inst.type_v;
@@ -904,13 +936,14 @@ struct test {
904936
"cpu_info", "gpu_info",
905937
"model_filename", "model_type", "model_size", "model_n_params",
906938
"n_batch", "n_ubatch",
907-
"n_threads", "type_k", "type_v",
939+
"n_threads", "cpu_mask", "cpu_strict", "poll",
940+
"type_k", "type_v",
908941
"n_gpu_layers", "split_mode",
909942
"main_gpu", "no_kv_offload", "flash_attn",
910943
"tensor_split", "use_mmap", "embeddings",
911944
"n_prompt", "n_gen", "test_time",
912945
"avg_ns", "stddev_ns",
913-
"avg_ts", "stddev_ts"
946+
"avg_ts", "stddev_ts",
914947
};
915948
return fields;
916949
}
@@ -919,7 +952,7 @@ struct test {
919952

920953
static field_type get_field_type(const std::string & field) {
921954
if (field == "build_number" || field == "n_batch" || field == "n_ubatch" ||
922-
field == "n_threads" ||
955+
field == "n_threads" || field == "poll" ||
923956
field == "model_size" || field == "model_n_params" ||
924957
field == "n_gpu_layers" || field == "main_gpu" ||
925958
field == "n_prompt" || field == "n_gen" ||
@@ -928,6 +961,7 @@ struct test {
928961
}
929962
if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
930963
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
964+
field == "cpu_strict" ||
931965
field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
932966
return BOOL;
933967
}
@@ -960,7 +994,8 @@ struct test {
960994
cpu_info, gpu_info,
961995
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
962996
std::to_string(n_batch), std::to_string(n_ubatch),
963-
std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
997+
std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll),
998+
ggml_type_name(type_k), ggml_type_name(type_v),
964999
std::to_string(n_gpu_layers), split_mode_str(split_mode),
9651000
std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
9661001
tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
@@ -1099,7 +1134,7 @@ struct markdown_printer : public printer {
10991134
return -30;
11001135
}
11011136
if (field == "t/s") {
1102-
return 16;
1137+
return 20;
11031138
}
11041139
if (field == "size" || field == "params") {
11051140
return 10;
@@ -1181,6 +1216,15 @@ struct markdown_printer : public printer {
11811216
if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
11821217
fields.emplace_back("n_threads");
11831218
}
1219+
if (params.cpu_mask.size() > 1 || params.cpu_mask != cmd_params_defaults.cpu_mask) {
1220+
fields.emplace_back("cpu_mask");
1221+
}
1222+
if (params.cpu_strict.size() > 1 || params.cpu_strict != cmd_params_defaults.cpu_strict) {
1223+
fields.emplace_back("cpu_strict");
1224+
}
1225+
if (params.poll.size() > 1 || params.poll != cmd_params_defaults.poll) {
1226+
fields.emplace_back("poll");
1227+
}
11841228
if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
11851229
fields.emplace_back("n_batch");
11861230
}
@@ -1434,8 +1478,6 @@ int main(int argc, char ** argv) {
14341478
llama_model * lmodel = nullptr;
14351479
const cmd_params_instance * prev_inst = nullptr;
14361480

1437-
postprocess_cpu_params(params.cpuparams);
1438-
14391481
for (const auto & inst : params_instances) {
14401482
// keep the same model between tests when possible
14411483
if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
@@ -1463,12 +1505,13 @@ int main(int argc, char ** argv) {
14631505
llama_kv_cache_clear(ctx);
14641506

14651507
struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
1466-
tpp.strict_cpu = params.cpuparams.strict_cpu;
1467-
tpp.prio = params.cpuparams.priority;
1468-
tpp.poll = params.cpuparams.poll;
1469-
if (params.cpuparams.mask_valid) {
1470-
std::memcpy(&tpp.cpumask[0], &params.cpuparams.cpumask[0], GGML_MAX_N_THREADS);
1508+
if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
1509+
LOG_TEE("%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
1510+
exit(1);
14711511
}
1512+
tpp.strict_cpu = t.cpu_strict;
1513+
tpp.poll = t.poll;
1514+
tpp.prio = params.prio;
14721515

14731516
struct ggml_compute_threadpool* threadpool = ggml_create_threadpool(&tpp);
14741517
if (!threadpool) {

0 commit comments

Comments
 (0)