Skip to content

Commit cf0a43b

Browse files
authored
llama-bench : add defrag-thold, check for invalid ranges (#13487)
1 parent f0d46ef commit cf0a43b

File tree

3 files changed

+49
-15
lines changed

3 files changed

+49
-15
lines changed

include/llama.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -345,7 +345,7 @@ extern "C" {
345345
float yarn_beta_fast; // YaRN low correction dim
346346
float yarn_beta_slow; // YaRN high correction dim
347347
uint32_t yarn_orig_ctx; // YaRN original context size
348-
float defrag_thold; // defragment the KV cache if holes/size > thold, < 0 disabled (default)
348+
float defrag_thold; // defragment the KV cache if holes/size > thold, <= 0 disabled (default)
349349

350350
ggml_backend_sched_eval_callback cb_eval;
351351
void * cb_eval_user_data;

tools/llama-bench/README.md

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -43,12 +43,13 @@ test parameters:
4343
-ub, --ubatch-size <n> (default: 512)
4444
-ctk, --cache-type-k <t> (default: f16)
4545
-ctv, --cache-type-v <t> (default: f16)
46-
-t, --threads <n> (default: 16)
46+
-dt, --defrag-thold <f> (default: -1)
47+
-t, --threads <n> (default: system dependent)
4748
-C, --cpu-mask <hex,hex> (default: 0x0)
4849
--cpu-strict <0|1> (default: 0)
4950
--poll <0...100> (default: 50)
5051
-ngl, --n-gpu-layers <n> (default: 99)
51-
-rpc, --rpc <rpc_servers> (default: )
52+
-rpc, --rpc <rpc_servers> (default: none)
5253
-sm, --split-mode <none|layer|row> (default: layer)
5354
-mg, --main-gpu <i> (default: 0)
5455
-nkvo, --no-kv-offload <0|1> (default: 0)
@@ -62,7 +63,7 @@ test parameters:
6263
6364
Multiple values can be given for each parameter by separating them with ','
6465
or by specifying the parameter multiple times. Ranges can be given as
65-
'start-end' or 'start-end+step' or 'start-end*mult'.
66+
'first-last' or 'first-last+step' or 'first-last*mult'.
6667
```
6768

6869
llama-bench can perform three types of tests:

tools/llama-bench/llama-bench.cpp

Lines changed: 44 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -211,13 +211,19 @@ static std::vector<int> parse_int_range(const std::string & s) {
211211
for (int i = first; i <= last;) {
212212
result.push_back(i);
213213

214+
int prev_i = i;
215+
214216
if (op == '+') {
215217
i += step;
216218
} else if (op == '*') {
217219
i *= step;
218220
} else {
219221
throw std::invalid_argument("invalid range format");
220222
}
223+
224+
if (i <= prev_i) {
225+
throw std::invalid_argument("invalid range");
226+
}
221227
}
222228
search_start = match.suffix().first;
223229
}
@@ -239,6 +245,7 @@ struct cmd_params {
239245
std::vector<int> n_ubatch;
240246
std::vector<ggml_type> type_k;
241247
std::vector<ggml_type> type_v;
248+
std::vector<float> defrag_thold;
242249
std::vector<int> n_threads;
243250
std::vector<std::string> cpu_mask;
244251
std::vector<bool> cpu_strict;
@@ -274,6 +281,7 @@ static const cmd_params cmd_params_defaults = {
274281
/* n_ubatch */ { 512 },
275282
/* type_k */ { GGML_TYPE_F16 },
276283
/* type_v */ { GGML_TYPE_F16 },
284+
/* defrag_thold */ { -1.0f },
277285
/* n_threads */ { cpu_get_num_math() },
278286
/* cpu_mask */ { "0x0" },
279287
/* cpu_strict */ { false },
@@ -335,6 +343,8 @@ static void print_usage(int /* argc */, char ** argv) {
335343
join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
336344
printf(" -ctv, --cache-type-v <t> (default: %s)\n",
337345
join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
346+
printf(" -dt, --defrag-thold <f> (default: %s)\n",
347+
join(cmd_params_defaults.defrag_thold, ",").c_str());
338348
printf(" -t, --threads <n> (default: %s)\n",
339349
join(cmd_params_defaults.n_threads, ",").c_str());
340350
printf(" -C, --cpu-mask <hex,hex> (default: %s)\n",
@@ -368,7 +378,7 @@ static void print_usage(int /* argc */, char ** argv) {
368378
printf(
369379
"Multiple values can be given for each parameter by separating them with ','\n"
370380
"or by specifying the parameter multiple times. Ranges can be given as\n"
371-
"'start-end' or 'start-end+step' or 'start-end*mult'.\n");
381+
"'first-last' or 'first-last+step' or 'first-last*mult'.\n");
372382
}
373383

374384
static ggml_type ggml_type_from_name(const std::string & s) {
@@ -519,6 +529,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
519529
break;
520530
}
521531
params.type_v.insert(params.type_v.end(), types.begin(), types.end());
532+
} else if (arg == "-dt" || arg == "--defrag-thold") {
533+
if (++i >= argc) {
534+
invalid_param = true;
535+
break;
536+
}
537+
auto p = string_split<float>(argv[i], split_delim);
538+
params.defrag_thold.insert(params.defrag_thold.end(), p.begin(), p.end());
522539
} else if (arg == "-t" || arg == "--threads") {
523540
if (++i >= argc) {
524541
invalid_param = true;
@@ -825,6 +842,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
825842
if (params.type_v.empty()) {
826843
params.type_v = cmd_params_defaults.type_v;
827844
}
845+
if (params.defrag_thold.empty()) {
846+
params.defrag_thold = cmd_params_defaults.defrag_thold;
847+
}
828848
if (params.n_gpu_layers.empty()) {
829849
params.n_gpu_layers = cmd_params_defaults.n_gpu_layers;
830850
}
@@ -883,6 +903,7 @@ struct cmd_params_instance {
883903
int n_ubatch;
884904
ggml_type type_k;
885905
ggml_type type_v;
906+
float defrag_thold;
886907
int n_threads;
887908
std::string cpu_mask;
888909
bool cpu_strict;
@@ -959,15 +980,16 @@ struct cmd_params_instance {
959980
llama_context_params to_llama_cparams() const {
960981
llama_context_params cparams = llama_context_default_params();
961982

962-
cparams.n_ctx = n_prompt + n_gen + n_depth;
963-
cparams.n_batch = n_batch;
964-
cparams.n_ubatch = n_ubatch;
965-
cparams.type_k = type_k;
966-
cparams.type_v = type_v;
967-
cparams.offload_kqv = !no_kv_offload;
968-
cparams.flash_attn = flash_attn;
969-
cparams.embeddings = embeddings;
970-
cparams.op_offload = !no_op_offload;
983+
cparams.n_ctx = n_prompt + n_gen + n_depth;
984+
cparams.n_batch = n_batch;
985+
cparams.n_ubatch = n_ubatch;
986+
cparams.type_k = type_k;
987+
cparams.type_v = type_v;
988+
cparams.defrag_thold = defrag_thold;
989+
cparams.offload_kqv = !no_kv_offload;
990+
cparams.flash_attn = flash_attn;
991+
cparams.embeddings = embeddings;
992+
cparams.op_offload = !no_op_offload;
971993

972994
return cparams;
973995
}
@@ -992,6 +1014,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
9921014
for (const auto & nub : params.n_ubatch)
9931015
for (const auto & tk : params.type_k)
9941016
for (const auto & tv : params.type_v)
1017+
for (const auto & defrag_thold : params.defrag_thold)
9951018
for (const auto & nkvo : params.no_kv_offload)
9961019
for (const auto & fa : params.flash_attn)
9971020
for (const auto & nt : params.n_threads)
@@ -1012,6 +1035,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
10121035
/* .n_ubatch = */ nub,
10131036
/* .type_k = */ tk,
10141037
/* .type_v = */ tv,
1038+
/* .defrag_thold = */ defrag_thold,
10151039
/* .n_threads = */ nt,
10161040
/* .cpu_mask = */ cm,
10171041
/* .cpu_strict = */ cs,
@@ -1044,6 +1068,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
10441068
/* .n_ubatch = */ nub,
10451069
/* .type_k = */ tk,
10461070
/* .type_v = */ tv,
1071+
/* .defrag_thold = */ defrag_thold,
10471072
/* .n_threads = */ nt,
10481073
/* .cpu_mask = */ cm,
10491074
/* .cpu_strict = */ cs,
@@ -1076,6 +1101,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
10761101
/* .n_ubatch = */ nub,
10771102
/* .type_k = */ tk,
10781103
/* .type_v = */ tv,
1104+
/* .defrag_thold = */ defrag_thold,
10791105
/* .n_threads = */ nt,
10801106
/* .cpu_mask = */ cm,
10811107
/* .cpu_strict = */ cs,
@@ -1117,6 +1143,7 @@ struct test {
11171143
int poll;
11181144
ggml_type type_k;
11191145
ggml_type type_v;
1146+
float defrag_thold;
11201147
int n_gpu_layers;
11211148
llama_split_mode split_mode;
11221149
int main_gpu;
@@ -1151,6 +1178,7 @@ struct test {
11511178
poll = inst.poll;
11521179
type_k = inst.type_k;
11531180
type_v = inst.type_v;
1181+
defrag_thold = inst.defrag_thold;
11541182
n_gpu_layers = inst.n_gpu_layers;
11551183
split_mode = inst.split_mode;
11561184
main_gpu = inst.main_gpu;
@@ -1206,6 +1234,7 @@ struct test {
12061234
"model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
12071235
"cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
12081236
"split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides",
1237+
"defrag_thold",
12091238
"use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time",
12101239
"avg_ns", "stddev_ns", "avg_ts", "stddev_ts",
12111240
};
@@ -1225,7 +1254,7 @@ struct test {
12251254
field == "use_mmap" || field == "embeddings") {
12261255
return BOOL;
12271256
}
1228-
if (field == "avg_ts" || field == "stddev_ts") {
1257+
if (field == "avg_ts" || field == "stddev_ts" || field == "defrag_thold") {
12291258
return FLOAT;
12301259
}
12311260
return STRING;
@@ -1292,6 +1321,7 @@ struct test {
12921321
std::to_string(flash_attn),
12931322
tensor_split_str,
12941323
tensor_buft_overrides_str,
1324+
std::to_string(defrag_thold),
12951325
std::to_string(use_mmap),
12961326
std::to_string(embeddings),
12971327
std::to_string(no_op_offload),
@@ -1558,6 +1588,9 @@ struct markdown_printer : public printer {
15581588
if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) {
15591589
fields.emplace_back("type_v");
15601590
}
1591+
if (params.defrag_thold.size() > 1 || params.defrag_thold != cmd_params_defaults.defrag_thold) {
1592+
fields.emplace_back("defrag_thold");
1593+
}
15611594
if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
15621595
fields.emplace_back("main_gpu");
15631596
}

0 commit comments

Comments
 (0)