
Commit 069a999

ngxsonarthw authored and committed
common : improve -ctv -ctk CLI arguments (ggml-org#10806)
* common : improve ctv ctk cli argument
* regenerate docs
* even better approach
* use std::vector
1 parent 0baed04 commit 069a999

File tree

5 files changed: +60 −51 lines changed


common/arg.cpp

Lines changed: 45 additions & 6 deletions
```diff
@@ -145,6 +145,35 @@ static void common_params_handle_model_default(common_params & params) {
     }
 }
 
+const std::vector<ggml_type> kv_cache_types = {
+    GGML_TYPE_F32,
+    GGML_TYPE_F16,
+    GGML_TYPE_BF16,
+    GGML_TYPE_Q8_0,
+    GGML_TYPE_Q4_0,
+    GGML_TYPE_Q4_1,
+    GGML_TYPE_IQ4_NL,
+    GGML_TYPE_Q5_0,
+    GGML_TYPE_Q5_1,
+};
+
+static ggml_type kv_cache_type_from_str(const std::string & s) {
+    for (const auto & type : kv_cache_types) {
+        if (ggml_type_name(type) == s) {
+            return type;
+        }
+    }
+    throw std::runtime_error("Unsupported cache type: " + s);
+}
+
+static std::string get_all_kv_cache_types() {
+    std::ostringstream msg;
+    for (const auto & type : kv_cache_types) {
+        msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
+    }
+    return msg.str();
+}
+
 //
 // CLI argument parsing functions
 //
@@ -1174,18 +1203,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
     add_opt(common_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
-        string_format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
+        string_format(
+            "KV cache data type for K\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.cache_type_k)
+        ),
         [](common_params & params, const std::string & value) {
-            // TODO: get the type right here
-            params.cache_type_k = value;
+            params.cache_type_k = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
     add_opt(common_arg(
         {"-ctv", "--cache-type-v"}, "TYPE",
-        string_format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
+        string_format(
+            "KV cache data type for V\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.cache_type_v)
+        ),
         [](common_params & params, const std::string & value) {
-            // TODO: get the type right here
-            params.cache_type_v = value;
+            params.cache_type_v = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
     add_opt(common_arg(
```
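For reference, the table-driven approach above keeps the supported types in a single `std::vector`, and both the name-to-enum lookup and the "allowed values" help text are derived from that one list. Below is a minimal, self-contained sketch of the same pattern; `cache_type`, `cache_type_name`, and the `CT_*` constants are hypothetical stand-ins so the sketch compiles on its own, not the real `ggml_type` / `ggml_type_name` API.

```cpp
#include <cstdio>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

// Hypothetical stand-ins for ggml_type / ggml_type_name (illustration only).
enum cache_type { CT_F32, CT_F16, CT_Q8_0 };

static const char * cache_type_name(cache_type t) {
    switch (t) {
        case CT_F32:  return "f32";
        case CT_F16:  return "f16";
        case CT_Q8_0: return "q8_0";
    }
    return "?";
}

// Single source of truth: the list of supported KV cache types.
static const std::vector<cache_type> kv_cache_types = { CT_F32, CT_F16, CT_Q8_0 };

// Name -> enum lookup; unknown names throw instead of being stored as raw strings.
static cache_type kv_cache_type_from_str(const std::string & s) {
    for (const auto & type : kv_cache_types) {
        if (cache_type_name(type) == s) {
            return type;
        }
    }
    throw std::runtime_error("Unsupported cache type: " + s);
}

// Comma-separated list reused by the -ctk/-ctv help text.
static std::string get_all_kv_cache_types() {
    std::ostringstream msg;
    for (const auto & type : kv_cache_types) {
        msg << cache_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
    }
    return msg.str();
}

int main() {
    std::printf("allowed values: %s\n", get_all_kv_cache_types().c_str()); // f32, f16, q8_0
    std::printf("q8_0 -> %d\n", (int) kv_cache_type_from_str("q8_0"));     // 2
    return 0;
}
```

Adding a new cache type in this scheme means appending one entry to the vector; the parser and the help text pick it up automatically.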

common/common.cpp

Lines changed: 2 additions & 34 deletions
```diff
@@ -1015,38 +1015,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     return mparams;
 }
 
-static ggml_type kv_cache_type_from_str(const std::string & s) {
-    if (s == "f32") {
-        return GGML_TYPE_F32;
-    }
-    if (s == "f16") {
-        return GGML_TYPE_F16;
-    }
-    if (s == "bf16") {
-        return GGML_TYPE_BF16;
-    }
-    if (s == "q8_0") {
-        return GGML_TYPE_Q8_0;
-    }
-    if (s == "q4_0") {
-        return GGML_TYPE_Q4_0;
-    }
-    if (s == "q4_1") {
-        return GGML_TYPE_Q4_1;
-    }
-    if (s == "iq4_nl") {
-        return GGML_TYPE_IQ4_NL;
-    }
-    if (s == "q5_0") {
-        return GGML_TYPE_Q5_0;
-    }
-    if (s == "q5_1") {
-        return GGML_TYPE_Q5_1;
-    }
-
-    throw std::runtime_error("Unsupported cache type: " + s);
-}
-
 struct llama_context_params common_context_params_to_llama(const common_params & params) {
     auto cparams = llama_context_default_params();
 
@@ -1081,8 +1049,8 @@ struct llama_context_params common_context_params_to_llama(const common_params &
         cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
     }
 
-    cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
-    cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
+    cparams.type_k = params.cache_type_k;
+    cparams.type_v = params.cache_type_v;
 
     return cparams;
 }
```

common/common.h

Lines changed: 2 additions & 2 deletions
```diff
@@ -286,8 +286,8 @@ struct common_params {
     bool warmup = true; // warmup run
     bool check_tensors = false; // validate tensor data
 
-    std::string cache_type_k = "f16"; // KV cache data type for the K
-    std::string cache_type_v = "f16"; // KV cache data type for the V
+    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
+    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
 
     // multimodal models (see examples/llava)
     std::string mmproj = ""; // path to multimodal projector // NOLINT
```
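Because `common_params` now stores the `ggml_type` enum directly, the string is parsed exactly once, at argument-parsing time, and downstream code just copies the value. A rough sketch of what that looks like for a caller embedding the common library, assuming llama.cpp's common headers and based only on the fields shown in this diff:

```cpp
#include "common.h"  // common_params, common_context_params_to_llama()

void configure_kv_cache(common_params & params) {
    // previously: params.cache_type_k = "q8_0"; (validated much later)
    params.cache_type_k = GGML_TYPE_Q8_0;
    params.cache_type_v = GGML_TYPE_Q8_0;
    // common_context_params_to_llama() now copies these enums verbatim into
    // llama_context_params::type_k / type_v (see common/common.cpp above).
}
```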

examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
```diff
@@ -51,6 +51,7 @@ else()
     add_subdirectory(speculative)
     add_subdirectory(speculative-simple)
     add_subdirectory(tokenize)
+    add_subdirectory(gen-docs)
    if (NOT GGML_BACKEND_DL)
        # these examples use the backends directly and cannot be built with dynamic loading
        add_subdirectory(convert-llama2c-to-ggml)
```
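The `gen-docs` example added to the build here is presumably the tool used to regenerate the argument tables (the commit notes mention "regenerate docs"); the `examples/server/README.md` changes below reflect the new help text.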

examples/server/README.md

Lines changed: 10 additions & 9 deletions
```diff
@@ -62,8 +62,8 @@ The project is under active development, and we are [looking for feedback and co
 | `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
 | `-dkvc, --dump-kv-cache` | verbose print of the KV cache |
 | `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) |
-| `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
-| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
+| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
+| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
 | `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: 0.1, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
 | `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
 | `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
@@ -138,6 +138,7 @@ The project is under active development, and we are [looking for feedback and co
 | -------- | ----------- |
 | `--no-context-shift` | disables context shift on inifinite text generation (default: disabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) |
 | `-sp, --special` | special tokens output enabled (default: false) |
+| `--no-warmup` | skip warming up the model with an empty run |
 | `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
 | `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
 | `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
@@ -146,7 +147,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--host HOST` | ip address to listen (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
 | `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
 | `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
-| `--no-webui` | disable the Web UI<br/>(env: LLAMA_ARG_NO_WEBUI) |
+| `--no-webui` | Disable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_NO_WEBUI) |
 | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
 | `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
 | `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) |
@@ -164,13 +165,13 @@ The project is under active development, and we are [looking for feedback and co
 | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>list of built-in templates:<br/>chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, exaone3, gemma, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, monarch, openchat, orion, phi3, rwkv-world, vicuna, vicuna-orca, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
 | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |
 | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
-| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16) |
-| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 5) |
-| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.9) |
-| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model) |
+| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
+| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 5)<br/>(env: LLAMA_ARG_DRAFT_MIN) |
+| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.9)<br/>(env: LLAMA_ARG_DRAFT_P_MIN) |
+| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE_DRAFT) |
 | `-devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
-| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model |
-| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused) |
+| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
+| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_MODEL_DRAFT) |
 
 
 Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var.
```
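In practical terms, a value outside the supported list (for example `-ctk q4_k`) is now rejected during argument parsing, where `kv_cache_type_from_str` throws "Unsupported cache type: ...", and the `--help` text for `-ctk`/`-ctv` lists the allowed values; previously the raw string was accepted and only failed later inside `common_context_params_to_llama`. A typical invocation such as `llama-server -m model.gguf -ctk q8_0 -ctv q8_0` (model path is a placeholder) behaves as before, just with the value validated up front.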
