From 2950f62482e65b5f51f898e7b2623e2d811bd986 Mon Sep 17 00:00:00 2001
From: matteo serva
Date: Tue, 29 Apr 2025 20:22:25 +0200
Subject: [PATCH 01/16] initial commit for handling extra template kwargs

---
 common/chat.cpp        | 9 ++++++++-
 common/chat.h          | 2 ++
 tools/server/utils.hpp | 5 +++++
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/common/chat.cpp b/common/chat.cpp
index 1d6974a8c563b..1d08724419d47 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -140,6 +140,7 @@ struct templates_params {
     bool add_generation_prompt = true;
     bool enable_thinking = true;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
+    json extra_context;
 };
 
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -1691,7 +1692,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
 
 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt,inputs.extra_context);
     data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
     data.grammar_lazy = false;
     if (!inputs.json_schema.is_null()) {
@@ -1722,6 +1723,12 @@ static common_chat_params common_chat_templates_apply_jinja(
     params.enable_thinking = inputs.enable_thinking;
     params.grammar = inputs.grammar;
     params.now = inputs.now;
+
+    for(auto el: inputs.chat_template_kwargs)
+    {
+        params.extra_context[el.first] = json::parse(el.second);
+    }
+
     if (!inputs.json_schema.empty()) {
         params.json_schema = json::parse(inputs.json_schema);
     }
diff --git a/common/chat.h b/common/chat.h
index 9f59e6b08738d..c9ecfd889fddb 100644
--- a/common/chat.h
+++ b/common/chat.h
@@ -7,6 +7,7 @@
 #include <chrono>
 #include <string>
 #include <vector>
+#include <map>
 
 struct common_chat_templates;
 
@@ -125,6 +126,7 @@ struct common_chat_templates_inputs {
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
     bool enable_thinking = true;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
+    std::map<std::string,std::string> chat_template_kwargs;
 };
 
 struct common_chat_params {
diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp
index f3e0392a4e9d1..5ca5884e58217 100644
--- a/tools/server/utils.hpp
+++ b/tools/server/utils.hpp
@@ -748,6 +748,11 @@ static json oaicompat_chat_params_parse(
         llama_params["parse_tool_calls"] = true;
     }
 
+    auto chat_template_kwargs_object = json_value(body, "chat_template_kwargs", json::object());
+    for (const auto & item : chat_template_kwargs_object.items()) {
+        inputs.chat_template_kwargs[item.key()] = item.value().dump();
+    }
+
     // if the assistant message appears at the end of list, we do not add end-of-turn token
     // for ex. this can be useful to modify the reasoning process in reasoning models
     bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && opt.prefill_assistant;
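The request-side contract introduced above, illustrated with a minimal chat-completion body. The field name `chat_template_kwargs` comes from the parsing code in `oaicompat_chat_params_parse`; the model name and message are placeholders:

```json
{
  "model": "some-model",
  "messages": [
    { "role": "user", "content": "Hello" }
  ],
  "chat_template_kwargs": { "enable_thinking": false }
}
```

Each value in the object is stored server-side as a dumped string (`item.value().dump()`) and re-parsed with `json::parse` before landing in the template's `extra_context`, so non-string values such as `false` survive the round trip as proper JSON booleans.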
From 46064b4931d8706905fc82d61f33aadac4632049 Mon Sep 17 00:00:00 2001
From: matteo serva
Date: Tue, 29 Apr 2025 21:03:47 +0200
Subject: [PATCH 02/16] enable_thinking and assistant prefill cannot be
 enabled at the same time

---
 tools/server/utils.hpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp
index 5ca5884e58217..36adc0a6e0da7 100644
--- a/tools/server/utils.hpp
+++ b/tools/server/utils.hpp
@@ -768,6 +768,11 @@ static json oaicompat_chat_params_parse(
 
         /* TODO: test this properly */
         inputs.reasoning_format = COMMON_REASONING_FORMAT_NONE;
+
+        if(inputs.chat_template_kwargs.find("enable_thinking") != inputs.chat_template_kwargs.end()) {
+            throw std::runtime_error("Assistant response prefill is incompatible with enable_thinking.");
+        }
+
         inputs.add_generation_prompt = true;
     }
 

From 91681d451854626a91ddf7b8867be9f6f6da8d63 Mon Sep 17 00:00:00 2001
From: matteo serva
Date: Wed, 30 Apr 2025 09:10:36 +0200
Subject: [PATCH 03/16] can set chat_template_kwargs in command line

---
 common/arg.cpp          | 10 ++++++++++
 common/common.h         |  3 +++
 tools/server/server.cpp |  1 +
 tools/server/utils.hpp  |  5 +++++
 4 files changed, 19 insertions(+)

diff --git a/common/arg.cpp b/common/arg.cpp
index 0d0daa3610105..4b1f627247ce1 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2790,6 +2790,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.ssl_file_cert = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
+    add_opt(common_arg(
+        {"--chat-template-kwargs"}, "STRING",
+        string_format("sets additional params for the chat template parser"),
+        [](common_params & params, const std::string & value) {
+            auto parsed = json::parse(value);
+            for (const auto & item : parsed.items()) {
+                params.default_template_kwargs[item.key()] = item.value().dump();
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("CHAT_TEMPLATE_KWARGS"));
     add_opt(common_arg(
         {"-to", "--timeout"}, "N",
         string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
diff --git a/common/common.h b/common/common.h
index f26724b6e1495..64eaefec305e0 100644
--- a/common/common.h
+++ b/common/common.h
@@ -8,6 +8,7 @@
 #include <string>
 #include <string_view>
 #include <vector>
+#include <map>
 #include <sstream>
 
 #ifdef _WIN32
@@ -378,6 +379,8 @@ struct common_params {
     std::string ssl_file_key  = ""; // NOLINT
     std::string ssl_file_cert = ""; // NOLINT
 
+    std::map<std::string,std::string> default_template_kwargs;
+
     // "advanced" endpoints are disabled by default for better security
     bool webui          = true;
     bool endpoint_slots = false;
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 2e78dcd7bf1da..6737e8573df50 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -2085,6 +2085,7 @@ struct server_context {
             /* use_jinja             */ params_base.use_jinja,
             /* prefill_assistant     */ params_base.prefill_assistant,
             /* reasoning_format      */ params_base.reasoning_format,
+            /* chat_template_kwargs  */ params_base.default_template_kwargs,
             /* common_chat_templates */ chat_templates.get(),
             /* allow_image           */ mctx ? mtmd_support_vision(mctx) : false,
             /* allow_audio           */ mctx ? mtmd_support_audio(mctx) : false,
diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp
index 36adc0a6e0da7..4637714899b13 100644
--- a/tools/server/utils.hpp
+++ b/tools/server/utils.hpp
@@ -571,6 +571,7 @@ struct oaicompat_parser_options {
     bool use_jinja;
     bool prefill_assistant;
    common_reasoning_format reasoning_format;
+    const std::map<std::string, std::string> chat_template_kwargs;
     common_chat_templates * tmpls;
     bool allow_image;
     bool allow_audio;
@@ -749,6 +750,10 @@ static json oaicompat_chat_params_parse(
     }
 
     auto chat_template_kwargs_object = json_value(body, "chat_template_kwargs", json::object());
+    for (const auto & item: default_template_kwargs)
+    {
+        inputs.chat_template_kwargs[item.first] = item.second;
+    }
     for (const auto & item : chat_template_kwargs_object.items()) {
         inputs.chat_template_kwargs[item.key()] = item.value().dump();
     }
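A sketch of the command-line usage this patch adds, assuming the usual `llama-server` binary and a placeholder model path; note that at this point in the series the environment variable is still `CHAT_TEMPLATE_KWARGS` (it is renamed in patch 12), and per patch 02 a request that prefills an assistant message while `enable_thinking` is in the kwargs will be rejected:

```sh
# per-server default via the new flag
llama-server -m model.gguf --chat-template-kwargs '{"enable_thinking":false}'

# equivalent, via the environment variable
CHAT_TEMPLATE_KWARGS='{"enable_thinking":false}' llama-server -m model.gguf
```

The option's lambda parses the value as JSON and stores each entry in `params.default_template_kwargs`, mirroring how per-request kwargs are stored.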
From a92e790bf955f1d4f4768ae5a572532bb9d49a12 Mon Sep 17 00:00:00 2001
From: matteo serva
Date: Wed, 30 Apr 2025 13:58:06 +0200
Subject: [PATCH 04/16] added doc

---
 tools/server/README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/server/README.md b/tools/server/README.md
index 06533c172e530..26a9562cea941 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -164,6 +164,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--api-key-file FNAME` | path to file containing API keys (default: none) |
 | `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key<br/>(env: LLAMA_ARG_SSL_KEY_FILE) |
 | `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
+| `--chat-template-kwargs STRING` | JSON object containing additional params for the chat template parser. Example: `--chat-template-kwargs "{\"enable_thinking\":false}"`<br/>(env: CHAT_TEMPLATE_KWARGS) |
 | `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
 | `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
 | `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>[(card)](https://ggml.ai/f0.png)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
@@ -1114,6 +1115,8 @@ See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs
 
 The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}` or `{"type": "json_schema", "schema": {"properties": { "name": { "title": "Name", "type": "string" }, "date": { "title": "Date", "type": "string" }, "participants": { "items": {"type: "string" }, "title": "Participants", "type": "string" } } } }`), similar to other OpenAI-inspired API providers.
 
+`chat_template_kwargs`: Allows sending additional parameters to the Jinja templating system. For example: `{"enable_thinking": false}`
+
 *Examples:*
 
 You can use either Python `openai` library with appropriate checkpoints:
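To make the new README paragraph concrete, a request along these lines should exercise the parameter (the endpoint and port are the server defaults; the payload is illustrative):

```sh
curl http://localhost:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "messages": [{"role": "user", "content": "Why is the sky blue?"}],
        "chat_template_kwargs": {"enable_thinking": false}
    }'
```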
From abda1aed15f5ce974f77e32e145e3bdb597f3bd6 Mon Sep 17 00:00:00 2001
From: matteo serva
Date: Wed, 30 Apr 2025 21:11:01 +0200
Subject: [PATCH 05/16] fixed formatting

---
 common/chat.cpp        | 3 +--
 tools/server/utils.hpp | 5 ++---
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/common/chat.cpp b/common/chat.cpp
index 1d08724419d47..21b4b34b2b5bc 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -1724,8 +1724,7 @@ static common_chat_params common_chat_templates_apply_jinja(
     params.grammar = inputs.grammar;
     params.now = inputs.now;
 
-    for(auto el: inputs.chat_template_kwargs)
-    {
+    for (auto el: inputs.chat_template_kwargs) {
         params.extra_context[el.first] = json::parse(el.second);
     }
 
diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp
index 4637714899b13..f29196c6dcec1 100644
--- a/tools/server/utils.hpp
+++ b/tools/server/utils.hpp
@@ -750,8 +750,7 @@ static json oaicompat_chat_params_parse(
     }
 
     auto chat_template_kwargs_object = json_value(body, "chat_template_kwargs", json::object());
-    for (const auto & item: default_template_kwargs)
-    {
+    for (const auto & item: default_template_kwargs) {
         inputs.chat_template_kwargs[item.first] = item.second;
     }
     for (const auto & item : chat_template_kwargs_object.items()) {
@@ -774,7 +773,7 @@ static json oaicompat_chat_params_parse(
         /* TODO: test this properly */
         inputs.reasoning_format = COMMON_REASONING_FORMAT_NONE;
 
-        if(inputs.chat_template_kwargs.find("enable_thinking") != inputs.chat_template_kwargs.end()) {
+        if (inputs.chat_template_kwargs.find("enable_thinking") != inputs.chat_template_kwargs.end()) {
             throw std::runtime_error("Assistant response prefill is incompatible with enable_thinking.");
         }
 

From 570018b15a749169944f8c4473e3da273a5b46bf Mon Sep 17 00:00:00 2001
From: matteo serva
Date: Sat, 17 May 2025 18:00:03 +0200
Subject: [PATCH 06/16] add support for extra context in generic template init

---
 common/chat.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/chat.cpp b/common/chat.cpp
index 21b4b34b2b5bc..8f46937914340 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -829,7 +829,7 @@ static common_chat_params common_chat_params_init_generic(const common_chat_temp
         inputs.messages,
         "Respond in JSON format, either with `tool_call` (a request to call tools) or with `response` reply to the user's request");
 
-    data.prompt = apply(tmpl, tweaked_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.prompt = apply(tmpl, tweaked_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, inputs.extra_context);
     data.format = COMMON_CHAT_FORMAT_GENERIC;
     return data;
 }

From 8c8b29038d4a56ab4780754d57844eb47ab28377 Mon Sep 17 00:00:00 2001
From: matteo
Date: Fri, 16 May 2025 08:22:43 +0200
Subject: [PATCH 07/16] coding standard: common/chat.cpp

Co-authored-by: Georgi Gerganov
---
 common/chat.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/chat.cpp b/common/chat.cpp
index 8f46937914340..f917cccf014e1 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -1692,7 +1692,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
 
 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt,inputs.extra_context);
+    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, inputs.extra_context);
     data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
     data.grammar_lazy = false;
     if (!inputs.json_schema.is_null()) {

From 56b3a691dcd3a20106ecec19a8e32c572555fa28 Mon Sep 17 00:00:00 2001
From: matteo
Date: Fri, 16 May 2025 08:22:56 +0200
Subject: [PATCH 08/16] coding standard: common/chat.cpp

Co-authored-by: Georgi Gerganov
---
 common/chat.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/chat.cpp b/common/chat.cpp
index f917cccf014e1..371c6c98761fd 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -1724,7 +1724,7 @@ static common_chat_params common_chat_templates_apply_jinja(
     params.grammar = inputs.grammar;
     params.now = inputs.now;
 
-    for (auto el: inputs.chat_template_kwargs) {
+    for (auto el : inputs.chat_template_kwargs) {
         params.extra_context[el.first] = json::parse(el.second);
     }
 

From fe6e44ad3a4a5421da593d956df5d2480fff38ab Mon Sep 17 00:00:00 2001
From: matteo
Date: Fri, 16 May 2025 08:23:40 +0200
Subject: [PATCH 09/16] Apply suggestions from code review

coding standard: cosmetic changes

Co-authored-by: Georgi Gerganov
---
 common/chat.h          | 2 +-
 common/common.h        | 2 +-
 tools/server/utils.hpp | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/common/chat.h b/common/chat.h
index c9ecfd889fddb..ca807c145ee82 100644
--- a/common/chat.h
+++ b/common/chat.h
@@ -126,7 +126,7 @@ struct common_chat_templates_inputs {
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
     bool enable_thinking = true;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
-    std::map<std::string,std::string> chat_template_kwargs;
+    std::map<std::string, std::string> chat_template_kwargs;
 };
 
 struct common_chat_params {
diff --git a/common/common.h b/common/common.h
index 64eaefec305e0..bc951b2dc3ba3 100644
--- a/common/common.h
+++ b/common/common.h
@@ -379,7 +379,7 @@ struct common_params {
     std::string ssl_file_key  = ""; // NOLINT
     std::string ssl_file_cert = ""; // NOLINT
 
-    std::map<std::string,std::string> default_template_kwargs;
+    std::map<std::string, std::string> default_template_kwargs;
 
     // "advanced" endpoints are disabled by default for better security
     bool webui = true;
diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp
index f29196c6dcec1..8f2d92aeecbb4 100644
--- a/tools/server/utils.hpp
+++ b/tools/server/utils.hpp
@@ -750,7 +750,7 @@ static json oaicompat_chat_params_parse(
     }
 
     auto chat_template_kwargs_object = json_value(body, "chat_template_kwargs", json::object());
-    for (const auto & item: default_template_kwargs) {
+    for (const auto & item : default_template_kwargs) {
         inputs.chat_template_kwargs[item.first] = item.second;
     }
     for (const auto & item : chat_template_kwargs_object.items()) {
From 67789ef08c2cc5b098aec88a924cfa4c27ca12a6 Mon Sep 17 00:00:00 2001
From: matteo serva
Date: Fri, 23 May 2025 17:11:57 +0200
Subject: [PATCH 10/16] fix merge conflict

---
 tools/server/utils.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp
index 8f2d92aeecbb4..4181e00752974 100644
--- a/tools/server/utils.hpp
+++ b/tools/server/utils.hpp
@@ -571,7 +571,7 @@ struct oaicompat_parser_options {
     bool use_jinja;
     bool prefill_assistant;
     common_reasoning_format reasoning_format;
-    const std::map<std::string, std::string> chat_template_kwargs;
+    std::map<std::string, std::string> chat_template_kwargs;
     common_chat_templates * tmpls;
     bool allow_image;
     bool allow_audio;
@@ -750,7 +750,7 @@ static json oaicompat_chat_params_parse(
     }
 
     auto chat_template_kwargs_object = json_value(body, "chat_template_kwargs", json::object());
-    for (const auto & item : default_template_kwargs) {
+    for (const auto & item : opt.chat_template_kwargs) {
         inputs.chat_template_kwargs[item.first] = item.second;
     }
     for (const auto & item : chat_template_kwargs_object.items()) {
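One consequence of this fix worth spelling out: the command-line defaults are copied in first and the per-request loop runs second, so request values overwrite duplicate keys. A hypothetical precedence example:

```sh
# server started with:
#   llama-server -m model.gguf --chat-template-kwargs '{"enable_thinking":false}'
# request body contains:
#   "chat_template_kwargs": {"enable_thinking": true}
# effective kwargs for that request:
#   {"enable_thinking": true}
```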
From 9a93863226d488dfb4c927bc045a7b82f53ddba7 Mon Sep 17 00:00:00 2001
From: Olivier Chafik
Date: Mon, 26 May 2025 23:15:02 +0100
Subject: [PATCH 11/16] chat.cpp: simplify calls to apply to ensure systematic
 propagation of extra_context (+ the odd existing additional_context)

---
 common/chat.cpp | 49 ++++++++++++++++++++++++++++---------------------
 1 file changed, 28 insertions(+), 21 deletions(-)

diff --git a/common/chat.cpp b/common/chat.cpp
index 371c6c98761fd..4b093b2eaa6da 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -17,6 +17,8 @@
 #include <string>
 #include <vector>
 
+using json = nlohmann::ordered_json;
+
 static std::string format_time(const std::chrono::system_clock::time_point & now, const std::string & format) {
     auto time = std::chrono::system_clock::to_time_t(now);
     auto local_time = *std::localtime(&time);
@@ -721,16 +723,23 @@ static void foreach_function(const json & tools, const std::function<void(const json &)> & fn) {
 
 static std::string apply(
     const common_chat_template & tmpl,
-    const json & messages,
-    const json & tools,
-    bool add_generation_prompt,
-    const json & extra_context = json())
+    const struct templates_params & inputs,
+    const std::optional<json> & messages_override = std::nullopt,
+    const std::optional<json> & tools_override = std::nullopt,
+    const std::optional<json> & additional_context = std::nullopt)
 {
     minja::chat_template_inputs tmpl_inputs;
-    tmpl_inputs.messages = messages;
-    tmpl_inputs.tools = tools;
-    tmpl_inputs.add_generation_prompt = add_generation_prompt;
-    tmpl_inputs.extra_context = extra_context;
+    tmpl_inputs.messages = messages_override ? *messages_override : inputs.messages;
+    if (tools_override) {
+        tmpl_inputs.tools = *tools_override;
+    } else {
+        tmpl_inputs.tools = inputs.tools.empty() ? json() : inputs.tools;
+    }
+    tmpl_inputs.add_generation_prompt = inputs.add_generation_prompt;
+    tmpl_inputs.extra_context = inputs.extra_context;
+    if (additional_context) {
+        tmpl_inputs.extra_context.merge_patch(*additional_context);
+    }
 
     // TODO: add flag to control date/time, if only for testing purposes.
     // tmpl_inputs.now = std::chrono::system_clock::now();
@@ -829,7 +838,7 @@ static common_chat_params common_chat_params_init_generic(const common_chat_temp
         inputs.messages,
         "Respond in JSON format, either with `tool_call` (a request to call tools) or with `response` reply to the user's request");
 
-    data.prompt = apply(tmpl, tweaked_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, inputs.extra_context);
+    data.prompt = apply(tmpl, inputs, /* messages_override= */ tweaked_messages);
     data.format = COMMON_CHAT_FORMAT_GENERIC;
     return data;
 }
@@ -905,7 +914,7 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
     data.preserved_tokens = {
         "[TOOL_CALLS]",
     };
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.prompt = apply(tmpl, inputs);
     data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
     return data;
 }
@@ -935,7 +944,7 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
             adjusted_messages.push_back(msg);
         }
     }
-    data.prompt = apply(tmpl, adjusted_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {});
+    data.prompt = apply(tmpl, inputs, /* messages_override= */ adjusted_messages);
     data.format = COMMON_CHAT_FORMAT_COMMAND_R7B;
     if (string_ends_with(data.prompt, "<|START_THINKING|>")) {
         if (!inputs.enable_thinking) {
@@ -1123,7 +1132,7 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
     } else {
         data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
     }
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {
+    data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, json {
         {"date_string", format_time(inputs.now, "%d %b %Y")},
         {"tools_in_user_message", false},
         {"builtin_tools", builtin_tools.empty() ? json() : builtin_tools},
@@ -1188,7 +1197,7 @@ static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool w
 
 static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
-    auto prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    auto prompt = apply(tmpl, inputs);
 
     // Hacks to fix the official (broken) prompt.
     // It is advisable to use --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja instead,
@@ -1283,7 +1292,7 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
 static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
     LOG_DBG("%s\n", __func__);
     common_chat_params data;
-    data.prompt = apply(tmpl, inputs.messages, /* tools= */ nullptr, inputs.add_generation_prompt, {
+    data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ json(), json {
         {"datetime", format_time(inputs.now, "%b %d %Y %H:%M:%S GMT")},
         {"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
     });
@@ -1339,7 +1348,7 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
     // Using ">>>f1\n", ">>>f2\n"... as trigger words for the grammar
     // If the function is python, we also allow raw python code (if the line after `python\n` doesn't start w/ opening `{`), which the model seems to prefer for multiline code.
     common_chat_params data;
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.prompt = apply(tmpl, inputs);
     data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2;
     if (inputs.tools.is_array() && !inputs.tools.empty()) {
         data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
@@ -1466,7 +1475,7 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con
         data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
     }
 
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.prompt = apply(tmpl, inputs);
     // TODO: if (has_raw_python)
     return data;
 }
@@ -1499,11 +1508,9 @@ static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser
 
 static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
-    json additional_context = {
+    data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, json {
         {"enable_thinking", inputs.enable_thinking},
-    };
-
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, additional_context);
+    });
     data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
     if (string_ends_with(data.prompt, "<think>\n")) {
         if (!inputs.enable_thinking) {
@@ -1692,7 +1699,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
 
 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, inputs.extra_context);
+    data.prompt = apply(tmpl, inputs);
     data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
     data.grammar_lazy = false;
     if (!inputs.json_schema.is_null()) {
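The refactored `apply` merges any per-format `additional_context` on top of the user-supplied `extra_context` via `merge_patch`, which follows JSON Merge Patch (RFC 7386) semantics: keys present in the patch win, and `null` values delete keys. An illustration with made-up values:

```json
{
  "extra_context_before": { "enable_thinking": false, "custom_flag": 1 },
  "additional_context":   { "date_string": "26 May 2025" },
  "extra_context_after":  { "enable_thinking": false, "custom_flag": 1, "date_string": "26 May 2025" }
}
```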
From 74f6060ce42c53d1f5bf2a4778618f9f8e5f43de Mon Sep 17 00:00:00 2001
From: matteo serva
Date: Tue, 27 May 2025 08:39:58 +0200
Subject: [PATCH 12/16] normalize environment variable name

---
 common/arg.cpp         | 2 +-
 tools/server/README.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 4b1f627247ce1..92e1473431cbc 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2799,7 +2799,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 params.default_template_kwargs[item.key()] = item.value().dump();
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("CHAT_TEMPLATE_KWARGS"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
     add_opt(common_arg(
         {"-to", "--timeout"}, "N",
         string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
diff --git a/tools/server/README.md b/tools/server/README.md
index 26a9562cea941..ac849fe253de9 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -164,7 +164,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--api-key-file FNAME` | path to file containing API keys (default: none) |
 | `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key<br/>(env: LLAMA_ARG_SSL_KEY_FILE) |
 | `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
-| `--chat-template-kwargs STRING` | JSON object containing additional params for the chat template parser. Example: `--chat-template-kwargs "{\"enable_thinking\":false}"`<br/>(env: CHAT_TEMPLATE_KWARGS) |
+| `--chat-template-kwargs STRING` | JSON object containing additional params for the chat template parser. Example: `--chat-template-kwargs "{\"enable_thinking\":false}"`<br/>(env: LLAMA_CHAT_TEMPLATE_KWARGS) |
 | `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
 | `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
 | `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>[(card)](https://ggml.ai/f0.png)<br/>(env: LLAMA_ARG_CACHE_REUSE) |

From cdc3cbe0066e8f34e0093fb79f04aad66e3dfcbe Mon Sep 17 00:00:00 2001
From: matteo serva
Date: Tue, 27 May 2025 08:43:30 +0200
Subject: [PATCH 13/16] simplify code

---
 tools/server/utils.hpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp
index 4181e00752974..27720ec937f81 100644
--- a/tools/server/utils.hpp
+++ b/tools/server/utils.hpp
@@ -749,10 +749,9 @@ static json oaicompat_chat_params_parse(
         llama_params["parse_tool_calls"] = true;
     }
 
+    // merge the template args provided from command line with the args provided in the user request
     auto chat_template_kwargs_object = json_value(body, "chat_template_kwargs", json::object());
-    for (const auto & item : opt.chat_template_kwargs) {
-        inputs.chat_template_kwargs[item.first] = item.second;
-    }
+    inputs.chat_template_kwargs = opt.chat_template_kwargs;
     for (const auto & item : chat_template_kwargs_object.items()) {
         inputs.chat_template_kwargs[item.key()] = item.value().dump();
     }

From 226e37d8e023933ad0d7abf390eb8c3ab9a0afee Mon Sep 17 00:00:00 2001
From: matteo serva
Date: Tue, 27 May 2025 08:44:26 +0200
Subject: [PATCH 14/16] prefill cannot be used with thinking models

---
 tools/server/utils.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp
index 27720ec937f81..955ef73588916 100644
--- a/tools/server/utils.hpp
+++ b/tools/server/utils.hpp
@@ -772,7 +772,7 @@ static json oaicompat_chat_params_parse(
         /* TODO: test this properly */
         inputs.reasoning_format = COMMON_REASONING_FORMAT_NONE;
 
-        if (inputs.chat_template_kwargs.find("enable_thinking") != inputs.chat_template_kwargs.end()) {
+        if (inputs.enable_thinking || inputs.chat_template_kwargs.find("enable_thinking") != inputs.chat_template_kwargs.end()) {
             throw std::runtime_error("Assistant response prefill is incompatible with enable_thinking.");
         }
 

From 4e1c329d0202af0d9ddbcf8a393ed57820f2bc74 Mon Sep 17 00:00:00 2001
From: matteo serva
Date: Tue, 27 May 2025 09:28:27 +0200
Subject: [PATCH 15/16] compatibility with the new reasoning-budget parameter

---
 common/chat.cpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/common/chat.cpp b/common/chat.cpp
index 4b093b2eaa6da..4c298ef083607 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -1508,12 +1508,15 @@ static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser
 
 static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
-    data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, json {
+    json extra_context = json {
         {"enable_thinking", inputs.enable_thinking},
-    });
+    };
+    extra_context.update(inputs.extra_context);
+
+    data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, extra_context);
     data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
     if (string_ends_with(data.prompt, "<think>\n")) {
-        if (!inputs.enable_thinking) {
+        if (!extra_context["enable_thinking"]) {
             data.prompt += "</think>";
         } else {
             data.thinking_forced_open = true;
@@ -1731,6 +1734,7 @@ static common_chat_params common_chat_templates_apply_jinja(
     params.grammar = inputs.grammar;
     params.now = inputs.now;
 
+    params.extra_context = json::object();
     for (auto el : inputs.chat_template_kwargs) {
         params.extra_context[el.first] = json::parse(el.second);
     }
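With patch 15, a request-supplied `enable_thinking` overrides the default for Hermes-style templates, because `extra_context.update(inputs.extra_context)` runs after the default is inserted. A hypothetical request that turns thinking off for a single call:

```sh
curl http://localhost:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "messages": [{"role": "user", "content": "What is 2+2?"}],
        "chat_template_kwargs": {"enable_thinking": false}
    }'
```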
From a056e536a40f5bfeeef0c613b0d69eaef7e1918c Mon Sep 17 00:00:00 2001
From: matteo serva
Date: Sun, 8 Jun 2025 16:47:27 +0200
Subject: [PATCH 16/16] fix prefill for non thinking models

---
 tools/server/utils.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp
index 955ef73588916..c4a058834021a 100644
--- a/tools/server/utils.hpp
+++ b/tools/server/utils.hpp
@@ -772,7 +772,7 @@ static json oaicompat_chat_params_parse(
         /* TODO: test this properly */
         inputs.reasoning_format = COMMON_REASONING_FORMAT_NONE;
 
-        if (inputs.enable_thinking || inputs.chat_template_kwargs.find("enable_thinking") != inputs.chat_template_kwargs.end()) {
+        if (!inputs.enable_thinking || inputs.chat_template_kwargs.find("enable_thinking") != inputs.chat_template_kwargs.end()) {
             throw std::runtime_error("Assistant response prefill is incompatible with enable_thinking.");
         }
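In the final state of the series, passing `enable_thinking` explicitly via `chat_template_kwargs` (in either direction) makes assistant-message prefill an error, per the condition above. An illustrative request that should now fail rather than complete:

```sh
curl http://localhost:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "messages": [
            {"role": "user", "content": "Write a haiku."},
            {"role": "assistant", "content": "Here is my haiku:"}
        ],
        "chat_template_kwargs": {"enable_thinking": true}
    }'
# expected: HTTP error containing
# "Assistant response prefill is incompatible with enable_thinking."
```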