diff --git a/common/arg.cpp b/common/arg.cpp index b6cb0aa9180c1..7e922452f21d8 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2790,6 +2790,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.ssl_file_cert = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE")); + add_opt(common_arg( + {"--chat-template-kwargs"}, "STRING", + string_format("sets additional params for the json template parser"), + [](common_params & params, const std::string & value) { + auto parsed = json::parse(value); + for (const auto & item : parsed.items()) { + params.default_template_kwargs[item.key()] = item.value().dump(); + } + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS")); add_opt(common_arg( {"-to", "--timeout"}, "N", string_format("server read/write timeout in seconds (default: %d)", params.timeout_read), diff --git a/common/chat.cpp b/common/chat.cpp index f1ab4c85a913e..3120abde4ffce 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -17,6 +17,8 @@ #include #include +using json = nlohmann::ordered_json; + static std::string format_time(const std::chrono::system_clock::time_point & now, const std::string & format) { auto time = std::chrono::system_clock::to_time_t(now); auto local_time = *std::localtime(&time); @@ -140,6 +142,7 @@ struct templates_params { bool add_generation_prompt = true; bool enable_thinking = true; std::chrono::system_clock::time_point now = std::chrono::system_clock::now(); + json extra_context; }; common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) { @@ -719,16 +722,23 @@ static void foreach_function(const json & tools, const std::function & messages_override = std::nullopt, + const std::optional & tools_override = std::nullopt, + const std::optional & additional_context = std::nullopt) { minja::chat_template_inputs tmpl_inputs; - tmpl_inputs.messages = messages; - tmpl_inputs.tools = tools; - tmpl_inputs.add_generation_prompt = add_generation_prompt; - tmpl_inputs.extra_context = extra_context; + tmpl_inputs.messages = messages_override ? *messages_override : inputs.messages; + if (tools_override) { + tmpl_inputs.tools = *tools_override; + } else { + tmpl_inputs.tools = inputs.tools.empty() ? json() : inputs.tools; + } + tmpl_inputs.add_generation_prompt = inputs.add_generation_prompt; + tmpl_inputs.extra_context = inputs.extra_context; + if (additional_context) { + tmpl_inputs.extra_context.merge_patch(*additional_context); + } // TODO: add flag to control date/time, if only for testing purposes. // tmpl_inputs.now = std::chrono::system_clock::now(); @@ -827,7 +837,7 @@ static common_chat_params common_chat_params_init_generic(const common_chat_temp inputs.messages, "Respond in JSON format, either with `tool_call` (a request to call tools) or with `response` reply to the user's request"); - data.prompt = apply(tmpl, tweaked_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); + data.prompt = apply(tmpl, inputs, /* messages_override= */ tweaked_messages); data.format = COMMON_CHAT_FORMAT_GENERIC; return data; } @@ -903,7 +913,7 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat data.preserved_tokens = { "[TOOL_CALLS]", }; - data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); + data.prompt = apply(tmpl, inputs); data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO; return data; } @@ -933,7 +943,7 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_ adjusted_messages.push_back(msg); } } - data.prompt = apply(tmpl, adjusted_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {}); + data.prompt = apply(tmpl, inputs, /* messages_override= */ adjusted_messages); data.format = COMMON_CHAT_FORMAT_COMMAND_R7B; if (string_ends_with(data.prompt, "<|START_THINKING|>")) { if (!inputs.enable_thinking) { @@ -1121,7 +1131,7 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te } else { data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY; } - data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, { + data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, json { {"date_string", format_time(inputs.now, "%d %b %Y")}, {"tools_in_user_message", false}, {"builtin_tools", builtin_tools.empty() ? json() : builtin_tools}, @@ -1186,7 +1196,7 @@ static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool w static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_template & tmpl, const struct templates_params & inputs) { common_chat_params data; - auto prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); + auto prompt = apply(tmpl, inputs); // Hacks to fix the official (broken) prompt. // It is advisable to use --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja instead, @@ -1281,7 +1291,7 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) { static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) { LOG_DBG("%s\n", __func__); common_chat_params data; - data.prompt = apply(tmpl, inputs.messages, /* tools= */ nullptr, inputs.add_generation_prompt, { + data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ json(), json { {"datetime", format_time(inputs.now, "%b %d %Y %H:%M:%S GMT")}, {"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))}, }); @@ -1337,7 +1347,7 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_ // Using ">>>f1\n", ">>>f2\n"... as trigger words for the grammar // If the function is python, we also allow raw python code (if the line after `python\n` doesn't start w/ opening `{`), which the model seems to prefer for multiline code. common_chat_params data; - data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); + data.prompt = apply(tmpl, inputs); data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2; if (inputs.tools.is_array() && !inputs.tools.empty()) { data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; @@ -1464,7 +1474,7 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY; } - data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); + data.prompt = apply(tmpl, inputs); // TODO: if (has_raw_python) return data; } @@ -1497,14 +1507,15 @@ static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) { common_chat_params data; - json additional_context = { + json extra_context = json { {"enable_thinking", inputs.enable_thinking}, }; + extra_context.update(inputs.extra_context); - data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, additional_context); + data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, extra_context); data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO; if (string_ends_with(data.prompt, "\n")) { - if (!inputs.enable_thinking) { + if (!extra_context["enable_thinking"]) { data.prompt += ""; } else { data.thinking_forced_open = true; @@ -1690,7 +1701,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) { static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) { common_chat_params data; - data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); + data.prompt = apply(tmpl, inputs); data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY; data.grammar_lazy = false; if (!inputs.json_schema.is_null()) { @@ -1721,6 +1732,12 @@ static common_chat_params common_chat_templates_apply_jinja( params.enable_thinking = inputs.enable_thinking; params.grammar = inputs.grammar; params.now = inputs.now; + + params.extra_context = json::object(); + for (auto el : inputs.chat_template_kwargs) { + params.extra_context[el.first] = json::parse(el.second); + } + if (!inputs.json_schema.empty()) { params.json_schema = json::parse(inputs.json_schema); } diff --git a/common/chat.h b/common/chat.h index f6b1d0ffcc989..cfb626f652a49 100644 --- a/common/chat.h +++ b/common/chat.h @@ -7,6 +7,7 @@ #include #include #include +#include struct common_chat_templates; @@ -125,6 +126,7 @@ struct common_chat_templates_inputs { common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; bool enable_thinking = true; std::chrono::system_clock::time_point now = std::chrono::system_clock::now(); + std::map chat_template_kwargs; }; struct common_chat_params { diff --git a/common/common.h b/common/common.h index cee1e3039cf9e..eef5298ddda59 100644 --- a/common/common.h +++ b/common/common.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #ifdef _WIN32 @@ -377,6 +378,8 @@ struct common_params { std::string ssl_file_key = ""; // NOLINT std::string ssl_file_cert = ""; // NOLINT + std::map default_template_kwargs; + // "advanced" endpoints are disabled by default for better security bool webui = true; bool endpoint_slots = false; diff --git a/tools/server/README.md b/tools/server/README.md index 06533c172e530..ac849fe253de9 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -164,6 +164,7 @@ The project is under active development, and we are [looking for feedback and co | `--api-key-file FNAME` | path to file containing API keys (default: none) | | `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key
(env: LLAMA_ARG_SSL_KEY_FILE) | | `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate
(env: LLAMA_ARG_SSL_CERT_FILE) | +| `--chat-template-kwargs STRING` | JSON object containing additional params for the json template parser. Example: `--chat_template_kwargs "{\"enable_thinking\":false}`"
(env: LLAMA_CHAT_TEMPLATE_KWARGS) | | `-to, --timeout N` | server read/write timeout in seconds (default: 600)
(env: LLAMA_ARG_TIMEOUT) | | `--threads-http N` | number of threads used to process HTTP requests (default: -1)
(env: LLAMA_ARG_THREADS_HTTP) | | `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)
[(card)](https://ggml.ai/f0.png)
(env: LLAMA_ARG_CACHE_REUSE) | @@ -1114,6 +1115,8 @@ See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}` or `{"type": "json_schema", "schema": {"properties": { "name": { "title": "Name", "type": "string" }, "date": { "title": "Date", "type": "string" }, "participants": { "items": {"type: "string" }, "title": "Participants", "type": "string" } } } }`), similar to other OpenAI-inspired API providers. +`chat_template_kwargs`: Allows sending additional parameters to the json templating system. For example: `{"enable_thinking": false}` + *Examples:* You can use either Python `openai` library with appropriate checkpoints: diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 90981ff9a5ef7..342fd2b4253a7 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -2085,6 +2085,7 @@ struct server_context { /* use_jinja */ params_base.use_jinja, /* prefill_assistant */ params_base.prefill_assistant, /* reasoning_format */ params_base.reasoning_format, + /* chat_template_kwargs */ params_base.default_template_kwargs, /* common_chat_templates */ chat_templates.get(), /* allow_image */ mctx ? mtmd_support_vision(mctx) : false, /* allow_audio */ mctx ? mtmd_support_audio (mctx) : false, diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index f3e0392a4e9d1..955ef73588916 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -571,6 +571,7 @@ struct oaicompat_parser_options { bool use_jinja; bool prefill_assistant; common_reasoning_format reasoning_format; + std::map chat_template_kwargs; common_chat_templates * tmpls; bool allow_image; bool allow_audio; @@ -748,6 +749,13 @@ static json oaicompat_chat_params_parse( llama_params["parse_tool_calls"] = true; } + // merge the template args provided from command line with the args provided in the user request + auto chat_template_kwargs_object = json_value(body, "chat_template_kwargs", json::object()); + inputs.chat_template_kwargs = opt.chat_template_kwargs; + for (const auto & item : chat_template_kwargs_object.items()) { + inputs.chat_template_kwargs[item.key()] = item.value().dump(); + } + // if the assistant message appears at the end of list, we do not add end-of-turn token // for ex. this can be useful to modify the reasoning process in reasoning models bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && opt.prefill_assistant; @@ -763,6 +771,11 @@ static json oaicompat_chat_params_parse( /* TODO: test this properly */ inputs.reasoning_format = COMMON_REASONING_FORMAT_NONE; + + if (inputs.enable_thinking || inputs.chat_template_kwargs.find("enable_thinking") != inputs.chat_template_kwargs.end()) { + throw std::runtime_error("Assistant response prefill is incompatible with enable_thinking."); + } + inputs.add_generation_prompt = true; }