Commit c7f460a

Authored by ochafik and ggerganov

server: fix tool-call of DeepSeek R1 Qwen, return reasoning_content (Command R7B & DeepSeek R1) unless --reasoning-format none (ggml-org#11607)

* extract & return thoughts in the `reasoning_content` field (unless `--reasoning-format none`) for DeepSeek R1 & Command R7B
* tool-calls: add DeepSeek R1 template (models/templates/llama-cpp-deepseek-r1.jinja) + hack around the broken official template
* tool-calls: accommodate the variety of wrong tool-call opening tags that both the R1 Qwen 32B and 7B distills like to emit
* server/oai: ensure content is null when there are tool calls, and that reasoning_content appears before content for readability
* tool-calls: add DeepSeek R1 Qwen distills to server/README.md & server tests

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

1 parent 27e8a23 commit c7f460a

File tree

17 files changed: +1024 −317 lines

common/arg.cpp

Lines changed: 11 additions & 0 deletions

```diff
@@ -1982,6 +1982,17 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.use_jinja = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
+    add_opt(common_arg(
+        {"--reasoning-format"}, "FORMAT",
+        "reasoning format (default: deepseek; allowed values: deepseek, none)\n"
+        "controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).\n"
+        "only supported for non-streamed responses",
+        [](common_params & params, const std::string & value) {
+            /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
+            else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
+            else { throw std::invalid_argument("invalid value"); }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
     add_opt(common_arg(
         {"--chat-template"}, "JINJA_TEMPLATE",
         string_format(
```
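With the default (`deepseek`), the server parses thought tags out of the reply before returning it; passing `--reasoning-format none` (or, per the `set_env` call above, setting the `LLAMA_ARG_THINK` environment variable) leaves them unparsed in `message.content`, e.g. `llama-server --jinja --reasoning-format none`. As the help text notes, extraction is only supported for non-streamed responses.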

common/chat.cpp

Lines changed: 219 additions & 100 deletions (large diff, not rendered by default)

common/chat.hpp

Lines changed: 3 additions & 0 deletions

```diff
@@ -19,6 +19,7 @@ struct common_chat_inputs {
     bool stream;
     std::string grammar;
     bool add_generation_prompt = true;
+    bool extract_reasoning = true;
 };
 
 enum common_chat_format {
@@ -28,11 +29,13 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_LLAMA_3_X,
     COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
     COMMON_CHAT_FORMAT_DEEPSEEK_R1,
+    COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING,
     COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
     COMMON_CHAT_FORMAT_HERMES_2_PRO,
     COMMON_CHAT_FORMAT_COMMAND_R7B,
+    COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,
 
     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
```
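Each reasoning-capable format now comes in two flavors, with the `_EXTRACT_REASONING` variant chosen when the new `extract_reasoning` input is set. The selection logic lives in common/chat.cpp, whose diff is not rendered above, so the following is only a sketch of the pattern, under that assumption:

```cpp
// Sketch (assumed, not the actual common/chat.cpp code): pick the DeepSeek R1
// format variant depending on whether thoughts should be parsed out into
// message.reasoning_content or left inline in message.content.
static common_chat_format pick_deepseek_r1_format(const common_chat_inputs & inputs) {
    return inputs.extract_reasoning
        ? COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING
        : COMMON_CHAT_FORMAT_DEEPSEEK_R1;
}
```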

common/common.h

Lines changed: 7 additions & 1 deletion

```diff
@@ -203,6 +203,11 @@ struct common_params_vocoder {
     bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
 };
 
+enum common_reasoning_format {
+    COMMON_REASONING_FORMAT_NONE,
+    COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
+};
+
 struct common_params {
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 4096; // context size
@@ -347,6 +352,7 @@ struct common_params {
     std::string chat_template = ""; // NOLINT
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
 
     std::vector<std::string> api_keys;
 
@@ -624,7 +630,7 @@ struct common_chat_msg {
     std::string role;
     std::string content;
     std::vector<common_tool_call> tool_calls;
-    std::string tool_plan = "";
+    std::string reasoning_content = "";
 };
 
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
```
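To make the new `reasoning_content` field concrete: for DeepSeek R1, the chat parser has to split a raw completion such as `<think>...</think>answer` into the two fields of `common_chat_msg`. Below is a self-contained, illustrative sketch; the real extraction lives in common/chat.cpp (large diff, not shown) and additionally handles Command R7B's thinking markers and the malformed tags the R1 distills tend to emit:

```cpp
#include <string>

struct extracted_msg {
    std::string reasoning_content;
    std::string content;
};

// Illustrative only: split "<think>...</think>rest" the way the DeepSeek
// reasoning format does. With COMMON_REASONING_FORMAT_NONE no split happens,
// so the whole output stays in content.
static extracted_msg extract_think_tags(const std::string & output) {
    const std::string open  = "<think>";
    const std::string close = "</think>";
    const auto b = output.find(open);
    const auto e = output.find(close);
    if (b == std::string::npos || e == std::string::npos || e < b) {
        return { "", output };
    }
    return {
        output.substr(b + open.size(), e - (b + open.size())),
        output.substr(e + close.size()),
    };
}
```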

common/sampling.cpp

Lines changed: 6 additions & 6 deletions

```diff
@@ -151,12 +151,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
 
     lparams.no_perf = params.no_perf;
 
-    std::vector<const char *> trigger_words;
-    trigger_words.reserve(params.grammar_trigger_words.size());
-    for (const auto & str : params.grammar_trigger_words) {
-        trigger_words.push_back(str.word.c_str());
-    }
-
     struct llama_sampler * grmr;
     if (params.grammar.compare(0, 11, "%llguidance") == 0) {
 #ifdef LLAMA_USE_LLGUIDANCE
@@ -165,6 +159,12 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
         GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
 #endif // LLAMA_USE_LLGUIDANCE
     } else {
+        std::vector<const char *> trigger_words;
+        trigger_words.reserve(params.grammar_trigger_words.size());
+        for (const auto & str : params.grammar_trigger_words) {
+            trigger_words.push_back(str.word.c_str());
+        }
+
         grmr = params.grammar_lazy
             ? llama_sampler_init_grammar_lazy(vocab, params.grammar.c_str(), "root",
                 trigger_words.data(), trigger_words.size(),
```
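This is a pure scoping change: the llguidance branch never uses `trigger_words`, so the pointer vector is now built only in the `else` branch, right before the lazy-grammar sampler that actually consumes it.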

examples/server/README.md

Lines changed: 251 additions & 50 deletions (large diff, not rendered by default)

examples/server/server.cpp

Lines changed: 16 additions & 13 deletions

```diff
@@ -173,6 +173,7 @@ struct slot_params {
             {"grammar_trigger_words", grammar_trigger_words},
             {"grammar_trigger_tokens", sampling.grammar_trigger_tokens},
             {"preserved_tokens", sampling.preserved_tokens},
+            {"chat_format", common_chat_format_name(oaicompat_chat_format)},
             {"samplers", samplers},
             {"speculative.n_max", speculative.n_max},
             {"speculative.n_min", speculative.n_min},
@@ -724,9 +725,19 @@ struct server_task_result_cmpl_final : server_task_result {
             msg.content = content;
         }
 
-        json tool_calls;
+        json message {
+            {"role", "assistant"},
+        };
+        if (!msg.reasoning_content.empty()) {
+            message["reasoning_content"] = msg.reasoning_content;
+        }
+        if (msg.content.empty() && !msg.tool_calls.empty()) {
+            message["content"] = json();
+        } else {
+            message["content"] = msg.content;
+        }
         if (!msg.tool_calls.empty()) {
-            tool_calls = json::array();
+            auto tool_calls = json::array();
             for (const auto & tc : msg.tool_calls) {
                 tool_calls.push_back({
                     {"type", "function"},
@@ -737,15 +748,7 @@ struct server_task_result_cmpl_final : server_task_result {
                     {"id", tc.id},
                 });
             }
-        }
-
-        json message {
-            {"content", msg.content},
-            {"tool_calls", tool_calls},
-            {"role", "assistant"},
-        };
-        if (!msg.tool_plan.empty()) {
-            message["tool_plan"] = msg.tool_plan;
+            message["tool_calls"] = tool_calls;
         }
 
         json choice {
@@ -4060,7 +4063,7 @@ int main(int argc, char ** argv) {
         }
 
         auto body = json::parse(req.body);
-        json data = oaicompat_completion_params_parse(body, params.use_jinja, ctx_server.chat_templates);
+        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates);
 
         return handle_completions_impl(
             SERVER_TASK_TYPE_COMPLETION,
@@ -4073,7 +4076,7 @@ int main(int argc, char ** argv) {
     // same with handle_chat_completions, but without inference part
     const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
         auto body = json::parse(req.body);
-        json data = oaicompat_completion_params_parse(body, params.use_jinja, ctx_server.chat_templates);
+        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates);
         res_ok(res, {{ "prompt", std::move(data.at("prompt")) }});
     };
```
