Commit c7f460a

Authored by ochafik and ggerganov

server: fix tool-call of DeepSeek R1 Qwen, return reasoning_content (Command R7B & DeepSeek R1) unless --reasoning-format none (ggml-org#11607)

* extract & return thoughts in the `reasoning_content` field (unless `--reasoning-format none`) for DeepSeek R1 & Command R7B
* tool-calls: add DeepSeek R1 template (models/templates/llama-cpp-deepseek-r1.jinja) + hack around the broken official template
* tool-calls: accommodate the variety of wrong tool-call opening tags that both the R1 Qwen 32B and 7B distills like to emit
* server/oai: ensure content is null when there are tool calls, and that reasoning_content appears before content for readability
* tool-calls: add DeepSeek R1 Qwen distills to server/README.md & server tests

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

1 parent 27e8a23 commit c7f460a

File tree

17 files changed: +1024 −317 lines

common/arg.cpp

Lines changed: 11 additions & 0 deletions

```diff
@@ -1982,6 +1982,17 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.use_jinja = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
+    add_opt(common_arg(
+        {"--reasoning-format"}, "FORMAT",
+        "reasoning format (default: deepseek; allowed values: deepseek, none)\n"
+        "controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).\n"
+        "only supported for non-streamed responses",
+        [](common_params & params, const std::string & value) {
+            /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
+            else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
+            else { throw std::invalid_argument("invalid value"); }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
     add_opt(common_arg(
         {"--chat-template"}, "JINJA_TEMPLATE",
         string_format(
```
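With the default (`deepseek`), the server parses thought tags out of the reply before returning it; passing `--reasoning-format none` (or, per the `set_env` call above, setting the `LLAMA_ARG_THINK` environment variable) leaves them unparsed in `message.content`, e.g. `llama-server --jinja --reasoning-format none`. As the help text notes, extraction is only supported for non-streamed responses.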

common/chat.cpp

Lines changed: 219 additions & 100 deletions (large diff, not rendered by default)

common/chat.hpp

Lines changed: 3 additions & 0 deletions

```diff
@@ -19,6 +19,7 @@ struct common_chat_inputs {
     bool stream;
     std::string grammar;
     bool add_generation_prompt = true;
+    bool extract_reasoning = true;
 };
 
 enum common_chat_format {
@@ -28,11 +29,13 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_LLAMA_3_X,
     COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
     COMMON_CHAT_FORMAT_DEEPSEEK_R1,
+    COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING,
     COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
     COMMON_CHAT_FORMAT_HERMES_2_PRO,
     COMMON_CHAT_FORMAT_COMMAND_R7B,
+    COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,
 
     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
```
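Each reasoning-capable format now comes in two flavors, with the `_EXTRACT_REASONING` variant chosen when the new `extract_reasoning` input is set. The selection logic lives in common/chat.cpp, whose diff is not rendered above, so the following is only a sketch of the pattern, under that assumption:

```cpp
// Sketch (assumed, not the actual common/chat.cpp code): pick the DeepSeek R1
// format variant depending on whether thoughts should be parsed out into
// message.reasoning_content or left inline in message.content.
static common_chat_format pick_deepseek_r1_format(const common_chat_inputs & inputs) {
    return inputs.extract_reasoning
        ? COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING
        : COMMON_CHAT_FORMAT_DEEPSEEK_R1;
}
```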

common/common.h

Lines changed: 7 additions & 1 deletion

```diff
@@ -203,6 +203,11 @@ struct common_params_vocoder {
     bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
 };
 
+enum common_reasoning_format {
+    COMMON_REASONING_FORMAT_NONE,
+    COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
+};
+
 struct common_params {
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 4096; // context size
@@ -347,6 +352,7 @@ struct common_params {
     std::string chat_template = ""; // NOLINT
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
 
     std::vector<std::string> api_keys;
 
@@ -624,7 +630,7 @@ struct common_chat_msg {
     std::string role;
     std::string content;
     std::vector<common_tool_call> tool_calls;
-    std::string tool_plan = "";
+    std::string reasoning_content = "";
 };
 
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
```
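To make the new `reasoning_content` field concrete: for DeepSeek R1, the chat parser has to split a raw completion such as `<think>...</think>answer` into the two fields of `common_chat_msg`. Below is a self-contained, illustrative sketch; the real extraction lives in common/chat.cpp (large diff, not shown) and additionally handles Command R7B's thinking markers and the malformed tags the R1 distills tend to emit:

```cpp
#include <string>

struct extracted_msg {
    std::string reasoning_content;
    std::string content;
};

// Illustrative only: split "<think>...</think>rest" the way the DeepSeek
// reasoning format does. With COMMON_REASONING_FORMAT_NONE no split happens,
// so the whole output stays in content.
static extracted_msg extract_think_tags(const std::string & output) {
    const std::string open  = "<think>";
    const std::string close = "</think>";
    const auto b = output.find(open);
    const auto e = output.find(close);
    if (b == std::string::npos || e == std::string::npos || e < b) {
        return { "", output };
    }
    return {
        output.substr(b + open.size(), e - (b + open.size())),
        output.substr(e + close.size()),
    };
}
```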

common/sampling.cpp

Lines changed: 6 additions & 6 deletions

```diff
@@ -151,12 +151,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
 
     lparams.no_perf = params.no_perf;
 
-    std::vector<const char *> trigger_words;
-    trigger_words.reserve(params.grammar_trigger_words.size());
-    for (const auto & str : params.grammar_trigger_words) {
-        trigger_words.push_back(str.word.c_str());
-    }
-
     struct llama_sampler * grmr;
     if (params.grammar.compare(0, 11, "%llguidance") == 0) {
 #ifdef LLAMA_USE_LLGUIDANCE
@@ -165,6 +159,12 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
         GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
 #endif // LLAMA_USE_LLGUIDANCE
     } else {
+        std::vector<const char *> trigger_words;
+        trigger_words.reserve(params.grammar_trigger_words.size());
+        for (const auto & str : params.grammar_trigger_words) {
+            trigger_words.push_back(str.word.c_str());
+        }
+
         grmr = params.grammar_lazy
             ? llama_sampler_init_grammar_lazy(vocab, params.grammar.c_str(), "root",
                 trigger_words.data(), trigger_words.size(),
```
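This is a pure scoping change: the llguidance branch never uses `trigger_words`, so the pointer vector is now built only in the `else` branch, right before the lazy-grammar sampler that actually consumes it.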

examples/server/README.md

Lines changed: 251 additions & 50 deletions (large diff, not rendered by default)

examples/server/server.cpp

Lines changed: 16 additions & 13 deletions

```diff
@@ -173,6 +173,7 @@ struct slot_params {
             {"grammar_trigger_words", grammar_trigger_words},
             {"grammar_trigger_tokens", sampling.grammar_trigger_tokens},
             {"preserved_tokens", sampling.preserved_tokens},
+            {"chat_format", common_chat_format_name(oaicompat_chat_format)},
             {"samplers", samplers},
             {"speculative.n_max", speculative.n_max},
             {"speculative.n_min", speculative.n_min},
@@ -724,9 +725,19 @@ struct server_task_result_cmpl_final : server_task_result {
             msg.content = content;
         }
 
-        json tool_calls;
+        json message {
+            {"role", "assistant"},
+        };
+        if (!msg.reasoning_content.empty()) {
+            message["reasoning_content"] = msg.reasoning_content;
+        }
+        if (msg.content.empty() && !msg.tool_calls.empty()) {
+            message["content"] = json();
+        } else {
+            message["content"] = msg.content;
+        }
         if (!msg.tool_calls.empty()) {
-            tool_calls = json::array();
+            auto tool_calls = json::array();
             for (const auto & tc : msg.tool_calls) {
                 tool_calls.push_back({
                     {"type", "function"},
@@ -737,15 +748,7 @@ struct server_task_result_cmpl_final : server_task_result {
                     {"id", tc.id},
                 });
             }
-        }
-
-        json message {
-            {"content", msg.content},
-            {"tool_calls", tool_calls},
-            {"role", "assistant"},
-        };
-        if (!msg.tool_plan.empty()) {
-            message["tool_plan"] = msg.tool_plan;
+            message["tool_calls"] = tool_calls;
         }
 
         json choice {
@@ -4060,7 +4063,7 @@ int main(int argc, char ** argv) {
         }
 
         auto body = json::parse(req.body);
-        json data = oaicompat_completion_params_parse(body, params.use_jinja, ctx_server.chat_templates);
+        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates);
 
         return handle_completions_impl(
             SERVER_TASK_TYPE_COMPLETION,
@@ -4073,7 +4076,7 @@ int main(int argc, char ** argv) {
     // same with handle_chat_completions, but without inference part
     const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
         auto body = json::parse(req.body);
-        json data = oaicompat_completion_params_parse(body, params.use_jinja, ctx_server.chat_templates);
+        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates);
         res_ok(res, {{ "prompt", std::move(data.at("prompt")) }});
     };
```
