xserv-chat: fix unclosed <think> on early termination and flush analysis tokens
Close the <think> block when EOS or max_tokens interrupts an analysis channel, and flush stdout after each analysis token so that --think output streams smoothly instead of arriving in buffer-sized chunks.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -735,6 +735,7 @@ fn generate_with_paged_cache(
|
||||
let mut next = pick(&logits, sampling, &history);
|
||||
let mut decode_buffer = Vec::new();
|
||||
let mut in_thinking = false;
|
||||
let show_thinking = is_moe && enable_thinking;
|
||||
// Visible answer tokens, returned for multi-turn history. For moe this is
|
||||
// the final-channel content only (analysis is suppressed/gray); for Qwen3
|
||||
// it is everything printed. The caller decodes these into the assistant
|
||||
@@ -752,6 +753,9 @@ fn generate_with_paged_cache(
|
||||
in_thinking,
|
||||
use_color,
|
||||
);
|
||||
if show_thinking && in_thinking {
|
||||
print_stream_text("\n</think>\n\n", true, use_color);
|
||||
}
|
||||
io::stdout().flush().unwrap();
|
||||
return (Finish::Stop { token_id: next }, tokenizer.decode(&answer_ids));
|
||||
}
|
||||
@@ -768,7 +772,7 @@ fn generate_with_paged_cache(
|
||||
}
|
||||
// Closing a thinking (analysis/commentary) channel: emit the </think>
|
||||
// marker so it renders like Qwen3's thinking block.
|
||||
if is_moe && enable_thinking && hstate == HarmonyState::InAnalysis {
|
||||
if show_thinking && hstate == HarmonyState::InAnalysis {
|
||||
print_stream_text("\n</think>\n\n", true, use_color);
|
||||
in_thinking = false;
|
||||
}
|
||||
@@ -804,9 +808,9 @@ fn generate_with_paged_cache(
|
||||
} else {
|
||||
hstate = HarmonyState::InAnalysis;
|
||||
// Open a Qwen3-style thinking block for the analysis channel.
|
||||
if is_moe && enable_thinking {
|
||||
if show_thinking {
|
||||
print_stream_text("<think>\n", true, use_color);
|
||||
in_thinking = use_color; // render analysis content as gray
|
||||
in_thinking = true;
|
||||
}
|
||||
}
|
||||
next = pick(&logits, sampling, &history);
|
||||
@@ -819,8 +823,9 @@ fn generate_with_paged_cache(
|
||||
if hstate == HarmonyState::InAnalysis {
|
||||
// Analysis channel = the model's reasoning. With --think, show it as a
|
||||
// thinking block (gray if color); otherwise suppress it (answer only).
|
||||
if is_moe && enable_thinking {
|
||||
if show_thinking {
|
||||
print_generated_token(tokenizer, next, &mut decode_buffer, &mut in_thinking, use_color);
|
||||
io::stdout().flush().unwrap();
|
||||
}
|
||||
next = pick(&logits, sampling, &history);
|
||||
continue;
|
||||
@@ -851,6 +856,9 @@ fn generate_with_paged_cache(
|
||||
in_thinking,
|
||||
use_color,
|
||||
);
|
||||
if show_thinking && in_thinking {
|
||||
print_stream_text("\n</think>\n\n", true, use_color);
|
||||
}
|
||||
io::stdout().flush().unwrap();
|
||||
(Finish::Length, tokenizer.decode(&answer_ids))
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user