xserv-chat: fix unclosed <think> on early termination and flush analysis tokens

Emit the closing </think> marker when EOS or max_tokens ends generation
while the analysis channel is still open, and flush stdout after each
analysis token so --think output streams smoothly instead of arriving in
buffer-sized chunks.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-03 01:01:41 +08:00
parent 34e9bee375
commit e1eb77baa4

View File

@@ -735,6 +735,7 @@ fn generate_with_paged_cache(
let mut next = pick(&logits, sampling, &history);
let mut decode_buffer = Vec::new();
let mut in_thinking = false;
let show_thinking = is_moe && enable_thinking;
// Visible answer tokens, returned for multi-turn history. For moe this is
// the final-channel content only (analysis is suppressed/gray); for Qwen3
// it is everything printed. The caller decodes these into the assistant
@@ -752,6 +753,9 @@ fn generate_with_paged_cache(
in_thinking,
use_color,
);
if show_thinking && in_thinking {
print_stream_text("\n</think>\n\n", true, use_color);
}
io::stdout().flush().unwrap();
return (Finish::Stop { token_id: next }, tokenizer.decode(&answer_ids));
}
@@ -768,7 +772,7 @@ fn generate_with_paged_cache(
}
// Closing a thinking (analysis/commentary) channel: emit the </think>
// marker so it renders like Qwen3's thinking block.
if is_moe && enable_thinking && hstate == HarmonyState::InAnalysis {
if show_thinking && hstate == HarmonyState::InAnalysis {
print_stream_text("\n</think>\n\n", true, use_color);
in_thinking = false;
}
@@ -804,9 +808,9 @@ fn generate_with_paged_cache(
} else {
hstate = HarmonyState::InAnalysis;
// Open a Qwen3-style thinking block for the analysis channel.
if is_moe && enable_thinking {
if show_thinking {
print_stream_text("<think>\n", true, use_color);
in_thinking = use_color; // render analysis content as gray
in_thinking = true;
}
}
next = pick(&logits, sampling, &history);
@@ -819,8 +823,9 @@ fn generate_with_paged_cache(
if hstate == HarmonyState::InAnalysis {
// Analysis channel = the model's reasoning. With --think, show it as a
// thinking block (gray if color); otherwise suppress it (answer only).
if is_moe && enable_thinking {
if show_thinking {
print_generated_token(tokenizer, next, &mut decode_buffer, &mut in_thinking, use_color);
io::stdout().flush().unwrap();
}
next = pick(&logits, sampling, &history);
continue;
@@ -851,6 +856,9 @@ fn generate_with_paged_cache(
in_thinking,
use_color,
);
if show_thinking && in_thinking {
print_stream_text("\n</think>\n\n", true, use_color);
}
io::stdout().flush().unwrap();
(Finish::Length, tokenizer.decode(&answer_ids))
}