xserv-chat: render gpt-oss analysis as a Qwen3-style <think> block

The gpt-oss harmony `analysis` channel is the model's reasoning, analogous
to Qwen3's <think>. With --think, wrap it in a `<think>\n…\n</think>\n\n`
block (gray when color is on, like Qwen3) and then print the final-channel
answer; without --think, suppress the analysis and show only the answer.
This replaces the previous color-gated behavior (analysis shown in gray only
on a TTY, with no markers). Analysis is still excluded from the multi-turn
history (answer_ids), so re-prefill drops the chain-of-thought (CoT) as before.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-02 21:37:28 +08:00
parent 3b9e32e6cd
commit 34e9bee375

View File

@@ -385,7 +385,7 @@ fn main() {
io::stdout().flush().unwrap();
let (_finish, answer) = generate_with_paged_cache(
&model, &mut cache, &tokenizer, &prompt_tokens, &opts.sampling,
max_new_tokens, use_color, &tp_handle, is_moe,
max_new_tokens, use_color, &tp_handle, is_moe, opts.enable_thinking,
);
moe_history.push((input.to_string(), answer));
println!();
@@ -429,6 +429,7 @@ fn main() {
use_color,
&tp_handle,
is_moe,
opts.enable_thinking,
);
match finish {
Finish::Stop { token_id } => {
@@ -688,6 +689,7 @@ fn generate_with_paged_cache(
use_color: bool,
tp: &Option<TpHandle>,
is_moe: bool,
enable_thinking: bool,
) -> (Finish, String) {
let harmony_end_id = if is_moe { tokenizer.special_token_id("<|end|>") } else { None };
let harmony_channel_id = if is_moe { tokenizer.special_token_id("<|channel|>") } else { None };
@@ -764,6 +766,12 @@ fn generate_with_paged_cache(
io::stdout().flush().unwrap();
return (Finish::Stop { token_id: next }, tokenizer.decode(&answer_ids));
}
// Closing a thinking (analysis/commentary) channel: emit the </think>
// marker so it renders like Qwen3's thinking block.
if is_moe && enable_thinking && hstate == HarmonyState::InAnalysis {
print_stream_text("\n</think>\n\n", true, use_color);
in_thinking = false;
}
hstate = HarmonyState::Normal;
next = pick(&logits, sampling, &history);
continue;
@@ -795,7 +803,11 @@ fn generate_with_paged_cache(
in_thinking = false;
} else {
hstate = HarmonyState::InAnalysis;
in_thinking = use_color; // render analysis as gray
// Open a Qwen3-style thinking block for the analysis channel.
if is_moe && enable_thinking {
print_stream_text("<think>\n", true, use_color);
in_thinking = use_color; // render analysis content as gray
}
}
next = pick(&logits, sampling, &history);
continue;
@@ -805,8 +817,9 @@ fn generate_with_paged_cache(
continue;
}
if hstate == HarmonyState::InAnalysis {
// Analysis channel: render as thinking (gray) if color enabled, skip if not
if use_color {
// Analysis channel = the model's reasoning. With --think, show it as a
// thinking block (gray if color); otherwise suppress it (answer only).
if is_moe && enable_thinking {
print_generated_token(tokenizer, next, &mut decode_buffer, &mut in_thinking, use_color);
}
next = pick(&logits, sampling, &history);