xserv-chat: render gpt-oss analysis as a Qwen3-style <think> block
The gpt-oss harmony `analysis` channel is the model's reasoning, analogous to Qwen3's <think>. With --think, wrap it in a `<think>\n…\n</think>\n\n` block (gray when color is on, like Qwen3) and then print the final-channel answer; without --think, suppress the analysis and show only the answer. This replaces the previous color-gated behavior (analysis shown in gray only on a TTY, with no markers). Analysis is still excluded from the multi-turn history (answer_ids), so re-prefill drops the chain-of-thought (CoT) as before.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -385,7 +385,7 @@ fn main() {
|
||||
io::stdout().flush().unwrap();
|
||||
let (_finish, answer) = generate_with_paged_cache(
|
||||
&model, &mut cache, &tokenizer, &prompt_tokens, &opts.sampling,
|
||||
max_new_tokens, use_color, &tp_handle, is_moe,
|
||||
max_new_tokens, use_color, &tp_handle, is_moe, opts.enable_thinking,
|
||||
);
|
||||
moe_history.push((input.to_string(), answer));
|
||||
println!();
|
||||
@@ -429,6 +429,7 @@ fn main() {
|
||||
use_color,
|
||||
&tp_handle,
|
||||
is_moe,
|
||||
opts.enable_thinking,
|
||||
);
|
||||
match finish {
|
||||
Finish::Stop { token_id } => {
|
||||
@@ -688,6 +689,7 @@ fn generate_with_paged_cache(
|
||||
use_color: bool,
|
||||
tp: &Option<TpHandle>,
|
||||
is_moe: bool,
|
||||
enable_thinking: bool,
|
||||
) -> (Finish, String) {
|
||||
let harmony_end_id = if is_moe { tokenizer.special_token_id("<|end|>") } else { None };
|
||||
let harmony_channel_id = if is_moe { tokenizer.special_token_id("<|channel|>") } else { None };
|
||||
@@ -764,6 +766,12 @@ fn generate_with_paged_cache(
|
||||
io::stdout().flush().unwrap();
|
||||
return (Finish::Stop { token_id: next }, tokenizer.decode(&answer_ids));
|
||||
}
|
||||
// Closing a thinking (analysis/commentary) channel: emit the </think>
|
||||
// marker so it renders like Qwen3's thinking block.
|
||||
if is_moe && enable_thinking && hstate == HarmonyState::InAnalysis {
|
||||
print_stream_text("\n</think>\n\n", true, use_color);
|
||||
in_thinking = false;
|
||||
}
|
||||
hstate = HarmonyState::Normal;
|
||||
next = pick(&logits, sampling, &history);
|
||||
continue;
|
||||
@@ -795,7 +803,11 @@ fn generate_with_paged_cache(
|
||||
in_thinking = false;
|
||||
} else {
|
||||
hstate = HarmonyState::InAnalysis;
|
||||
in_thinking = use_color; // render analysis as gray
|
||||
// Open a Qwen3-style thinking block for the analysis channel.
|
||||
if is_moe && enable_thinking {
|
||||
print_stream_text("<think>\n", true, use_color);
|
||||
in_thinking = use_color; // render analysis content as gray
|
||||
}
|
||||
}
|
||||
next = pick(&logits, sampling, &history);
|
||||
continue;
|
||||
@@ -805,8 +817,9 @@ fn generate_with_paged_cache(
|
||||
continue;
|
||||
}
|
||||
if hstate == HarmonyState::InAnalysis {
|
||||
// Analysis channel: render as thinking (gray) if color enabled, skip if not
|
||||
if use_color {
|
||||
// Analysis channel = the model's reasoning. With --think, show it as a
|
||||
// thinking block (gray if color); otherwise suppress it (answer only).
|
||||
if is_moe && enable_thinking {
|
||||
print_generated_token(tokenizer, next, &mut decode_buffer, &mut in_thinking, use_color);
|
||||
}
|
||||
next = pick(&logits, sampling, &history);
|
||||
|
||||
Reference in New Issue
Block a user