xserv-chat: render gpt-oss analysis as a Qwen3-style <think> block

The gpt-oss harmony `analysis` channel is the model's reasoning, analogous to Qwen3's <think>. With --think, wrap it in a `<think>\n…\n</think>\n\n` block (gray when color is on, like Qwen3) and then print the final-channel answer; without --think, suppress the analysis and show only the answer. Replaces the previous color-gated behavior (analysis shown gray only on a TTY, with no markers). Analysis is still excluded from the multi-turn history (answer_ids), so re-prefill drops CoT as before.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-02 21:37:28 +08:00
parent 3b9e32e6cd
commit 34e9bee375
1 changed files with 17 additions and 4 deletions
--- a/crates/xserv-model/src/bin/xserv-chat.rs
+++ b/crates/xserv-model/src/bin/xserv-chat.rs
@@ -385,7 +385,7 @@ fn main() {
            io::stdout().flush().unwrap();
            let (_finish, answer) = generate_with_paged_cache(
                &model, &mut cache, &tokenizer, &prompt_tokens, &opts.sampling,
-                max_new_tokens, use_color, &tp_handle, is_moe,
+                max_new_tokens, use_color, &tp_handle, is_moe, opts.enable_thinking,
            );
            moe_history.push((input.to_string(), answer));
            println!();
@@ -429,6 +429,7 @@ fn main() {
            use_color,
            &tp_handle,
            is_moe,
+            opts.enable_thinking,
        );
        match finish {
            Finish::Stop { token_id } => {
@@ -688,6 +689,7 @@ fn generate_with_paged_cache(
    use_color: bool,
    tp: &Option<TpHandle>,
    is_moe: bool,
+    enable_thinking: bool,
 ) -> (Finish, String) {
    let harmony_end_id = if is_moe { tokenizer.special_token_id("<|end|>") } else { None };
    let harmony_channel_id = if is_moe { tokenizer.special_token_id("<|channel|>") } else { None };
@@ -764,6 +766,12 @@ fn generate_with_paged_cache(
                io::stdout().flush().unwrap();
                return (Finish::Stop { token_id: next }, tokenizer.decode(&answer_ids));
            }
+            // Closing a thinking (analysis/commentary) channel: emit the </think>
+            // marker so it renders like Qwen3's thinking block.
+            if is_moe && enable_thinking && hstate == HarmonyState::InAnalysis {
+                print_stream_text("\n</think>\n\n", true, use_color);
+                in_thinking = false;
+            }
            hstate = HarmonyState::Normal;
            next = pick(&logits, sampling, &history);
            continue;
@@ -795,7 +803,11 @@ fn generate_with_paged_cache(
                in_thinking = false;
            } else {
                hstate = HarmonyState::InAnalysis;
-                in_thinking = use_color; // render analysis as gray
+                // Open a Qwen3-style thinking block for the analysis channel.
+                if is_moe && enable_thinking {
+                    print_stream_text("<think>\n", true, use_color);
+                    in_thinking = use_color; // render analysis content as gray
+                }
            }
            next = pick(&logits, sampling, &history);
            continue;
@@ -805,8 +817,9 @@ fn generate_with_paged_cache(
            continue;
        }
        if hstate == HarmonyState::InAnalysis {
-            // Analysis channel: render as thinking (gray) if color enabled, skip if not
-            if use_color {
+            // Analysis channel = the model's reasoning. With --think, show it as a
+            // thinking block (gray if color); otherwise suppress it (answer only).
+            if is_moe && enable_thinking {
                print_generated_token(tokenizer, next, &mut decode_buffer, &mut in_thinking, use_color);
            }
            next = pick(&logits, sampling, &history);