xserv-chat: fix unclosed <think> on early termination and flush analysis tokens

Close the <think> block when EOS or max_tokens interrupts an analysis channel, and flush stdout after each analysis token so --think streams smoothly instead of dumping in buffer-sized chunks.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-06-03 01:01:41 +08:00
parent 34e9bee375
commit e1eb77baa4
1 changed file with 12 additions and 4 deletions
--- a/crates/xserv-model/src/bin/xserv-chat.rs
+++ b/crates/xserv-model/src/bin/xserv-chat.rs
@@ -735,6 +735,7 @@ fn generate_with_paged_cache(
    let mut next = pick(&logits, sampling, &history);
    let mut decode_buffer = Vec::new();
    let mut in_thinking = false;
+    let show_thinking = is_moe && enable_thinking;
    // Visible answer tokens, returned for multi-turn history. For moe this is
    // the final-channel content only (analysis is suppressed/gray); for Qwen3
    // it is everything printed. The caller decodes these into the assistant
@@ -752,6 +753,9 @@ fn generate_with_paged_cache(
                in_thinking,
                use_color,
            );
+            if show_thinking && in_thinking {
+                print_stream_text("\n</think>\n\n", true, use_color);
+            }
            io::stdout().flush().unwrap();
            return (Finish::Stop { token_id: next }, tokenizer.decode(&answer_ids));
        }
@@ -768,7 +772,7 @@ fn generate_with_paged_cache(
            }
            // Closing a thinking (analysis/commentary) channel: emit the </think>
            // marker so it renders like Qwen3's thinking block.
-            if is_moe && enable_thinking && hstate == HarmonyState::InAnalysis {
+            if show_thinking && hstate == HarmonyState::InAnalysis {
                print_stream_text("\n</think>\n\n", true, use_color);
                in_thinking = false;
            }
@@ -804,9 +808,9 @@ fn generate_with_paged_cache(
            } else {
                hstate = HarmonyState::InAnalysis;
                // Open a Qwen3-style thinking block for the analysis channel.
-                if is_moe && enable_thinking {
+                if show_thinking {
                    print_stream_text("<think>\n", true, use_color);
-                    in_thinking = use_color; // render analysis content as gray
+                    in_thinking = true;
                }
            }
            next = pick(&logits, sampling, &history);
@@ -819,8 +823,9 @@ fn generate_with_paged_cache(
        if hstate == HarmonyState::InAnalysis {
            // Analysis channel = the model's reasoning. With --think, show it as a
            // thinking block (gray if color); otherwise suppress it (answer only).
-            if is_moe && enable_thinking {
+            if show_thinking {
                print_generated_token(tokenizer, next, &mut decode_buffer, &mut in_thinking, use_color);
+                io::stdout().flush().unwrap();
            }
            next = pick(&logits, sampling, &history);
            continue;
@@ -851,6 +856,9 @@ fn generate_with_paged_cache(
        in_thinking,
        use_color,
    );
+    if show_thinking && in_thinking {
+        print_stream_text("\n</think>\n\n", true, use_color);
+    }
    io::stdout().flush().unwrap();
    (Finish::Length, tokenizer.decode(&answer_ids))
 }