diff --git a/crates/xserv-model/src/bin/xserv-chat.rs b/crates/xserv-model/src/bin/xserv-chat.rs index 4dbe105..71a6fe3 100644 --- a/crates/xserv-model/src/bin/xserv-chat.rs +++ b/crates/xserv-model/src/bin/xserv-chat.rs @@ -385,7 +385,7 @@ fn main() { io::stdout().flush().unwrap(); let (_finish, answer) = generate_with_paged_cache( &model, &mut cache, &tokenizer, &prompt_tokens, &opts.sampling, - max_new_tokens, use_color, &tp_handle, is_moe, + max_new_tokens, use_color, &tp_handle, is_moe, opts.enable_thinking, ); moe_history.push((input.to_string(), answer)); println!(); @@ -429,6 +429,7 @@ fn main() { use_color, &tp_handle, is_moe, + opts.enable_thinking, ); match finish { Finish::Stop { token_id } => { @@ -688,6 +689,7 @@ fn generate_with_paged_cache( use_color: bool, tp: &Option, is_moe: bool, + enable_thinking: bool, ) -> (Finish, String) { let harmony_end_id = if is_moe { tokenizer.special_token_id("<|end|>") } else { None }; let harmony_channel_id = if is_moe { tokenizer.special_token_id("<|channel|>") } else { None }; @@ -764,6 +766,12 @@ fn generate_with_paged_cache( io::stdout().flush().unwrap(); return (Finish::Stop { token_id: next }, tokenizer.decode(&answer_ids)); } + // Closing a thinking (analysis/commentary) channel: emit the + // marker so it renders like Qwen3's thinking block. + if is_moe && enable_thinking && hstate == HarmonyState::InAnalysis { + print_stream_text("\n\n\n", true, use_color); + in_thinking = false; + } hstate = HarmonyState::Normal; next = pick(&logits, sampling, &history); continue; @@ -795,7 +803,11 @@ fn generate_with_paged_cache( in_thinking = false; } else { hstate = HarmonyState::InAnalysis; - in_thinking = use_color; // render analysis as gray + // Open a Qwen3-style thinking block for the analysis channel. + if is_moe && enable_thinking { + print_stream_text("\n", true, use_color); + in_thinking = use_color; // render analysis content as gray + } } next = pick(&logits, sampling, &history); continue; @@ -805,8 +817,9 @@ fn generate_with_paged_cache( continue; } if hstate == HarmonyState::InAnalysis { - // Analysis channel: render as thinking (gray) if color enabled, skip if not - if use_color { + // Analysis channel = the model's reasoning. With --think, show it as a + // thinking block (gray if color); otherwise suppress it (answer only). + if is_moe && enable_thinking { print_generated_token(tokenizer, next, &mut decode_buffer, &mut in_thinking, use_color); } next = pick(&logits, sampling, &history);