xserv-chat: fix gpt-oss harmony chat (canonical system prompt + routing)

The hand-rolled gpt-oss system message dropped the canonical harmony structure (identity / knowledge cutoff / current date / Reasoning level), putting the model out of distribution: greedy decoding then flipped into garbage or analysis loops on ~half of single-turn requests. Emit the canonical system message (matching the build_system_message macro in the model's chat_template.jinja) with "Reasoning: low", and add a today_ymd() helper that supplies the current date.

Also:
- Default the repetition penalty to off (1.0). Penalizing the harmony control tokens (<|channel|>/<|message|>/<|start|>), which must repeat to open the final channel, made gpt-oss stop right after analysis, emitting nothing.
- Suppress the literal "assistant" role header emitted between the analysis and final channels (output is printed only in the final channel; MoE only — non-MoE Qwen3 stays in Normal and prints as before).

Verified on dash5 (TP=2): single-turn "capital of France" is now stable across runs with a clean final answer; Qwen3 chat is unaffected.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-02 15:19:07 +08:00
parent f2e60218b4
commit 3d6bb1918e
1 changed file with 44 additions and 2 deletions
--- a/crates/xserv-model/src/bin/xserv-chat.rs
+++ b/crates/xserv-model/src/bin/xserv-chat.rs
@@ -582,8 +582,19 @@ fn build_turn_prompt_gpt_oss(
 ) -> String {
    let mut prompt = String::new();
    if include_system {
+        // Canonical harmony system message. gpt-oss was trained on this exact
+        // structure (identity / knowledge cutoff / current date / Reasoning
+        // level / channels — see the model's chat_template.jinja). A hand-rolled
+        // substitute puts the model out of distribution: channel selection
+        // destabilizes and greedy decoding flips into garbage or analysis loops
+        // that never reach the `final` channel. "Reasoning: low" keeps the
+        // analysis channel short for an interactive chat.
        prompt.push_str("<|start|>system<|message|>");
-        prompt.push_str("You are a helpful assistant.\n\n# Valid channels: analysis, commentary, final. Channel must be included for every message.");
+        prompt.push_str("You are ChatGPT, a large language model trained by OpenAI.\n");
+        prompt.push_str("Knowledge cutoff: 2024-06\n");
+        prompt.push_str(&format!("Current date: {}\n\n", today_ymd()));
+        prompt.push_str("Reasoning: low\n\n");
+        prompt.push_str("# Valid channels: analysis, commentary, final. Channel must be included for every message.");
        prompt.push_str("<|end|>");
        if let Some(sys) = system {
            if !sys.trim().is_empty() {
@@ -600,6 +611,24 @@ fn build_turn_prompt_gpt_oss(
    prompt
 }

+/// Current UTC date as "YYYY-MM-DD" for the harmony system message. Rata Die
+/// civil-calendar conversion (same algorithm the server uses for strftime_now).
+fn today_ymd() -> String {
+    use std::time::{SystemTime, UNIX_EPOCH};
+    let secs = SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_secs(); // NOTE(review): unwrap panics if the system clock is before 1970-01-01
+    let z = (secs / 86400) as i64 + 719468; // whole days since 0000-03-01 (719468 days precede 1970-01-01)
+    let era = (if z >= 0 { z } else { z - 146096 }) / 146097; // 400-year cycle index; the z < 0 arm is unreachable here (secs is unsigned)
+    let doe = z - era * 146097; // day within the 400-year cycle, 0..=146096
+    let yoe = (doe - doe / 1460 + doe / 36524 - doe / 146096) / 365; // year within the cycle, 0..=399
+    let y = yoe + era * 400; // year number for a year that starts on March 1
+    let doy = doe - (365 * yoe + yoe / 4 - yoe / 100); // day within that March-based year, 0..=365
+    let mp = (5 * doy + 2) / 153; // month index with March = 0, 0..=11
+    let d = doy - (153 * mp + 2) / 5 + 1; // day of month, 1..=31
+    let m = if mp < 10 { mp + 3 } else { mp - 9 }; // calendar month, 1..=12
+    let y = if m <= 2 { y + 1 } else { y }; // January/February belong to the next calendar year
+    format!("{y:04}-{m:02}-{d:02}") // zero-padded YYYY-MM-DD
+}
+
 fn generate_with_paged_cache(
    model: &ChatModel,
    cache: &mut PagedKVCache,
@@ -627,9 +656,14 @@ fn generate_with_paged_cache(
    enum HarmonyState { Normal, ReadingChannel, InAnalysis, InFinal }
    let mut hstate = if is_moe { HarmonyState::InFinal } else { HarmonyState::Normal };

+    // Off by default. A repetition penalty over a harmony stream penalizes the
+    // control tokens (<|channel|>, <|message|>, <|start|>) that MUST repeat to
+    // open the final channel — so a non-1.0 default makes gpt-oss stop right
+    // after the analysis block, before emitting any answer. Opt in via the env
+    // var if you want it for plain (non-harmony) generation.
    let rep_penalty: f32 = std::env::var("XSERV_REP_PENALTY").ok()
        .and_then(|s| s.parse().ok())
-        .unwrap_or(if is_moe { 1.3 } else { 1.0 });
+        .unwrap_or(1.0);
    let rep_window: usize = std::env::var("XSERV_REP_WINDOW").ok()
        .and_then(|s| s.parse().ok())
        .unwrap_or(512);
@@ -724,6 +758,14 @@ fn generate_with_paged_cache(
            next = pick(&logits, sampling, &history);
            continue;
        }
+        if is_moe && hstate != HarmonyState::InFinal {
+            // Between harmony messages (after a channel's <|end|>, before the
+            // next <|channel|>): the model emits a role header like "assistant".
+            // That's structural, not user-visible content — suppress it. Only
+            // for moe/harmony; non-moe (Qwen3) stays in Normal and prints here.
+            next = pick(&logits, sampling, &history);
+            continue;
+        }

        print_generated_token(
            tokenizer,