xserv-chat: fix gpt-oss harmony chat (canonical system prompt + routing)

The hand-rolled gpt-oss system message dropped the canonical harmony
structure (identity / knowledge cutoff / current date / Reasoning level),
putting the model out of distribution — greedy decoding then flipped into
garbage or analysis loops on ~half of single-turn requests. Emit the
canonical system message (matching the model's chat_template.jinja
build_system_message macro) with Reasoning: low, plus a today_ymd() date
helper.

Also:
- Default the repetition penalty to off (1.0). Penalizing the harmony
  control tokens (<|channel|>/<|message|>/<|start|>) that must repeat to
  open the final channel made gpt-oss stop right after analysis, emitting
  nothing.
- Suppress the literal "assistant" role header that gpt-oss emits between
  the analysis and final channels: for moe (harmony) models, tokens are
  printed only while in the final channel; non-moe Qwen3 stays in Normal
  and prints as before.

Verified on dash5 (TP=2): single-turn "capital of France" is now stable
across runs with a clean final answer; Qwen3 chat unaffected.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-02 15:19:07 +08:00
parent f2e60218b4
commit 3d6bb1918e

View File

@@ -582,8 +582,19 @@ fn build_turn_prompt_gpt_oss(
) -> String {
let mut prompt = String::new();
if include_system {
// Canonical harmony system message. gpt-oss was trained on this exact
// structure (identity / knowledge cutoff / current date / Reasoning
// level / channels — see the model's chat_template.jinja). A hand-rolled
// substitute puts the model out of distribution: channel selection
// destabilizes and greedy decoding flips into garbage or analysis loops
// that never reach the `final` channel. "Reasoning: low" keeps the
// analysis channel short for an interactive chat.
prompt.push_str("<|start|>system<|message|>");
prompt.push_str("You are a helpful assistant.\n\n# Valid channels: analysis, commentary, final. Channel must be included for every message.");
prompt.push_str("You are ChatGPT, a large language model trained by OpenAI.\n");
prompt.push_str("Knowledge cutoff: 2024-06\n");
prompt.push_str(&format!("Current date: {}\n\n", today_ymd()));
prompt.push_str("Reasoning: low\n\n");
prompt.push_str("# Valid channels: analysis, commentary, final. Channel must be included for every message.");
prompt.push_str("<|end|>");
if let Some(sys) = system {
if !sys.trim().is_empty() {
@@ -600,6 +611,24 @@ fn build_turn_prompt_gpt_oss(
prompt
}
/// Current UTC date as "YYYY-MM-DD" for the harmony system message. Rata Die
/// civil-calendar conversion (same algorithm the server uses for strftime_now).
///
/// Never panics: if the system clock reads earlier than the Unix epoch, the
/// date is derived from the negative day offset instead of unwrapping the
/// `duration_since` error.
fn today_ymd() -> String {
    use std::time::{SystemTime, UNIX_EPOCH};
    const SECS_PER_DAY: u64 = 86_400;

    // Whole days relative to 1970-01-01. The quotient of a u64 by 86_400
    // always fits in an i64, so the `as` casts below cannot truncate.
    let days = match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since) => (since.as_secs() / SECS_PER_DAY) as i64,
        Err(before) => {
            // Clock is set before the epoch: round the distance *up* to whole
            // days so that any instant before midnight lands on the prior day.
            let gap = before.duration();
            let partial_day =
                gap.as_secs() % SECS_PER_DAY != 0 || gap.subsec_nanos() != 0;
            -((gap.as_secs() / SECS_PER_DAY + u64::from(partial_day)) as i64)
        }
    };

    let (y, m, d) = ymd_from_unix_days(days);
    format!("{y:04}-{m:02}-{d:02}")
}

/// Converts a day count relative to 1970-01-01 into a proleptic-Gregorian
/// `(year, month, day)` triple (month 1..=12, day 1..=31).
///
/// Works in 400-year "eras" of 146_097 days with the year taken to start on
/// 1 March, so the leap day falls at the end of the shifted year.
fn ymd_from_unix_days(days: i64) -> (i64, i64, i64) {
    // 719_468 = days from 0000-03-01 to 1970-01-01.
    let z = days + 719_468;
    // Floor division by the era length (Rust's `/` truncates toward zero).
    let era = (if z >= 0 { z } else { z - 146_096 }) / 146_097;
    let doe = z - era * 146_097; // day of era, 0..=146_096
    let yoe = (doe - doe / 1_460 + doe / 36_524 - doe / 146_096) / 365; // year of era
    let doy = doe - (365 * yoe + yoe / 4 - yoe / 100); // day of March-based year
    let mp = (5 * doy + 2) / 153; // March-based month index, 0 = March
    let d = doy - (153 * mp + 2) / 5 + 1;
    let m = if mp < 10 { mp + 3 } else { mp - 9 };
    // January and February belong to the following civil year.
    let y = yoe + era * 400 + i64::from(m <= 2);
    (y, m, d)
}
fn generate_with_paged_cache(
model: &ChatModel,
cache: &mut PagedKVCache,
@@ -627,9 +656,14 @@ fn generate_with_paged_cache(
// Output-stream state tracked during generation and stored in `hstate`.
// `hstate` starts as `InFinal` when `is_moe` is set and as `Normal` otherwise
// (see the initializer on the next line).
// NOTE(review): `ReadingChannel` and `InAnalysis` presumably track the
// harmony channel header and the analysis channel; their transitions are
// not visible in this hunk — confirm against the full function.
enum HarmonyState { Normal, ReadingChannel, InAnalysis, InFinal }
let mut hstate = if is_moe { HarmonyState::InFinal } else { HarmonyState::Normal };
// Off by default. A repetition penalty over a harmony stream penalizes the
// control tokens (<|channel|>, <|message|>, <|start|>) that MUST repeat to
// open the final channel — so a non-1.0 default makes gpt-oss stop right
// after the analysis block, before emitting any answer. Opt in via the env
// var if you want it for plain (non-harmony) generation.
let rep_penalty: f32 = std::env::var("XSERV_REP_PENALTY").ok()
.and_then(|s| s.parse().ok())
.unwrap_or(if is_moe { 1.3 } else { 1.0 });
.unwrap_or(1.0);
let rep_window: usize = std::env::var("XSERV_REP_WINDOW").ok()
.and_then(|s| s.parse().ok())
.unwrap_or(512);
@@ -724,6 +758,14 @@ fn generate_with_paged_cache(
next = pick(&logits, sampling, &history);
continue;
}
if is_moe && hstate != HarmonyState::InFinal {
// Between harmony messages (after a channel's <|end|>, before the
// next <|channel|>): the model emits a role header like "assistant".
// That's structural, not user-visible content — suppress it. Only
// for moe/harmony; non-moe (Qwen3) stays in Normal and prints here.
next = pick(&logits, sampling, &history);
continue;
}
print_generated_token(
tokenizer,