xserv-chat: fix gpt-oss harmony chat (canonical system prompt + routing)
The hand-rolled gpt-oss system message dropped the canonical harmony structure (identity / knowledge cutoff / current date / Reasoning level), putting the model out of distribution — greedy decoding then flipped into garbage or analysis loops on ~half of single-turn requests. Emit the canonical system message (matching the model's chat_template.jinja build_system_message macro) with Reasoning: low, plus a today_ymd() date helper.

Also:
- Default the repetition penalty to off (1.0). Penalizing the harmony control tokens (<|channel|>/<|message|>/<|start|>) that must repeat to open the final channel made gpt-oss stop right after analysis, emitting nothing.
- Suppress the literal "assistant" role header emitted between the analysis and final channels (only print in the final channel, moe only; non-moe Qwen3 stays in Normal and prints as before).

Verified on dash5 (TP=2): single-turn "capital of France" is now stable across runs with a clean final answer; Qwen3 chat unaffected.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -582,8 +582,19 @@ fn build_turn_prompt_gpt_oss(
|
||||
) -> String {
|
||||
let mut prompt = String::new();
|
||||
if include_system {
|
||||
// Canonical harmony system message. gpt-oss was trained on this exact
|
||||
// structure (identity / knowledge cutoff / current date / Reasoning
|
||||
// level / channels — see the model's chat_template.jinja). A hand-rolled
|
||||
// substitute puts the model out of distribution: channel selection
|
||||
// destabilizes and greedy decoding flips into garbage or analysis loops
|
||||
// that never reach the `final` channel. "Reasoning: low" keeps the
|
||||
// analysis channel short for an interactive chat.
|
||||
prompt.push_str("<|start|>system<|message|>");
|
||||
prompt.push_str("You are a helpful assistant.\n\n# Valid channels: analysis, commentary, final. Channel must be included for every message.");
|
||||
prompt.push_str("You are ChatGPT, a large language model trained by OpenAI.\n");
|
||||
prompt.push_str("Knowledge cutoff: 2024-06\n");
|
||||
prompt.push_str(&format!("Current date: {}\n\n", today_ymd()));
|
||||
prompt.push_str("Reasoning: low\n\n");
|
||||
prompt.push_str("# Valid channels: analysis, commentary, final. Channel must be included for every message.");
|
||||
prompt.push_str("<|end|>");
|
||||
if let Some(sys) = system {
|
||||
if !sys.trim().is_empty() {
|
||||
@@ -600,6 +611,24 @@ fn build_turn_prompt_gpt_oss(
|
||||
prompt
|
||||
}
|
||||
|
||||
/// Current UTC date as "YYYY-MM-DD" for the harmony system message.
///
/// Reads the system clock, reduces it to a whole-day offset from 1970-01-01,
/// and converts that offset to a calendar date (same civil-calendar algorithm
/// the server uses for strftime_now).
///
/// A clock set before the Unix epoch does not panic: it is mapped to a
/// negative day offset and formatted as the corresponding pre-1970 date.
fn today_ymd() -> String {
    use std::time::{SystemTime, UNIX_EPOCH};

    let days: i64 = match SystemTime::now().duration_since(UNIX_EPOCH) {
        // u64 seconds / 86_400 is at most ~2.1e14, so the cast is lossless.
        Ok(elapsed) => (elapsed.as_secs() / 86_400) as i64,
        // Clock is before the epoch: round the gap up to whole days so that
        // any instant inside 1969-12-31 maps to day -1, not day 0.
        Err(before) => {
            let gap = before.duration();
            let whole_days = gap.as_secs() / 86_400;
            let has_partial_day = gap.as_secs() % 86_400 != 0 || gap.subsec_nanos() != 0;
            -((whole_days + u64::from(has_partial_day)) as i64)
        }
    };

    let (y, m, d) = civil_from_days(days);
    format!("{y:04}-{m:02}-{d:02}")
}

/// Converts a day offset from 1970-01-01 into a proleptic Gregorian
/// `(year, month, day)` triple; negative offsets are accepted.
fn civil_from_days(days: i64) -> (i64, i64, i64) {
    // Shift the origin to 0000-03-01 so leap days fall at the end of a "year".
    let z = days + 719_468;
    // 400-year eras of exactly 146_097 days; the adjustment floors negative z.
    let era = (if z >= 0 { z } else { z - 146_096 }) / 146_097;
    let doe = z - era * 146_097; // day of era, [0, 146_096]
    let yoe = (doe - doe / 1_460 + doe / 36_524 - doe / 146_096) / 365; // year of era, [0, 399]
    let doy = doe - (365 * yoe + yoe / 4 - yoe / 100); // day of March-based year, [0, 365]
    let mp = (5 * doy + 2) / 153; // March-based month index, [0, 11]
    let d = doy - (153 * mp + 2) / 5 + 1;
    let m = if mp < 10 { mp + 3 } else { mp - 9 };
    // January and February belong to the next civil year in the shifted calendar.
    let y = yoe + era * 400 + if m <= 2 { 1 } else { 0 };
    (y, m, d)
}
|
||||
|
||||
fn generate_with_paged_cache(
|
||||
model: &ChatModel,
|
||||
cache: &mut PagedKVCache,
|
||||
@@ -627,9 +656,14 @@ fn generate_with_paged_cache(
|
||||
enum HarmonyState { Normal, ReadingChannel, InAnalysis, InFinal }
|
||||
let mut hstate = if is_moe { HarmonyState::InFinal } else { HarmonyState::Normal };
|
||||
|
||||
// Off by default. A repetition penalty over a harmony stream penalizes the
|
||||
// control tokens (<|channel|>, <|message|>, <|start|>) that MUST repeat to
|
||||
// open the final channel — so a non-1.0 default makes gpt-oss stop right
|
||||
// after the analysis block, before emitting any answer. Opt in via the env
|
||||
// var if you want it for plain (non-harmony) generation.
|
||||
let rep_penalty: f32 = std::env::var("XSERV_REP_PENALTY").ok()
|
||||
.and_then(|s| s.parse().ok())
|
||||
.unwrap_or(if is_moe { 1.3 } else { 1.0 });
|
||||
.unwrap_or(1.0);
|
||||
let rep_window: usize = std::env::var("XSERV_REP_WINDOW").ok()
|
||||
.and_then(|s| s.parse().ok())
|
||||
.unwrap_or(512);
|
||||
@@ -724,6 +758,14 @@ fn generate_with_paged_cache(
|
||||
next = pick(&logits, sampling, &history);
|
||||
continue;
|
||||
}
|
||||
if is_moe && hstate != HarmonyState::InFinal {
|
||||
// Between harmony messages (after a channel's <|end|>, before the
|
||||
// next <|channel|>): the model emits a role header like "assistant".
|
||||
// That's structural, not user-visible content — suppress it. Only
|
||||
// for moe/harmony; non-moe (Qwen3) stays in Normal and prints here.
|
||||
next = pick(&logits, sampling, &history);
|
||||
continue;
|
||||
}
|
||||
|
||||
print_generated_token(
|
||||
tokenizer,
|
||||
|
||||
Reference in New Issue
Block a user