tokenizer: read pre-tokenizer regex from tokenizer.json

Parse the model's `pre_tokenizer` section to extract its Split regex
instead of hardcoding the GPT-2 pattern.  The gpt-oss-20b model uses
a GPT-4-style regex that produces different word boundaries, causing a
1-token prompt mismatch vs HuggingFace (136 → 135 tokens, now aligned).

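The lookahead `(?!\S)` is not supported by the Rust `regex` crate, so it is
stripped.  Note that this changes how a run of whitespace that is followed by
a non-whitespace character is split: `\s+` now consumes the whole run, so the
last whitespace character is no longer left over to attach to the following
token.  Whitespace at the very end of the input is unaffected.  Falls back to
the old GPT-2/Qwen heuristic if the model regex fails to compile or is absent.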

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Gahow Wang
2026-05-31 13:22:35 +08:00
parent 241009a96c
commit 377a04b81f

View File

@@ -21,6 +21,24 @@ struct TokenizerJson {
model: ModelSection,
#[serde(default)]
added_tokens: Vec<AddedToken>,
#[serde(default)]
pre_tokenizer: Option<PreTokenizerSection>,
}
/// One node of the `pre_tokenizer` section of `tokenizer.json`.
///
/// The node is recursive: it may describe a single pre-tokenizer (identified
/// by `kind`) or carry a list of nested nodes in `pretokenizers`. Every field
/// is optional, so a node with none of these keys deserializes with all
/// fields set to `None`.
#[derive(Deserialize)]
struct PreTokenizerSection {
/// Value of the JSON `"type"` key (renamed because `type` is a Rust keyword).
/// The loader compares it against `"Split"` to locate the regex-bearing node.
#[serde(default, rename = "type")]
kind: Option<String>,
/// The node's `pattern` object, if present; see [`PatternSpec`].
#[serde(default)]
pattern: Option<PatternSpec>,
/// Nested pre-tokenizer nodes, if present. The loader scans this list for
/// the first `"Split"` entry that carries a regex.
#[serde(default)]
pretokenizers: Option<Vec<PreTokenizerSection>>,
}
/// The `pattern` object attached to a pre-tokenizer node.
///
/// Only the `"Regex"` key is read here.
/// NOTE(review): presumably other pattern forms (e.g. a literal-string key)
/// exist upstream and would leave `regex` as `None` — confirm against the
/// tokenizer.json files this loader must support.
#[derive(Deserialize)]
struct PatternSpec {
/// Regex source text taken from the JSON `"Regex"` key; `None` when the key
/// is absent.
#[serde(rename = "Regex")]
regex: Option<String>,
}
#[derive(Deserialize)]
@@ -125,12 +143,44 @@ impl Tokenizer {
}
let eos_token_id = eos_token_ids.first().copied();
// Pre-tokenization regex
let pre_tokenize_re = if byte_fallback {
// Qwen-style: split on whitespace boundaries, keep Unicode words/numbers
// Pre-tokenization regex: prefer the model's own regex from tokenizer.json,
// fall back to GPT-2/Qwen heuristic if not present or unsupported.
let model_regex = tj.pre_tokenizer.as_ref().and_then(|pt| {
// Direct Split with regex
if pt.kind.as_deref() == Some("Split") {
return pt.pattern.as_ref().and_then(|p| p.regex.clone());
}
// Sequence → find the Split entry
if let Some(subs) = &pt.pretokenizers {
for sub in subs {
if sub.kind.as_deref() == Some("Split") {
if let Some(r) = sub.pattern.as_ref().and_then(|p| p.regex.clone()) {
return Some(r);
}
}
}
}
None
});
let pre_tokenize_re = if let Some(ref pat) = model_regex {
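// Strip the lookahead (?!\S) — the Rust regex crate doesn't support look-around.
// NOTE: without it, `\s+` consumes an entire whitespace run that precedes a
// non-whitespace character, so such runs may split differently than upstream.
// Whitespace at the very end of the input is unaffected.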
let cleaned = pat.replace(r"(?!\S)", "");
match Regex::new(&cleaned) {
Ok(re) => re,
Err(e) => {
eprintln!("warning: model pre_tokenizer regex failed ({e}), using fallback");
if byte_fallback {
Regex::new(r"[\p{L}\p{N}]+|[^\s\p{L}\p{N}]|\s+").unwrap()
} else {
Regex::new(r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+").unwrap()
}
}
}
} else if byte_fallback {
Regex::new(r"[\p{L}\p{N}]+|[^\s\p{L}\p{N}]|\s+").unwrap()
} else {
// GPT-2 style
Regex::new(r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+").unwrap()
};