tokenizer: read pre-tokenizer regex from tokenizer.json
Parse the model's `pre_tokenizer` section to extract its Split regex instead of hardcoding the GPT-2 pattern. The gpt-oss-20b model uses a GPT-4-style regex that produces different word boundaries, causing a 1-token prompt mismatch vs HuggingFace (136 → 135 tokens, now aligned). Unsupported lookahead `(?!\S)` is stripped — it only affects trailing whitespace edge cases. Falls back to the old GPT-2/Qwen heuristic if the model regex fails to compile or is absent. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -21,6 +21,24 @@ struct TokenizerJson {
|
||||
model: ModelSection,
|
||||
#[serde(default)]
|
||||
added_tokens: Vec<AddedToken>,
|
||||
#[serde(default)]
|
||||
pre_tokenizer: Option<PreTokenizerSection>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct PreTokenizerSection {
|
||||
#[serde(default, rename = "type")]
|
||||
kind: Option<String>,
|
||||
#[serde(default)]
|
||||
pattern: Option<PatternSpec>,
|
||||
#[serde(default)]
|
||||
pretokenizers: Option<Vec<PreTokenizerSection>>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct PatternSpec {
|
||||
#[serde(rename = "Regex")]
|
||||
regex: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
@@ -125,12 +143,44 @@ impl Tokenizer {
|
||||
}
|
||||
let eos_token_id = eos_token_ids.first().copied();
|
||||
|
||||
// Pre-tokenization regex
|
||||
let pre_tokenize_re = if byte_fallback {
|
||||
// Qwen-style: split on whitespace boundaries, keep Unicode words/numbers
|
||||
// Pre-tokenization regex: prefer the model's own regex from tokenizer.json,
|
||||
// fall back to GPT-2/Qwen heuristic if not present or unsupported.
|
||||
let model_regex = tj.pre_tokenizer.as_ref().and_then(|pt| {
|
||||
// Direct Split with regex
|
||||
if pt.kind.as_deref() == Some("Split") {
|
||||
return pt.pattern.as_ref().and_then(|p| p.regex.clone());
|
||||
}
|
||||
// Sequence → find the Split entry
|
||||
if let Some(subs) = &pt.pretokenizers {
|
||||
for sub in subs {
|
||||
if sub.kind.as_deref() == Some("Split") {
|
||||
if let Some(r) = sub.pattern.as_ref().and_then(|p| p.regex.clone()) {
|
||||
return Some(r);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
});
|
||||
|
||||
let pre_tokenize_re = if let Some(ref pat) = model_regex {
|
||||
// Strip unsupported lookahead (?!\S) — Rust regex doesn't support it.
|
||||
// The lookahead only affects trailing-whitespace edge cases.
|
||||
let cleaned = pat.replace(r"(?!\S)", "");
|
||||
match Regex::new(&cleaned) {
|
||||
Ok(re) => re,
|
||||
Err(e) => {
|
||||
eprintln!("warning: model pre_tokenizer regex failed ({e}), using fallback");
|
||||
if byte_fallback {
|
||||
Regex::new(r"[\p{L}\p{N}]+|[^\s\p{L}\p{N}]|\s+").unwrap()
|
||||
} else {
|
||||
Regex::new(r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+").unwrap()
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if byte_fallback {
|
||||
Regex::new(r"[\p{L}\p{N}]+|[^\s\p{L}\p{N}]|\s+").unwrap()
|
||||
} else {
|
||||
// GPT-2 style
|
||||
Regex::new(r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+").unwrap()
|
||||
};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user