tokenizer: read pre-tokenizer regex from tokenizer.json

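Parse the model's `pre_tokenizer` section to extract its Split regex instead of hardcoding the GPT-2 pattern. The gpt-oss-20b model uses a GPT-4-style regex that produces different word boundaries, causing a 1-token prompt mismatch vs HuggingFace (136 → 135 tokens, now aligned). The lookahead `(?!\S)`, which the Rust `regex` crate does not support, is stripped. For patterns ending in `\s+(?!\S)|\s+` this changes how a run of two or more whitespace characters directly followed by a non-whitespace character is split (the whole run becomes one piece instead of leaving the last whitespace character for the following token); single spaces and trailing whitespace are unaffected. Falls back to the old GPT-2/Qwen heuristic if the model regex fails to compile or is absent. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>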
2026-05-31 13:22:35 +08:00
parent 241009a96c
commit 377a04b81f
1 changed files with 54 additions and 4 deletions
--- a/crates/xserv-tokenizer/src/bpe.rs
+++ b/crates/xserv-tokenizer/src/bpe.rs
@@ -21,6 +21,24 @@ struct TokenizerJson {
    model: ModelSection,
    #[serde(default)]
    added_tokens: Vec<AddedToken>,
+    #[serde(default)]
+    pre_tokenizer: Option<PreTokenizerSection>,
+}
+
+#[derive(Deserialize)]
+struct PreTokenizerSection {
+    #[serde(default, rename = "type")]
+    kind: Option<String>,
+    #[serde(default)]
+    pattern: Option<PatternSpec>,
+    #[serde(default)]
+    pretokenizers: Option<Vec<PreTokenizerSection>>,
+}
+
+#[derive(Deserialize)]
+struct PatternSpec {
+    #[serde(rename = "Regex")]
+    regex: Option<String>,
 }

 #[derive(Deserialize)]
@@ -125,12 +143,44 @@ impl Tokenizer {
        }
        let eos_token_id = eos_token_ids.first().copied();

-        // Pre-tokenization regex
-        let pre_tokenize_re = if byte_fallback {
-            // Qwen-style: split on whitespace boundaries, keep Unicode words/numbers
+        // Pre-tokenization regex: prefer the model's own regex from tokenizer.json,
+        // fall back to GPT-2/Qwen heuristic if not present or unsupported.
+        let model_regex = tj.pre_tokenizer.as_ref().and_then(|pt| {
+            // Direct Split with regex
+            if pt.kind.as_deref() == Some("Split") {
+                return pt.pattern.as_ref().and_then(|p| p.regex.clone());
+            }
+            // Sequence → find the Split entry
+            if let Some(subs) = &pt.pretokenizers {
+                for sub in subs {
+                    if sub.kind.as_deref() == Some("Split") {
+                        if let Some(r) = sub.pattern.as_ref().and_then(|p| p.regex.clone()) {
+                            return Some(r);
+                        }
+                    }
+                }
+            }
+            None
+        });
+
+        let pre_tokenize_re = if let Some(ref pat) = model_regex {
+            // Strip unsupported lookahead (?!\S) — Rust regex doesn't support it.
+            // Dropping it changes how runs of 2+ whitespace before a non-space char split.
+            let cleaned = pat.replace(r"(?!\S)", "");
+            match Regex::new(&cleaned) {
+                Ok(re) => re,
+                Err(e) => {
+                    eprintln!("warning: model pre_tokenizer regex failed ({e}), using fallback");
+                    if byte_fallback {
+                        Regex::new(r"[\p{L}\p{N}]+|[^\s\p{L}\p{N}]|\s+").unwrap()
+                    } else {
+                        Regex::new(r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+").unwrap()
+                    }
+                }
+            }
+        } else if byte_fallback {
            Regex::new(r"[\p{L}\p{N}]+|[^\s\p{L}\p{N}]|\s+").unwrap()
        } else {
-            // GPT-2 style
            Regex::new(r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+").unwrap()
        };