tokenizer: support multiple end-of-generation tokens

Track an ordered eos_token_ids list (not just one id) and add is_eos(). gpt-oss/harmony ends the assistant turn on <|return|> and also treats <|call|> and <|endoftext|> as terminators (generation_config.json eos_token_id = [200002, 199999, 200012]); single-eos families are unchanged. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-31 00:56:21 +08:00
parent 9c98c169ff
commit e11f15e009
1 changed files with 29 additions and 5 deletions
--- a/crates/xserv-tokenizer/src/bpe.rs
+++ b/crates/xserv-tokenizer/src/bpe.rs
@@ -12,6 +12,7 @@ pub struct Tokenizer {
    special_token_ids: HashMap<u32, String>,
    pre_tokenize_re: Regex,
    eos_token_id: Option<u32>,
    eos_token_ids: Vec<u32>,
    byte_fallback: bool,
 }
@@ -102,11 +103,27 @@ impl Tokenizer {
            decoder.resize(decoder.len().max(at.id as usize + 1), vec![]);
            decoder[at.id as usize] = at.content.as_bytes().to_vec();
        }
-        let eos_token_id = special_tokens
+        // End-of-generation tokens, in priority order. Families differ:
-            .get("<|im_end|>")
+        // Qwen uses <|im_end|>, Llama <|end_of_text|>, GPT-2 <|endoftext|>.
-            .or_else(|| special_tokens.get("<|end_of_text|>"))
+        // gpt-oss (harmony) ends the assistant turn with <|return|> and also
-            .or_else(|| special_tokens.get("<|endoftext|>"))
+        // treats <|call|> (tool call) and <|endoftext|> as terminators
-            .copied();
+        // (see generation_config.json eos_token_id = [200002, 199999, 200012]).
        let eos_names = [
            "<|im_end|>",
            "<|end_of_text|>",
            "<|return|>",
            "<|call|>",
            "<|endoftext|>",
        ];
        let mut eos_token_ids: Vec<u32> = Vec::new();
        for name in eos_names {
            if let Some(&id) = special_tokens.get(name) {
                if !eos_token_ids.contains(&id) {
                    eos_token_ids.push(id);
                }
            }
        }
        let eos_token_id = eos_token_ids.first().copied();
        // Pre-tokenization regex
        let pre_tokenize_re = if byte_fallback {
@@ -125,6 +142,7 @@ impl Tokenizer {
            special_token_ids,
            pre_tokenize_re,
            eos_token_id,
            eos_token_ids,
            byte_fallback,
        }
    }
@@ -249,6 +267,12 @@ impl Tokenizer {
        self.eos_token_id
    }
    /// Reports whether `id` is one of this tokenizer's end-of-generation tokens.
    ///
    /// A model may stop on more than one token (gpt-oss/harmony ends on
    /// `<|return|>`, `<|call|>`, or `<|endoftext|>`), so `id` is compared
    /// against every entry recorded in `eos_token_ids`, not just the first.
    pub fn is_eos(&self, id: u32) -> bool {
        self.eos_token_ids.iter().any(|&eos| eos == id)
    }
    pub fn vocab_size(&self) -> usize {
        self.decoder.len()
    }