tokenizer: support multiple end-of-generation tokens

Track an ordered eos_token_ids list (not just one id) and add is_eos().
gpt-oss/harmony ends the assistant turn on <|return|> and also treats
<|call|> and <|endoftext|> as terminators (generation_config.json
eos_token_id = [200002, 199999, 200012]); single-eos families are
unchanged.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-05-31 00:56:21 +08:00
parent 9c98c169ff
commit e11f15e009

View File

@@ -12,6 +12,7 @@ pub struct Tokenizer {
special_token_ids: HashMap<u32, String>, special_token_ids: HashMap<u32, String>,
pre_tokenize_re: Regex, pre_tokenize_re: Regex,
eos_token_id: Option<u32>, eos_token_id: Option<u32>,
eos_token_ids: Vec<u32>,
byte_fallback: bool, byte_fallback: bool,
} }
@@ -102,11 +103,27 @@ impl Tokenizer {
decoder.resize(decoder.len().max(at.id as usize + 1), vec![]); decoder.resize(decoder.len().max(at.id as usize + 1), vec![]);
decoder[at.id as usize] = at.content.as_bytes().to_vec(); decoder[at.id as usize] = at.content.as_bytes().to_vec();
} }
let eos_token_id = special_tokens // End-of-generation tokens, in priority order. Families differ:
.get("<|im_end|>") // Qwen uses <|im_end|>, Llama <|end_of_text|>, GPT-2 <|endoftext|>.
.or_else(|| special_tokens.get("<|end_of_text|>")) // gpt-oss (harmony) ends the assistant turn with <|return|> and also
.or_else(|| special_tokens.get("<|endoftext|>")) // treats <|call|> (tool call) and <|endoftext|> as terminators
.copied(); // (see generation_config.json eos_token_id = [200002, 199999, 200012]).
let eos_names = [
"<|im_end|>",
"<|end_of_text|>",
"<|return|>",
"<|call|>",
"<|endoftext|>",
];
let mut eos_token_ids: Vec<u32> = Vec::new();
for name in eos_names {
if let Some(&id) = special_tokens.get(name) {
if !eos_token_ids.contains(&id) {
eos_token_ids.push(id);
}
}
}
let eos_token_id = eos_token_ids.first().copied();
// Pre-tokenization regex // Pre-tokenization regex
let pre_tokenize_re = if byte_fallback { let pre_tokenize_re = if byte_fallback {
@@ -125,6 +142,7 @@ impl Tokenizer {
special_token_ids, special_token_ids,
pre_tokenize_re, pre_tokenize_re,
eos_token_id, eos_token_id,
eos_token_ids,
byte_fallback, byte_fallback,
} }
} }
@@ -249,6 +267,12 @@ impl Tokenizer {
self.eos_token_id self.eos_token_id
} }
/// Reports whether `id` is one of this tokenizer's end-of-generation tokens.
///
/// More than one id can qualify for a single model: gpt-oss/harmony, for
/// example, stops on <|return|>, <|call|>, or <|endoftext|>.
pub fn is_eos(&self, id: u32) -> bool {
    // Membership test against the stored list of terminator ids.
    let terminators: &[u32] = &self.eos_token_ids;
    terminators.contains(&id)
}
pub fn vocab_size(&self) -> usize { pub fn vocab_size(&self) -> usize {
self.decoder.len() self.decoder.len()
} }