tokenizer: support multiple end-of-generation tokens
Track an ordered eos_token_ids list (not just one id) and add is_eos().

gpt-oss/harmony ends the assistant turn on <|return|> and also treats <|call|> and <|endoftext|> as terminators (generation_config.json eos_token_id = [200002, 199999, 200012]); single-eos families are unchanged.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -12,6 +12,7 @@ pub struct Tokenizer {
|
||||
special_token_ids: HashMap<u32, String>,
|
||||
pre_tokenize_re: Regex,
|
||||
eos_token_id: Option<u32>,
|
||||
eos_token_ids: Vec<u32>,
|
||||
byte_fallback: bool,
|
||||
}
|
||||
|
||||
@@ -102,11 +103,27 @@ impl Tokenizer {
|
||||
decoder.resize(decoder.len().max(at.id as usize + 1), vec![]);
|
||||
decoder[at.id as usize] = at.content.as_bytes().to_vec();
|
||||
}
|
||||
let eos_token_id = special_tokens
|
||||
.get("<|im_end|>")
|
||||
.or_else(|| special_tokens.get("<|end_of_text|>"))
|
||||
.or_else(|| special_tokens.get("<|endoftext|>"))
|
||||
.copied();
|
||||
// End-of-generation tokens, in priority order. Families differ:
|
||||
// Qwen uses <|im_end|>, Llama <|end_of_text|>, GPT-2 <|endoftext|>.
|
||||
// gpt-oss (harmony) ends the assistant turn with <|return|> and also
|
||||
// treats <|call|> (tool call) and <|endoftext|> as terminators
|
||||
// (see generation_config.json eos_token_id = [200002, 199999, 200012]).
|
||||
let eos_names = [
|
||||
"<|im_end|>",
|
||||
"<|end_of_text|>",
|
||||
"<|return|>",
|
||||
"<|call|>",
|
||||
"<|endoftext|>",
|
||||
];
|
||||
let mut eos_token_ids: Vec<u32> = Vec::new();
|
||||
for name in eos_names {
|
||||
if let Some(&id) = special_tokens.get(name) {
|
||||
if !eos_token_ids.contains(&id) {
|
||||
eos_token_ids.push(id);
|
||||
}
|
||||
}
|
||||
}
|
||||
let eos_token_id = eos_token_ids.first().copied();
|
||||
|
||||
// Pre-tokenization regex
|
||||
let pre_tokenize_re = if byte_fallback {
|
||||
@@ -125,6 +142,7 @@ impl Tokenizer {
|
||||
special_token_ids,
|
||||
pre_tokenize_re,
|
||||
eos_token_id,
|
||||
eos_token_ids,
|
||||
byte_fallback,
|
||||
}
|
||||
}
|
||||
@@ -249,6 +267,12 @@ impl Tokenizer {
|
||||
self.eos_token_id
|
||||
}
|
||||
|
||||
/// True if `id` is any end-of-generation token (a model may have several;
|
||||
/// gpt-oss/harmony ends on <|return|>, <|call|>, or <|endoftext|>).
|
||||
pub fn is_eos(&self, id: u32) -> bool {
|
||||
self.eos_token_ids.contains(&id)
|
||||
}
|
||||
|
||||
pub fn vocab_size(&self) -> usize {
|
||||
self.decoder.len()
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user