tokenizer: support multiple end-of-generation tokens
Track an ordered eos_token_ids list (not just one id) and add is_eos(). gpt-oss/harmony ends the assistant turn on <|return|> and also treats <|call|> and <|endoftext|> as terminators (generation_config.json eos_token_id = [200002, 199999, 200012]); single-eos families are unchanged.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -12,6 +12,7 @@ pub struct Tokenizer {
|
|||||||
special_token_ids: HashMap<u32, String>,
|
special_token_ids: HashMap<u32, String>,
|
||||||
pre_tokenize_re: Regex,
|
pre_tokenize_re: Regex,
|
||||||
eos_token_id: Option<u32>,
|
eos_token_id: Option<u32>,
|
||||||
|
eos_token_ids: Vec<u32>,
|
||||||
byte_fallback: bool,
|
byte_fallback: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -102,11 +103,27 @@ impl Tokenizer {
|
|||||||
decoder.resize(decoder.len().max(at.id as usize + 1), vec![]);
|
decoder.resize(decoder.len().max(at.id as usize + 1), vec![]);
|
||||||
decoder[at.id as usize] = at.content.as_bytes().to_vec();
|
decoder[at.id as usize] = at.content.as_bytes().to_vec();
|
||||||
}
|
}
|
||||||
let eos_token_id = special_tokens
|
// End-of-generation tokens, in priority order. Families differ:
|
||||||
.get("<|im_end|>")
|
// Qwen uses <|im_end|>, Llama <|end_of_text|>, GPT-2 <|endoftext|>.
|
||||||
.or_else(|| special_tokens.get("<|end_of_text|>"))
|
// gpt-oss (harmony) ends the assistant turn with <|return|> and also
|
||||||
.or_else(|| special_tokens.get("<|endoftext|>"))
|
// treats <|call|> (tool call) and <|endoftext|> as terminators
|
||||||
.copied();
|
// (see generation_config.json eos_token_id = [200002, 199999, 200012]).
|
||||||
|
let eos_names = [
|
||||||
|
"<|im_end|>",
|
||||||
|
"<|end_of_text|>",
|
||||||
|
"<|return|>",
|
||||||
|
"<|call|>",
|
||||||
|
"<|endoftext|>",
|
||||||
|
];
|
||||||
|
let mut eos_token_ids: Vec<u32> = Vec::new();
|
||||||
|
for name in eos_names {
|
||||||
|
if let Some(&id) = special_tokens.get(name) {
|
||||||
|
if !eos_token_ids.contains(&id) {
|
||||||
|
eos_token_ids.push(id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let eos_token_id = eos_token_ids.first().copied();
|
||||||
|
|
||||||
// Pre-tokenization regex
|
// Pre-tokenization regex
|
||||||
let pre_tokenize_re = if byte_fallback {
|
let pre_tokenize_re = if byte_fallback {
|
||||||
@@ -125,6 +142,7 @@ impl Tokenizer {
|
|||||||
special_token_ids,
|
special_token_ids,
|
||||||
pre_tokenize_re,
|
pre_tokenize_re,
|
||||||
eos_token_id,
|
eos_token_id,
|
||||||
|
eos_token_ids,
|
||||||
byte_fallback,
|
byte_fallback,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -249,6 +267,12 @@ impl Tokenizer {
|
|||||||
self.eos_token_id
|
self.eos_token_id
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// True if `id` is any end-of-generation token (a model may have several;
|
||||||
|
/// gpt-oss/harmony ends on <|return|>, <|call|>, or <|endoftext|>).
|
||||||
|
pub fn is_eos(&self, id: u32) -> bool {
|
||||||
|
self.eos_token_ids.contains(&id)
|
||||||
|
}
|
||||||
|
|
||||||
pub fn vocab_size(&self) -> usize {
|
pub fn vocab_size(&self) -> usize {
|
||||||
self.decoder.len()
|
self.decoder.len()
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user