From 2d4b3735e44ac242956adc73bd5ed0c69338f407 Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Fri, 12 Jul 2024 10:38:40 +0200
Subject: [PATCH] fix everything

---
 bindings/python/src/tokenizer.rs             |  1 -
 tokenizers/src/tokenizer/added_vocabulary.rs | 20 ++++++++------------
 tokenizers/src/tokenizer/mod.rs              |  2 --
 3 files changed, 8 insertions(+), 15 deletions(-)

diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs
index a2b191673..9b0b82dcf 100644
--- a/bindings/python/src/tokenizer.rs
+++ b/bindings/python/src/tokenizer.rs
@@ -1290,7 +1290,6 @@ impl PyTokenizer {
             processed_old_tokens.push(old_token);
             processed_new_tokens.push(new_token);
         }
-
         Ok(self
             .tokenizer
             .assign_tokens(&processed_old_tokens, &processed_new_tokens))
diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs
index 144f2c44e..84f4927a7 100644
--- a/tokenizers/src/tokenizer/added_vocabulary.rs
+++ b/tokenizers/src/tokenizer/added_vocabulary.rs
@@ -322,7 +322,7 @@ impl AddedVocabulary {
                 .lock()
                 .unwrap()
                 .entry(id)
-                .and_modify(|t| *t = new.clone());
+                .and_modify(|t| t.content = new.content.clone());
             self.refresh_added_tokens(model, normalizer);
         } else {
             error!("Error: you tried to re-assign a token that does not exist in the added vocab. Make sure {:?} is first added to the vocab", old.content.clone())
@@ -336,17 +336,12 @@ impl AddedVocabulary {
     /// non-normalized string, and one matching against the normalized one.
     fn refresh_added_tokens(&mut self, model: &impl Model, normalizer: Option<&N>) {
         type TupleTokenId<'a> = (&'a AddedToken, u32);
-        let (normalized, non_normalized): (Vec<TupleTokenId>, Vec<TupleTokenId>) = self
-            .added_tokens
-            .iter()
-            .map(|token| {
-                (
-                    token,
-                    self.token_to_id(&token.content, model)
-                        .expect("Missing additional token"),
-                )
-            })
-            .partition(|(token, _)| token.normalized);
+        let added_tokens_map_r = self.added_tokens_map_r.lock().unwrap().clone();
+        let (normalized, non_normalized): (Vec<TupleTokenId>, Vec<TupleTokenId>) =
+            added_tokens_map_r
+                .iter()
+                .map(|(id, token)| (token, *id))
+                .partition(|(token, _)| token.normalized);

         let (tokens, ids): (Vec<&AddedToken>, Vec<u32>) = non_normalized.into_iter().unzip();
         let trie = AhoCorasickBuilder::new()
@@ -363,6 +358,7 @@
                 if let Some(n) = normalizer {
                     n.normalize(&mut content).unwrap();
                 }
+                println!("{:?}", token);
                 content
             })
             .collect();
diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index aee256a42..c6433dc43 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -541,9 +541,7 @@
             model,
             post_processor: None,
             decoder: None,
-
             added_vocabulary: AddedVocabulary::new(),
-
             truncation: None,
             padding: None,
         }
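
Below is a minimal, self-contained sketch of the behavioral fix in the added_vocabulary.rs hunk: re-assignment now rewrites only the stored token's content instead of replacing the whole AddedToken, so per-token flags such as normalized survive. The AddedToken struct here is a simplified stand-in (only content and normalized fields), not the library's full type; the map name added_tokens_map_r comes from the diff, but the Mutex guarding it in the real crate is dropped for brevity.

    use std::collections::HashMap;

    // Simplified stand-in for tokenizers' AddedToken (illustrative only).
    #[derive(Debug, Clone)]
    struct AddedToken {
        content: String,
        normalized: bool,
    }

    fn main() {
        // Mirrors the id -> token map (added_tokens_map_r); a plain
        // HashMap is enough for this sketch.
        let mut added_tokens_map_r: HashMap<u32, AddedToken> = HashMap::new();
        added_tokens_map_r.insert(
            0,
            AddedToken { content: "<old_tok>".into(), normalized: false },
        );

        let new = AddedToken { content: "<new_tok>".into(), normalized: true };

        // Before the patch, `.and_modify(|t| *t = new.clone())` replaced the
        // whole entry, clobbering flags like `normalized`. After the patch,
        // only the content string is updated:
        added_tokens_map_r
            .entry(0)
            .and_modify(|t| t.content = new.content.clone());

        // The re-assigned token keeps its original `normalized` flag.
        assert_eq!(added_tokens_map_r[&0].content, "<new_tok>");
        assert!(!added_tokens_map_r[&0].normalized);
    }

The same distinction explains the refresh_added_tokens rewrite above: the trie is rebuilt from the id -> token map directly, so the map must stay internally consistent rather than being overwritten wholesale on re-assignment.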