From 2d4b3735e44ac242956adc73bd5ed0c69338f407 Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Fri, 12 Jul 2024 10:38:40 +0200
Subject: [PATCH] fix everything

---
 bindings/python/src/tokenizer.rs             |  1 -
 tokenizers/src/tokenizer/added_vocabulary.rs | 20 ++++++++------------
 tokenizers/src/tokenizer/mod.rs              |  2 --
 3 files changed, 8 insertions(+), 15 deletions(-)

diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs
index a2b191673..9b0b82dcf 100644
--- a/bindings/python/src/tokenizer.rs
+++ b/bindings/python/src/tokenizer.rs
@@ -1290,7 +1290,6 @@ impl PyTokenizer {
             processed_old_tokens.push(old_token);
             processed_new_tokens.push(new_token);
         }
-
         Ok(self
             .tokenizer
             .assign_tokens(&processed_old_tokens, &processed_new_tokens))
diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs
index 144f2c44e..84f4927a7 100644
--- a/tokenizers/src/tokenizer/added_vocabulary.rs
+++ b/tokenizers/src/tokenizer/added_vocabulary.rs
@@ -322,7 +322,7 @@ impl AddedVocabulary {
                 .lock()
                 .unwrap()
                 .entry(id)
-                .and_modify(|t| *t = new.clone());
+                .and_modify(|t| t.content = new.content.clone());
             self.refresh_added_tokens(model, normalizer);
         } else {
             error!("Error: you tried to re-assign a token that does not exist in the added vocab. Make sure {:?} is first added to the vocab", old.content.clone())
@@ -336,17 +336,12 @@ impl AddedVocabulary {
     /// non-normalized string, and one matching against the normalized one.
     fn refresh_added_tokens(&mut self, model: &impl Model, normalizer: Option<&N>) {
         type TupleTokenId<'a> = (&'a AddedToken, u32);
-        let (normalized, non_normalized): (Vec<TupleTokenId>, Vec<TupleTokenId>) = self
-            .added_tokens
-            .iter()
-            .map(|token| {
-                (
-                    token,
-                    self.token_to_id(&token.content, model)
-                        .expect("Missing additional token"),
-                )
-            })
-            .partition(|(token, _)| token.normalized);
+        let added_tokens_map_r = self.added_tokens_map_r.lock().unwrap().clone();
+        let (normalized, non_normalized): (Vec<TupleTokenId>, Vec<TupleTokenId>) =
+            added_tokens_map_r
+                .iter()
+                .map(|(id, token)| (token, *id))
+                .partition(|(token, _)| token.normalized);

         let (tokens, ids): (Vec<&AddedToken>, Vec<u32>) = non_normalized.into_iter().unzip();
         let trie = AhoCorasickBuilder::new()
@@ -363,6 +358,7 @@
                 if let Some(n) = normalizer {
                     n.normalize(&mut content).unwrap();
                 }
+                println!("{:?}", token);
                 content
             })
             .collect();
diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index aee256a42..c6433dc43 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -541,9 +541,7 @@
             model,
             post_processor: None,
             decoder: None,
-
             added_vocabulary: AddedVocabulary::new(),
-
             truncation: None,
             padding: None,
         }
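
Below is a minimal, self-contained sketch of the behavioral fix in the added_vocabulary.rs hunk: re-assignment now rewrites only the stored token's content instead of replacing the whole AddedToken, so per-token flags such as normalized survive. The AddedToken struct here is a simplified stand-in (only content and normalized fields), not the library's full type; the map name added_tokens_map_r comes from the diff, but the Mutex guarding it in the real crate is dropped for brevity.

    use std::collections::HashMap;

    // Simplified stand-in for tokenizers' AddedToken (illustrative only).
    #[derive(Debug, Clone)]
    struct AddedToken {
        content: String,
        normalized: bool,
    }

    fn main() {
        // Mirrors the id -> token map (added_tokens_map_r); a plain
        // HashMap is enough for this sketch.
        let mut added_tokens_map_r: HashMap<u32, AddedToken> = HashMap::new();
        added_tokens_map_r.insert(
            0,
            AddedToken { content: "<old_tok>".into(), normalized: false },
        );

        let new = AddedToken { content: "<new_tok>".into(), normalized: true };

        // Before the patch, `.and_modify(|t| *t = new.clone())` replaced the
        // whole entry, clobbering flags like `normalized`. After the patch,
        // only the content string is updated:
        added_tokens_map_r
            .entry(0)
            .and_modify(|t| t.content = new.content.clone());

        // The re-assigned token keeps its original `normalized` flag.
        assert_eq!(added_tokens_map_r[&0].content, "<new_tok>");
        assert!(!added_tokens_map_r[&0].normalized);
    }

The same distinction explains the refresh_added_tokens rewrite above: the trie is rebuilt from the id -> token map directly, so the map must stay internally consistent rather than being overwritten wholesale on re-assignment.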