
Commit

Add tokenizer and convert model
Signed-off-by: Aisuko <[email protected]>
Aisuko committed Oct 31, 2023
1 parent 390d41d commit 4c7f5ca
Showing 4 changed files with 156 additions and 2 deletions.
61 changes: 61 additions & 0 deletions backend/rust/backend-burn/src/pkg/convert.rs
@@ -0,0 +1,61 @@
use crate::models::{Llama, LlamaConfig};

use burn::{
    backend::tch::{TchBackend, TchDevice},
    config::Config,
    module::Module,
    tensor::backend::Backend,
};

use burn::record::{BinFileRecorder, HalfPrecisionSettings, Recorder, RecorderError};

use super::Loader;

pub struct Conversion {}

impl Conversion {
    pub fn convert_llama_dump_to_model<B: Backend>(
        dump_path: &str,
        model_name: &str,
        device: &B::Device,
    ) -> Result<(), Box<dyn std::error::Error>> {
        let (llama, llama_config): (Llama<B>, LlamaConfig) =
            Loader::load_llama_dmp(dump_path, device)?;
        Conversion::save_llama_model_file(llama, model_name)?;
        llama_config.save(&format!("{model_name}.cfg"))?;
        Ok(())
    }

    pub fn save_llama_model_file<B: Backend>(
        llama: Llama<B>,
        name: &str,
    ) -> Result<(), RecorderError> {
        BinFileRecorder::<HalfPrecisionSettings>::new().record(llama.into_record(), name.into())
    }
}

#[cfg(test)]
mod tests {

    use super::*;

    #[test]
    fn test_conversion() {
        type Backend = TchBackend<f32>;
        let device = TchDevice::Mps;

        // Resolve paths relative to the user's home directory.
        let home = std::env::var("HOME").unwrap();

        let dump_path = &format!("{}/Downloads/workspace/llama/tokenizer.model", home);
        let model_name = "llama2-7b-chat";

        let result =
            Conversion::convert_llama_dump_to_model::<Backend>(dump_path, model_name, &device);

        match result {
            Ok(_) => println!("ok"),
            Err(e) => println!("error: {}", e),
        }
    }
}
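
For reference, the reverse path (rebuilding the model from the saved pair) would look roughly like the sketch below. This is a sketch, not part of the commit: it assumes LlamaConfig::load mirrors the save call above via burn's Config trait, and init stands in as a hypothetical name for whatever constructor this revision exposes for building a Llama from its config.

use burn::config::Config;
use burn::module::Module;
use burn::record::{BinFileRecorder, HalfPrecisionSettings, Recorder};
use burn::tensor::backend::Backend;

use crate::models::{Llama, LlamaConfig};

fn load_converted<B: Backend>(model_name: &str, device: &B::Device) -> Llama<B> {
    // Read back the config written by convert_llama_dump_to_model.
    let config = LlamaConfig::load(&format!("{model_name}.cfg")).expect("readable config");
    // Read the half-precision record written by save_llama_model_file.
    let record = BinFileRecorder::<HalfPrecisionSettings>::new()
        .load(model_name.into())
        .expect("readable model record");
    // `init` is a hypothetical constructor name; substitute the one this
    // revision actually provides, then swap in the recorded weights.
    let llama: Llama<B> = config.init::<B>(device);
    llama.load_record(record)
}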
2 changes: 0 additions & 2 deletions backend/rust/backend-burn/src/pkg/loader.rs
@@ -2,9 +2,7 @@
//! Adapted by Aisuko

use core::f32;
use npyz::NpyFile;
use num_traits::cast::ToPrimitive;
use std::io::Read;

use burn::{
    nn,
4 changes: 4 additions & 0 deletions backend/rust/backend-burn/src/pkg/mod.rs
@@ -1,3 +1,7 @@
pub mod convert;
pub mod loader;
pub mod tokenizer;

pub use convert::*;
pub use loader::*;
pub use tokenizer::*;
91 changes: 91 additions & 0 deletions backend/rust/backend-burn/src/pkg/tokenizer.rs
@@ -0,0 +1,91 @@
//! LLM tokenization tools crate.
//! Adapted from https://github.com/Gadersd/llama2-burn/blob/main/src/token.rs
//! Adapted by Aisuko

use rust_tokenizers::{
    error::TokenizerError,
    tokenizer::{SentencePieceBpeTokenizer, Tokenizer, TruncationStrategy},
    vocab::Vocab,
};

use std::result;

const BOS_TOKEN_ID: i64 = 1;
const EOS_TOKEN_ID: i64 = 2;

pub type Result<T> = result::Result<T, TokenizerError>;

pub struct LlamaTokenizer {
    spm: SentencePieceBpeTokenizer,
}

impl LlamaTokenizer {
    pub fn new(tokenizer_path: &str) -> Result<Self> {
        let lower_case = false; // Llama's SentencePiece model is case-sensitive
        SentencePieceBpeTokenizer::from_file(tokenizer_path, lower_case).map(|spm| Self { spm })
    }

    pub fn encode(&self, text: &str, include_bos: bool, include_eos: bool) -> Vec<i64> {
        let pre = if include_bos {
            vec![BOS_TOKEN_ID]
        } else {
            vec![]
        };

        let post = if include_eos {
            vec![EOS_TOKEN_ID]
        } else {
            vec![]
        };

        let token_ids = self
            .spm
            .encode(
                text,
                None,
                usize::MAX,
                &TruncationStrategy::LongestFirst,
                0,
            )
            .token_ids;

        [pre, token_ids, post]
            .into_iter()
            .flat_map(|v| v.into_iter())
            .collect()
    }

    pub fn decode(&self, tokens: &[i64], skip_special_tokens: bool) -> String {
        let clean_spaces = false;
        self.spm.decode(tokens, skip_special_tokens, clean_spaces)
    }

    pub fn vocab_size(&self, include_special_tokens: bool) -> usize {
        let vocab = self.spm.vocab();
        if include_special_tokens {
            vocab.values().len() + vocab.special_values().len()
        } else {
            vocab.values().len()
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenizer() {
        let home = std::env::var("HOME").unwrap();
        let tm_path = &format!("{}/Downloads/workspace/llama/tokenizer.model", home);
        let tokenizer = LlamaTokenizer::new(tm_path).unwrap();
        // tokenizer.vocab_size(false) should be > 0
        assert!(tokenizer.vocab_size(false) > 0);

        let test_str = "Hello, I am Llama2!";
        let encoded = tokenizer.encode(test_str, true, true);
        let decoded = tokenizer.decode(&encoded, true); // skip BOS/EOS so the round trip matches

        assert_eq!(test_str, decoded);
    }
}
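
The test above already exercises the encode/decode round trip; as a minimal usage sketch (the tokenizer.model path here is an assumption, not part of this commit):

let tokenizer = LlamaTokenizer::new("tokenizer.model").expect("valid SentencePiece model");
let ids = tokenizer.encode("Hello, I am Llama2!", true, true); // [1, ..., 2]: BOS and EOS added
let text = tokenizer.decode(&ids, true); // skip special tokens to recover the original text
assert_eq!(text, "Hello, I am Llama2!");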
