Make Engine FFI deepcopy-friendly & improve vocab warnings

- `Engine` now supports `__deepcopy__`. - `Vocabulary` now warns about empty tokens and no longer produces false-positive warnings about a creepy vocabulary.
Dan-wanna-M · Aug 1, 2024 · be1c3ac · be1c3ac
1 parent 3ad52a3
commit be1c3ac
Show file tree

Hide file tree

Showing 4 changed files with 16 additions and 6 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -14,7 +14,7 @@ keywords = [
 license = "MIT OR Apache-2.0"
 name = "kbnf"
 repository = "https://github.com/Dan-Wanna-M/kbnf"
-version = "0.3.0"
+version = "0.3.1"
 [lib]
 name = "kbnf"
 crate-type = ["cdylib", "rlib"]

diff --git a/pyproject.toml b/pyproject.toml
@@ -6,7 +6,7 @@ python-source = "python"
 features = ["python"]
 [project]
 name = "kbnf"
-version = "0.2.1"
+version = "0.2.2"
 dependencies = ["numpy"]
 requires-python = ">=3.7"
 classifiers = [

diff --git a/src/ffi_bindings.rs b/src/ffi_bindings.rs
@@ -1,10 +1,12 @@
 use crate::engine::CreateEngineError;
 use crate::engine_like::{AcceptTokenError, MaskLogitsError, UpdateLogitsError};
 use crate::vocabulary::{CreateVocabularyError, Vocabulary};
-use crate::{config, AcceptTokenResult, Config, Engine, EngineLike, Token};
+use crate::{AcceptTokenResult, Config, Engine, EngineLike, Token};
 #[cfg(feature = "python")]
 use pyo3::exceptions::PyValueError;
 #[cfg(feature = "python")]
+use pyo3::types::PyDict;
+#[cfg(feature = "python")]
 use pyo3::{pymethods, PyErr};
 #[cfg(feature = "wasm")]
 use wasm_bindgen::prelude::*;
@@ -641,6 +643,9 @@ impl Engine {
     fn __copy__(&self) -> Engine {
         self.clone()
     }
+    fn __deepcopy__(&self, _memo:pyo3::Bound<'_,PyDict>) -> Engine {
+        self.clone()
+    }
 }
 
 #[cfg(feature = "wasm")]

diff --git a/src/vocabulary.rs b/src/vocabulary.rs
@@ -138,6 +138,11 @@ impl Vocabulary {
         let mut temp: [Vec<(u32, &Token)>; 256] = array::from_fn(|_| (vec![]));
         for (&token_id, token) in id_to_token.iter() {
             if token.0.is_empty() {
+                log::warn!(
+                    "Token ID {} corresponds to an empty token. 
+                    The token will be ignored. ",
+                    token_id
+                );
                 continue;
             }
             let first_byte = token.0[0];
@@ -183,7 +188,7 @@ impl Vocabulary {
             }
         }
         check_non_existing_byte_in_range(first_bytes, &mut not_existing_bytes, 32, 126);
-        if !not_existing_bytes.is_empty() {
+        if !not_existing_bytes.is_clear() {
             log::warn!(
                 "\
 The following printable ASCII characters are not used as the first byte of any token: {:?}. \
@@ -197,8 +202,8 @@ processing the vocab like the tokenizer.",
             );
         }
         not_existing_bytes.clear();
-        check_non_existing_byte_in_range(first_bytes, &mut not_existing_bytes, 127, 253); // 254 and 255 will not exist anyway
-        if !not_existing_bytes.is_empty() {
+        check_non_existing_byte_in_range(first_bytes, &mut not_existing_bytes, 194, 247);
+        if !not_existing_bytes.is_clear() {
             log::warn!(
                 "\
 The following UTF-8 bytes are not used as the first byte of any token: {:?}. \