From be1c3acaf0e9d3317c3a250244235ed93ab34ad8 Mon Sep 17 00:00:00 2001
From: Huanghe <xs28@rice.edu>
Date: Thu, 1 Aug 2024 16:36:22 -0500
Subject: [PATCH] Make `Engine` FFI deepcopy-friendly & improve vocab warnings

- `Engine` now supports `__deepcopy__`.
- `Vocabulary` now warns about empty tokens and no longer produces false-positive warnings about a creepy vocabulary.
---
 Cargo.toml          |  2 +-
 pyproject.toml      |  2 +-
 src/ffi_bindings.rs |  7 ++++++-
 src/vocabulary.rs   | 11 ++++++++---
 4 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 1a3a458..df2bb68 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -14,7 +14,7 @@ keywords = [
 license = "MIT OR Apache-2.0"
 name = "kbnf"
 repository = "https://github.com/Dan-Wanna-M/kbnf"
-version = "0.3.0"
+version = "0.3.1"
 [lib]
 name = "kbnf"
 crate-type = ["cdylib", "rlib"]
diff --git a/pyproject.toml b/pyproject.toml
index 6878ed7..9e007fc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ python-source = "python"
 features = ["python"]
 [project]
 name = "kbnf"
-version = "0.2.1"
+version = "0.2.2"
 dependencies = ["numpy"]
 requires-python = ">=3.7"
 classifiers = [
diff --git a/src/ffi_bindings.rs b/src/ffi_bindings.rs
index 2794982..f5e0cf2 100644
--- a/src/ffi_bindings.rs
+++ b/src/ffi_bindings.rs
@@ -1,10 +1,12 @@
 use crate::engine::CreateEngineError;
 use crate::engine_like::{AcceptTokenError, MaskLogitsError, UpdateLogitsError};
 use crate::vocabulary::{CreateVocabularyError, Vocabulary};
-use crate::{config, AcceptTokenResult, Config, Engine, EngineLike, Token};
+use crate::{AcceptTokenResult, Config, Engine, EngineLike, Token};
 #[cfg(feature = "python")]
 use pyo3::exceptions::PyValueError;
 #[cfg(feature = "python")]
+use pyo3::types::PyDict;
+#[cfg(feature = "python")]
 use pyo3::{pymethods, PyErr};
 #[cfg(feature = "wasm")]
 use wasm_bindgen::prelude::*;
@@ -641,6 +643,9 @@ impl Engine {
     fn __copy__(&self) -> Engine {
         self.clone()
     }
+    fn __deepcopy__(&self, _memo:pyo3::Bound<'_,PyDict>) -> Engine {
+        self.clone()
+    }
 }
 
 #[cfg(feature = "wasm")]
diff --git a/src/vocabulary.rs b/src/vocabulary.rs
index 592278e..8abcb1c 100644
--- a/src/vocabulary.rs
+++ b/src/vocabulary.rs
@@ -138,6 +138,11 @@ impl Vocabulary {
         let mut temp: [Vec<(u32, &Token)>; 256] = array::from_fn(|_| (vec![]));
         for (&token_id, token) in id_to_token.iter() {
             if token.0.is_empty() {
+                // An empty token has no first byte (indexed just below), so warn and skip it.
+                log::warn!(
+                    "Token ID {} corresponds to an empty token. The token will be ignored.",
+                    token_id
+                );
                 continue;
             }
             let first_byte = token.0[0];
@@ -183,7 +188,7 @@ impl Vocabulary {
             }
         }
         check_non_existing_byte_in_range(first_bytes, &mut not_existing_bytes, 32, 126);
-        if !not_existing_bytes.is_empty() {
+        if !not_existing_bytes.is_clear() {
             log::warn!(
                 "\
 The following printable ASCII characters are not used as the first byte of any token: {:?}. \
@@ -197,8 +202,8 @@ processing the vocab like the tokenizer.",
             );
         }
         not_existing_bytes.clear();
-        check_non_existing_byte_in_range(first_bytes, &mut not_existing_bytes, 127, 253); // 254 and 255 will not exist anyway
-        if !not_existing_bytes.is_empty() {
+        check_non_existing_byte_in_range(first_bytes, &mut not_existing_bytes, 194, 247);
+        if !not_existing_bytes.is_clear() {
             log::warn!(
                 "\
 The following UTF-8 bytes are not used as the first byte of any token: {:?}. \