From be1c3acaf0e9d3317c3a250244235ed93ab34ad8 Mon Sep 17 00:00:00 2001 From: Huanghe Date: Thu, 1 Aug 2024 16:36:22 -0500 Subject: [PATCH] Make `Engine` FFI deepcopy friendly & improve vocab warnings - `Engine` now supports `__deepcopy__`. - `Vocabulary` now warns against empty tokens and no longer produces false positives for creepy vocabulary --- Cargo.toml | 2 +- pyproject.toml | 2 +- src/ffi_bindings.rs | 7 ++++++- src/vocabulary.rs | 11 ++++++++--- 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 1a3a458..df2bb68 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,7 @@ keywords = [ license = "MIT OR Apache-2.0" name = "kbnf" repository = "https://github.com/Dan-Wanna-M/kbnf" -version = "0.3.0" +version = "0.3.1" [lib] name = "kbnf" crate-type = ["cdylib", "rlib"] diff --git a/pyproject.toml b/pyproject.toml index 6878ed7..9e007fc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ python-source = "python" features = ["python"] [project] name = "kbnf" -version = "0.2.1" +version = "0.2.2" dependencies = ["numpy"] requires-python = ">=3.7" classifiers = [ diff --git a/src/ffi_bindings.rs b/src/ffi_bindings.rs index 2794982..f5e0cf2 100644 --- a/src/ffi_bindings.rs +++ b/src/ffi_bindings.rs @@ -1,10 +1,12 @@ use crate::engine::CreateEngineError; use crate::engine_like::{AcceptTokenError, MaskLogitsError, UpdateLogitsError}; use crate::vocabulary::{CreateVocabularyError, Vocabulary}; -use crate::{config, AcceptTokenResult, Config, Engine, EngineLike, Token}; +use crate::{AcceptTokenResult, Config, Engine, EngineLike, Token}; #[cfg(feature = "python")] use pyo3::exceptions::PyValueError; #[cfg(feature = "python")] +use pyo3::types::PyDict; +#[cfg(feature = "python")] use pyo3::{pymethods, PyErr}; #[cfg(feature = "wasm")] use wasm_bindgen::prelude::*; @@ -641,6 +643,9 @@ impl Engine { fn __copy__(&self) -> Engine { self.clone() } + fn __deepcopy__(&self, _memo:pyo3::Bound<'_,PyDict>) -> Engine { + self.clone() + 
} } #[cfg(feature = "wasm")] diff --git a/src/vocabulary.rs b/src/vocabulary.rs index 592278e..8abcb1c 100644 --- a/src/vocabulary.rs +++ b/src/vocabulary.rs @@ -138,6 +138,11 @@ impl Vocabulary { let mut temp: [Vec<(u32, &Token)>; 256] = array::from_fn(|_| (vec![])); for (&token_id, token) in id_to_token.iter() { if token.0.is_empty() { + log::warn!( + "Token ID {} corresponds to an empty token. + The token will be ignored. ", + token_id + ); continue; } let first_byte = token.0[0]; @@ -183,7 +188,7 @@ impl Vocabulary { } } check_non_existing_byte_in_range(first_bytes, &mut not_existing_bytes, 32, 126); - if !not_existing_bytes.is_empty() { + if !not_existing_bytes.is_clear() { log::warn!( "\ The following printable ASCII characters are not used as the first byte of any token: {:?}. \ @@ -197,8 +202,8 @@ processing the vocab like the tokenizer.", ); } not_existing_bytes.clear(); - check_non_existing_byte_in_range(first_bytes, &mut not_existing_bytes, 127, 253); // 254 and 255 will not exist anyway - if !not_existing_bytes.is_empty() { + check_non_existing_byte_in_range(first_bytes, &mut not_existing_bytes, 194, 247); + if !not_existing_bytes.is_clear() { log::warn!( "\ The following UTF-8 bytes are not used as the first byte of any token: {:?}. \