Skip to content

Commit

Permalink
Make Engine FFI deepcopy-friendly & improve vocab warnings
Browse files Browse the repository at this point in the history
- `Engine` now supports `__deepcopy__`.
- `Vocabulary` now warns about empty tokens and no longer produces false-positive warnings about a creepy vocabulary.
  • Loading branch information
Dan-wanna-M committed Aug 1, 2024
1 parent 3ad52a3 commit be1c3ac
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 6 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ keywords = [
license = "MIT OR Apache-2.0"
name = "kbnf"
repository = "https://github.com/Dan-Wanna-M/kbnf"
version = "0.3.0"
version = "0.3.1"
[lib]
name = "kbnf"
crate-type = ["cdylib", "rlib"]
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ python-source = "python"
features = ["python"]
[project]
name = "kbnf"
version = "0.2.1"
version = "0.2.2"
dependencies = ["numpy"]
requires-python = ">=3.7"
classifiers = [
Expand Down
7 changes: 6 additions & 1 deletion src/ffi_bindings.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
use crate::engine::CreateEngineError;
use crate::engine_like::{AcceptTokenError, MaskLogitsError, UpdateLogitsError};
use crate::vocabulary::{CreateVocabularyError, Vocabulary};
use crate::{config, AcceptTokenResult, Config, Engine, EngineLike, Token};
use crate::{AcceptTokenResult, Config, Engine, EngineLike, Token};
#[cfg(feature = "python")]
use pyo3::exceptions::PyValueError;
#[cfg(feature = "python")]
use pyo3::types::PyDict;
#[cfg(feature = "python")]
use pyo3::{pymethods, PyErr};
#[cfg(feature = "wasm")]
use wasm_bindgen::prelude::*;
Expand Down Expand Up @@ -641,6 +643,9 @@ impl Engine {
// Copy hook exposed to Python: hands back a clone of this engine.
fn __copy__(&self) -> Engine {
    Clone::clone(self)
}
// Deep-copy hook exposed to Python: hands back a clone of this engine.
// `_memo` is accepted only to satisfy the call signature and is never read
// (presumably the memo dict that Python's `copy.deepcopy` passes — confirm).
fn __deepcopy__(&self, _memo: pyo3::Bound<'_, PyDict>) -> Engine {
    Clone::clone(self)
}
}

#[cfg(feature = "wasm")]
Expand Down
11 changes: 8 additions & 3 deletions src/vocabulary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,11 @@ impl Vocabulary {
let mut temp: [Vec<(u32, &Token)>; 256] = array::from_fn(|_| (vec![]));
for (&token_id, token) in id_to_token.iter() {
if token.0.is_empty() {
log::warn!(
"Token ID {} corresponds to an empty token.
The token will be ignored. ",
token_id
);
continue;
}
let first_byte = token.0[0];
Expand Down Expand Up @@ -183,7 +188,7 @@ impl Vocabulary {
}
}
check_non_existing_byte_in_range(first_bytes, &mut not_existing_bytes, 32, 126);
if !not_existing_bytes.is_empty() {
if !not_existing_bytes.is_clear() {
log::warn!(
"\
The following printable ASCII characters are not used as the first byte of any token: {:?}. \
Expand All @@ -197,8 +202,8 @@ processing the vocab like the tokenizer.",
);
}
not_existing_bytes.clear();
check_non_existing_byte_in_range(first_bytes, &mut not_existing_bytes, 127, 253); // 254 and 255 will not exist anyway
if !not_existing_bytes.is_empty() {
check_non_existing_byte_in_range(first_bytes, &mut not_existing_bytes, 194, 247);
if !not_existing_bytes.is_clear() {
log::warn!(
"\
The following UTF-8 bytes are not used as the first byte of any token: {:?}. \
Expand Down

0 comments on commit be1c3ac

Please sign in to comment.