Skip to content

Commit

Permalink
fix: invalid utf-8 parsing
Browse files (browse the repository at this point in the history)
  • Loading branch information
acheong08 committed Nov 29, 2024
1 parent 72cc8fa commit 1b9baba
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 6 deletions.
9 changes: 5 additions & 4 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use base64;
use fancy_regex::Regex;
use mlua::prelude::*;
use rustc_hash::FxHashMap as HashMap;
Expand All @@ -6,7 +7,6 @@ use std::fs::File;
use std::io::{BufRead, BufReader};
use std::sync::{Arc, Mutex};
use std::thread;
use base64;

#[cfg(feature = "multithreading")]
const MAX_NUM_THREADS: usize = 128;
Expand Down Expand Up @@ -203,7 +203,7 @@ pub fn tiktoken_core(lua: &mlua::Lua) -> LuaResult<LuaTable> {
Ok(())
},
)?;
let _encode = lua.create_function(move |_, text: String| encode(&*state2, text))?;
let _encode = lua.create_function(move |_, text: mlua::String| encode(&*state2, text))?;

let exports = lua.create_table()?;
exports.set("new", _new)?;
Expand Down Expand Up @@ -261,7 +261,8 @@ fn new(
});
}

fn encode(state: &State, text: String) -> LuaResult<(Vec<usize>, usize, usize)> {
fn encode(state: &State, text: mlua::String) -> LuaResult<(Vec<usize>, usize, usize)> {
let encoded_str = String::from_utf8_lossy(text.as_bytes());
let allowed_special = HashSet::new();
let max_tokens = None;
Ok(state
Expand All @@ -270,7 +271,7 @@ fn encode(state: &State, text: String) -> LuaResult<(Vec<usize>, usize, usize)>
.unwrap()
.as_ref()
.unwrap()
._encode_native(&text, &allowed_special, max_tokens))
._encode_native(&encoded_str, &allowed_special, max_tokens))
}

pub struct CoreBPENative {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
package = "tiktoken_core"
version = "0.2.2-1"
version = "0.2.3-1"

source = {
url = "git+https://github.com/gptlang/lua-tiktoken",
tag = "v0.2.2",
tag = "v0.2.3",
}

description = {
Expand Down

0 comments on commit 1b9baba

Please sign in to comment.