Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] free speed/mem optimizations with ahash, dary_heap, and compact_str #1618

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions bindings/python/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ pyo3 = { version = "0.21" }
numpy = "0.21"
ndarray = "0.15"
itertools = "0.12"
ahash = { version = "0.8.11", features = ["serde"] }

[dependencies.tokenizers]
path = "../../tokenizers"
Expand Down
54 changes: 54 additions & 0 deletions bindings/python/src/models.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
use std::collections::HashMap;
use std::hash::Hash;
use std::ops::{Deref, DerefMut};
use std::path::{Path, PathBuf};
use std::sync::{Arc, RwLock};

use crate::token::PyToken;
use crate::trainers::PyTrainer;
use ahash::AHashMap;
use pyo3::exceptions;
use pyo3::prelude::*;
use pyo3::types::*;
Expand Down Expand Up @@ -31,6 +34,53 @@ pub struct PyModel {
pub model: Arc<RwLock<ModelWrapper>>,
}

/// Newtype wrapper around [`AHashMap`].
///
/// The wrapped map is exposed as the public tuple field `.0`. This file
/// implements the Python conversion traits (`IntoPy<PyObject>` and
/// `FromPyObject`) as well as `Deref`/`DerefMut` for this wrapper.
// NOTE(review): presumably the wrapper exists because the pyo3 traits cannot be
// implemented directly on the foreign `AHashMap` type — confirm.
#[derive(Clone, Debug)]
pub struct PyAHashMap<K, V>(pub AHashMap<K, V>);

impl<K, V> IntoPy<PyObject> for PyAHashMap<K, V>
where
    K: IntoPy<PyObject> + Eq + Hash,
    V: IntoPy<PyObject>,
{
    /// Converts the wrapped map into a new Python `dict`.
    ///
    /// Every key and value is converted with its own `into_py` and inserted
    /// into the dictionary, which is then returned as a `PyObject`.
    ///
    /// # Panics
    ///
    /// Panics if inserting an entry into the dictionary fails. `into_py` is
    /// infallible by signature, so the error cannot be propagated to the caller.
    fn into_py(self, py: Python<'_>) -> PyObject {
        let dict = PyDict::new_bound(py);
        for (k, v) in self.0 {
            dict.set_item(k.into_py(py), v.into_py(py))
                .expect("Failed to set_item on dict");
        }
        dict.into()
    }
}

impl<'source, K, V> FromPyObject<'source> for PyAHashMap<K, V>
where
K: FromPyObject<'source> + Eq + Hash,
V: FromPyObject<'source>,
{
fn extract(ob: &'source PyAny) -> PyResult<Self> {
let dict = ob.downcast::<PyDict>()?;
let mut map = AHashMap::new();
for (k, v) in dict.iter() {
map.insert(K::extract(k)?, V::extract(v)?);
}
Ok(PyAHashMap(map))
}
}

impl<K, V> Deref for PyAHashMap<K, V> {
type Target = AHashMap<K, V>;

fn deref(&self) -> &Self::Target {
&self.0
}
}

impl<K, V> DerefMut for PyAHashMap<K, V> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.0
}
}

impl PyModel {
pub(crate) fn get_as_subtype(&self, py: Python<'_>) -> PyResult<PyObject> {
let base = self.clone();
Expand Down Expand Up @@ -62,6 +112,10 @@ impl Model for PyModel {
self.model.read().unwrap().get_vocab()
}

/// Returns the vocabulary of the wrapped model as an `AHashMap`.
///
/// Takes a read lock on the shared model and forwards the call to it.
/// Panics if the lock is poisoned.
fn get_vocab_ahash(&self) -> AHashMap<String, u32> {
    let model = self.model.read().unwrap();
    model.get_vocab_ahash()
}

/// Returns the vocabulary size reported by the wrapped model.
///
/// Takes a read lock on the shared model and forwards the call to it.
/// Panics if the lock is poisoned.
fn get_vocab_size(&self) -> usize {
    let model = self.model.read().unwrap();
    model.get_vocab_size()
}
Expand Down
3 changes: 3 additions & 0 deletions tokenizers/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,9 @@ fancy-regex = { version = "0.13", optional = true}
getrandom = { version = "0.2.10" }
esaxx-rs = { version = "0.1.10", default-features = false, features=[]}
monostate = "0.1.12"
ahash = { version = "0.8.11", features = ["serde"] }
dary_heap = { version = "0.3.6", features = ["serde"] }
compact_str = { version = "0.8.0", features = ["serde"] }

[features]
default = ["progressbar", "onig", "esaxx_fast"]
Expand Down
24 changes: 17 additions & 7 deletions tokenizers/src/models/bpe/model.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,21 @@ use super::{super::OrderedVocabIter, trainer::BpeTrainer, Error, Pair, Word};
use crate::tokenizer::{Model, Result, Token};
use crate::utils::cache::{Cache, DEFAULT_CACHE_CAPACITY};
use crate::utils::iter::ResultShunt;
use ahash::AHashMap;
use serde_json::Value;
use std::borrow::Cow;

use std::collections::HashMap;
use std::{
collections::HashMap,
fs::File,
io::prelude::*,
io::{BufRead, BufReader},
path::{Path, PathBuf},
};

pub type Vocab = HashMap<String, u32>;
type VocabR = HashMap<u32, String>;
pub type MergeMap = HashMap<Pair, (u32, u32)>;
pub type Vocab = AHashMap<String, u32>;
type VocabR = AHashMap<u32, String>;
pub type MergeMap = AHashMap<Pair, (u32, u32)>;
pub type Merges = Vec<(String, String)>;

struct Config {
Expand All @@ -41,7 +43,7 @@ impl Default for BpeBuilder {
Self {
config: Config {
files: None,
vocab: HashMap::new(),
vocab: AHashMap::new(),
merges: vec![],
cache_capacity: DEFAULT_CACHE_CAPACITY,
dropout: None,
Expand Down Expand Up @@ -324,7 +326,7 @@ impl BPE {
let mut buffer = String::new();
vocab_file.read_to_string(&mut buffer)?;
let json: Value = serde_json::from_str(&buffer)?;
let mut vocab = HashMap::new();
let mut vocab = AHashMap::new();
match json {
Value::Object(m) => {
for (token, id) in m {
Expand Down Expand Up @@ -354,7 +356,11 @@ impl BPE {
}
}

pub fn get_vocab(&self) -> Vocab {
/// Returns a copy of the vocabulary as a standard-library `HashMap`.
///
/// Entries are cloned one by one straight into the result, so no
/// intermediate copy of the whole internal map is built first.
pub fn get_vocab(&self) -> HashMap<String, u32> {
    self.vocab
        .iter()
        .map(|(token, id)| (token.clone(), *id))
        .collect()
}

/// Returns a clone of the internal vocabulary map, keeping its `AHashMap` type.
pub fn get_vocab_ahash(&self) -> AHashMap<String, u32> {
    Clone::clone(&self.vocab)
}

Expand Down Expand Up @@ -481,6 +487,10 @@ impl Model for BPE {
type Trainer = BpeTrainer;

/// Returns a copy of the vocabulary as a standard-library `HashMap`.
///
/// Entries are cloned one by one straight into the result, so no
/// intermediate copy of the whole internal map is built first.
fn get_vocab(&self) -> HashMap<String, u32> {
    self.vocab
        .iter()
        .map(|(token, id)| (token.clone(), *id))
        .collect()
}

/// Returns a clone of the internal vocabulary map, keeping its `AHashMap` type.
fn get_vocab_ahash(&self) -> AHashMap<String, u32> {
    Clone::clone(&self.vocab)
}

Expand Down
4 changes: 2 additions & 2 deletions tokenizers/src/models/bpe/serialization.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
use super::{super::OrderedVocabIter, convert_merges_to_hashmap, BpeBuilder, Pair, BPE};
use ahash::AHashMap;
use serde::{
de::{Error, MapAccess, Visitor},
ser::SerializeStruct,
Deserialize, Deserializer, Serialize, Serializer,
};
use std::collections::HashMap;

impl Serialize for BPE {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
Expand Down Expand Up @@ -80,7 +80,7 @@ impl<'de> Visitor<'de> for BPEVisitor {
V: MapAccess<'de>,
{
let mut builder = BpeBuilder::new();
let mut vocab: Option<HashMap<String, u32>> = None;
let mut vocab: Option<AHashMap<String, u32>> = None;

#[derive(Debug, Deserialize)]
#[serde(untagged)]
Expand Down
Loading