diff --git a/.github/workflows/ykpy-ci.yml b/.github/workflows/ykpy-ci.yml
index c905d16..6dc4f60 100644
--- a/.github/workflows/ykpy-ci.yml
+++ b/.github/workflows/ykpy-ci.yml
@@ -23,7 +23,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        target: [x86_64, x86, aarch64, armv7, s390x, ppc64le]
+        target: [x86_64, x86]
     steps:
       - uses: actions/checkout@v3
       - uses: actions/setup-python@v4
@@ -69,7 +69,7 @@ jobs:
     runs-on: macos-latest
     strategy:
       matrix:
-        target: [x86_64, aarch64]
+        target: [x86_64]
     steps:
       - uses: actions/checkout@v3
       - uses: actions/setup-python@v4
diff --git a/Cargo.toml b/Cargo.toml
index 87312dd..b0b6bea 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,7 +6,7 @@ members = [
 resolver = "2"
 
 [workspace.package]
-version = "0.2.0"
+version = "0.3.0"
 edition = "2021"
 description = "Dataloader for training large text models."
 repository = "https://github.com/kyutai-labs/yomikomi"
diff --git a/yomikomi-pyo3/Cargo.toml b/yomikomi-pyo3/Cargo.toml
index 0311dbb..4ff11c7 100644
--- a/yomikomi-pyo3/Cargo.toml
+++ b/yomikomi-pyo3/Cargo.toml
@@ -15,4 +15,4 @@ crate-type = ["cdylib"]
 [dependencies]
 numpy = "0.22.0"
 pyo3 = "0.22.0"
-yomikomi = { path = "../yomikomi", version = "0.2.0" }
+yomikomi = { path = "../yomikomi", version = "0.3.0" }
diff --git a/yomikomi-pyo3/py_src/yomikomi/__init__.pyi b/yomikomi-pyo3/py_src/yomikomi/__init__.pyi
index 19792a9..c3dfff8 100644
--- a/yomikomi-pyo3/py_src/yomikomi/__init__.pyi
+++ b/yomikomi-pyo3/py_src/yomikomi/__init__.pyi
@@ -103,7 +103,18 @@ class YkIterable:
         """ """
         pass
 
-    def tokenize(self, path, *, in_field=..., out_field=None, report_bpb=True, include_bos=True, include_eos=False):
+    def tokenize(
+        self,
+        path,
+        *,
+        in_field=...,
+        out_field=None,
+        report_bpb=True,
+        include_bos=True,
+        include_eos=False,
+        bos_id=None,
+        eos_id=None
+    ):
         """
         Loads a sentencepiece tokenizer, and use it to tokenize the field passed as an argument of
         this function.
diff --git a/yomikomi-pyo3/src/lib.rs b/yomikomi-pyo3/src/lib.rs
index f3d740c..59d5f57 100644
--- a/yomikomi-pyo3/src/lib.rs
+++ b/yomikomi-pyo3/src/lib.rs
@@ -214,6 +214,8 @@ struct Tokenize {
     report_bpb: bool,
     include_bos: bool,
     include_eos: bool,
+    bos_id: Option<u32>,
+    eos_id: Option<u32>,
 }
 
 impl Iterable for Tokenize {
@@ -227,6 +229,8 @@ impl Iterable for Tokenize {
             self.report_bpb,
             self.include_bos,
             self.include_eos,
+            self.bos_id,
+            self.eos_id,
         )
         .map_err(w)?;
         Ok(StreamIter { stream: Box::new(stream) })
@@ -409,7 +413,8 @@ impl YkIterable {
 
     /// Loads a sentencepiece tokenizer, and use it to tokenize the field passed as an argument of
     /// this function.
-    #[pyo3(signature = (path, *, in_field="text".to_string(), out_field=None, report_bpb=true, include_bos=true, include_eos=false))]
+    #[allow(clippy::too_many_arguments)]
+    #[pyo3(signature = (path, *, in_field="text".to_string(), out_field=None, report_bpb=true, include_bos=true, include_eos=false, bos_id=None, eos_id=None))]
     fn tokenize(
         &self,
         path: std::path::PathBuf,
@@ -418,6 +423,8 @@ impl YkIterable {
         report_bpb: bool,
         include_bos: bool,
         include_eos: bool,
+        bos_id: Option<u32>,
+        eos_id: Option<u32>,
     ) -> PyResult<Self> {
         let out_field = out_field.unwrap_or_else(|| in_field.clone());
         let inner = Tokenize {
@@ -428,6 +435,8 @@ impl YkIterable {
             report_bpb,
             include_bos,
             include_eos,
+            bos_id,
+            eos_id,
         };
         Ok(Self { inner: Arc::new(inner) })
     }
diff --git a/yomikomi/Cargo.toml b/yomikomi/Cargo.toml
index cbb6860..53619f0 100644
--- a/yomikomi/Cargo.toml
+++ b/yomikomi/Cargo.toml
@@ -15,4 +15,5 @@ sentencepiece = "0.11.2"
 serde_json = "1.0.108"
 symphonia = { version = "0.5.3", features = ["all-codecs"] }
 thiserror = "1.0.50"
+tokenizers = "0.21.0"
 zstd = "0.13.0"
diff --git a/yomikomi/src/error.rs b/yomikomi/src/error.rs
index 2e8de76..caa03d5 100644
--- a/yomikomi/src/error.rs
+++ b/yomikomi/src/error.rs
@@ -46,6 +46,9 @@ pub enum Error {
     #[error(transparent)]
     Io(#[from] std::io::Error),
 
+    #[error(transparent)]
+    Tokenizers(#[from] tokenizers::tokenizer::Error),
+
     /// Arbitrary errors wrapping.
     #[error(transparent)]
     Wrapped(Box<dyn std::error::Error + Send + Sync>),
diff --git a/yomikomi/src/strided_index.rs b/yomikomi/src/strided_index.rs
index d7ae08b..1aff318 100644
--- a/yomikomi/src/strided_index.rs
+++ b/yomikomi/src/strided_index.rs
@@ -27,7 +27,7 @@ impl<'a> StridedIndex<'a> {
     }
 }
 
-impl<'a> Iterator for StridedIndex<'a> {
+impl Iterator for StridedIndex<'_> {
     type Item = usize;
 
     fn next(&mut self) -> Option<Self::Item> {
diff --git a/yomikomi/src/tokenize.rs b/yomikomi/src/tokenize.rs
index 62e313c..9d9ccc9 100644
--- a/yomikomi/src/tokenize.rs
+++ b/yomikomi/src/tokenize.rs
@@ -1,9 +1,41 @@
-use crate::{Array, Error, Result, Stream};
+use crate::{Array, Error as E, Result, Stream};
 use sentencepiece::SentencePieceProcessor;
 use std::sync::{Arc, Mutex};
+use tokenizers::tokenizer::Tokenizer;
+
+enum Processor {
+    Tokenizers(Box<Tokenizer>),
+    SentencePiece(SentencePieceProcessor),
+}
+
+impl Processor {
+    fn bos_id(&self) -> Option<u32> {
+        match self {
+            Self::SentencePiece(p) => p.bos_id(),
+            Self::Tokenizers(_) => None,
+        }
+    }
+
+    fn eos_id(&self) -> Option<u32> {
+        match self {
+            Self::SentencePiece(p) => p.eos_id(),
+            Self::Tokenizers(_) => None,
+        }
+    }
+
+    fn encode(&self, str: &str) -> Result<Vec<u32>> {
+        let tokens: Vec<_> = match self {
+            Self::SentencePiece(p) => {
+                p.encode(str).map_err(E::wrap)?.iter().map(|v| v.id).collect()
+            }
+            Self::Tokenizers(p) => p.encode(str, false)?.get_ids().to_vec(),
+        };
+        Ok(tokens)
+    }
+}
 
 pub struct Tokenize<T> {
-    spp: Arc<SentencePieceProcessor>,
+    processor: Arc<Processor>,
     input: T,
     in_key: String,
     out_key: String,
@@ -11,9 +43,12 @@ pub struct Tokenize<T> {
     tokens_and_chars: Option<Mutex<(usize, usize)>>,
     include_bos: bool,
     include_eos: bool,
+    bos_id: Option<u32>,
+    eos_id: Option<u32>,
 }
 
 impl<T> Tokenize<T> {
+    #[allow(clippy::too_many_arguments)]
     pub fn new<P: AsRef<std::path::Path>>(
         path: P,
         input: T,
@@ -22,15 +57,23 @@ impl<T> Tokenize<T> {
         report_bpb: bool,
         include_bos: bool,
         include_eos: bool,
+        bos_id: Option<u32>,
+        eos_id: Option<u32>,
     ) -> Result<Self> {
-        let spp = SentencePieceProcessor::open(path).map_err(Error::wrap)?;
-        let nl_id = match spp.encode("\n").map_err(Error::wrap)?.last() {
+        let path = path.as_ref();
+        let processor = if path.extension().map_or(false, |v| v == "json") {
+            let inner = Box::new(Tokenizer::from_file(path)?);
+            Processor::Tokenizers(inner)
+        } else {
+            Processor::SentencePiece(SentencePieceProcessor::open(path).map_err(E::wrap)?)
+        };
+        let nl_id = match processor.encode("\n").map_err(E::wrap)?.last() {
             None => crate::bail!("no specific token id for newline"),
-            Some(p) => p.id,
+            Some(p) => *p,
         };
         let tokens_and_chars = if report_bpb { Some(Mutex::new((0, 0))) } else { None };
         Ok(Self {
-            spp: Arc::new(spp),
+            processor: Arc::new(processor),
             input,
             in_key,
             out_key,
@@ -38,6 +81,8 @@ impl<T> Tokenize<T> {
             tokens_and_chars,
             include_bos,
             include_eos,
+            bos_id,
+            eos_id,
         })
     }
 }
@@ -62,7 +107,8 @@ impl<T: Stream> Stream for Tokenize<T> {
         let text = String::from_utf8_lossy(values);
         let mut all_tokens = Vec::new();
         if self.include_bos {
-            if let Some(bos_id) = self.spp.bos_id() {
+            let bos_id = self.bos_id.or_else(|| self.processor.bos_id());
+            if let Some(bos_id) = bos_id {
                 all_tokens.push(bos_id)
             }
         }
@@ -72,7 +118,7 @@ impl<T: Stream> Stream for Tokenize<T> {
             if idx > 0 {
                 all_tokens.push(self.nl_id)
             }
-            let tokens = match self.spp.encode(text) {
+            let tokens = match self.processor.encode(text) {
                 Ok(tokens) => tokens,
                 Err(err) => {
                     eprintln!("tokenizer encode error {err:?}");
@@ -86,11 +132,12 @@ impl<T: Stream> Stream for Tokenize<T> {
                     bpb = Some(tokens_and_chars.0 as f64 / tokens_and_chars.1 as f64 / f64::ln(2.))
                 };
                 for token in tokens {
-                    all_tokens.push(token.id)
+                    all_tokens.push(token)
                 }
             }
             if self.include_eos {
-                if let Some(eos_id) = self.spp.eos_id() {
+                let eos_id = self.eos_id.or_else(|| self.processor.eos_id());
+                if let Some(eos_id) = eos_id {
                     all_tokens.push(eos_id)
                 }
             }
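
Usage sketch (illustrative, not part of the patch): the call below exercises the extended Python signature from __init__.pyi. The `ds` pipeline, file name, field names, and token ids are placeholder assumptions; only the keyword arguments come from the new stub. A path ending in .json routes through the Hugging Face `tokenizers` backend added here, any other path still loads a sentencepiece model, and bos_id/eos_id supply or override the special-token ids (the tokenizers backend reports none of its own).

    # Assumes `ds` is an existing yomikomi.YkIterable whose samples carry a
    # "text" field; how `ds` is built is outside the scope of this diff.
    tokenized = ds.tokenize(
        "tokenizer.json",   # .json path -> Hugging Face tokenizers backend
        in_field="text",
        out_field="tokens",
        report_bpb=True,
        include_bos=True,
        include_eos=False,
        # Explicit ids are placeholders; sentencepiece models would fall back
        # to their built-in BOS/EOS ids when these are left as None.
        bos_id=1,
        eos_id=2,
    )
    for sample in tokenized:
        print(sample["tokens"][:16])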