From 53dd8066d62c78d513239838459d9406c5ecd211 Mon Sep 17 00:00:00 2001
From: laurent
Date: Wed, 4 Dec 2024 12:55:13 +0100
Subject: [PATCH 1/7] Add tokenizers support.

---
 yomikomi/Cargo.toml      |  1 +
 yomikomi/src/error.rs    |  3 ++
 yomikomi/src/tokenize.rs | 60 +++++++++++++++++++++++++++++++++-------
 3 files changed, 54 insertions(+), 10 deletions(-)

diff --git a/yomikomi/Cargo.toml b/yomikomi/Cargo.toml
index cbb6860..53619f0 100644
--- a/yomikomi/Cargo.toml
+++ b/yomikomi/Cargo.toml
@@ -15,4 +15,5 @@ sentencepiece = "0.11.2"
 serde_json = "1.0.108"
 symphonia = { version = "0.5.3", features = ["all-codecs"] }
 thiserror = "1.0.50"
+tokenizers = "0.21.0"
 zstd = "0.13.0"
diff --git a/yomikomi/src/error.rs b/yomikomi/src/error.rs
index 2e8de76..caa03d5 100644
--- a/yomikomi/src/error.rs
+++ b/yomikomi/src/error.rs
@@ -46,6 +46,9 @@ pub enum Error {
     #[error(transparent)]
     Io(#[from] std::io::Error),

+    #[error(transparent)]
+    Tokenizers(#[from] tokenizers::tokenizer::Error),
+
     /// Arbitrary errors wrapping.
     #[error(transparent)]
     Wrapped(Box<dyn std::error::Error + Send + Sync>),
diff --git a/yomikomi/src/tokenize.rs b/yomikomi/src/tokenize.rs
index 62e313c..1ae1951 100644
--- a/yomikomi/src/tokenize.rs
+++ b/yomikomi/src/tokenize.rs
@@ -1,9 +1,43 @@
-use crate::{Array, Error, Result, Stream};
+use crate::{Array, Error as E, Result, Stream};
 use sentencepiece::SentencePieceProcessor;
 use std::sync::{Arc, Mutex};
+use tokenizers::tokenizer::Tokenizer;
+
+enum Processor {
+    Tokenizers { inner: Tokenizer, bos_id: Option<u32>, eos_id: Option<u32> },
+    SentencePiece(SentencePieceProcessor),
+}
+
+impl Processor {
+    fn bos_id(&self) -> Option<u32> {
+        match self {
+            Self::SentencePiece(p) => p.bos_id(),
+            Self::Tokenizers { inner: _, bos_id, eos_id: _ } => bos_id.as_ref().copied(),
+        }
+    }
+
+    fn eos_id(&self) -> Option<u32> {
+        match self {
+            Self::SentencePiece(p) => p.eos_id(),
+            Self::Tokenizers { inner: _, bos_id: _, eos_id } => eos_id.as_ref().copied(),
+        }
+    }
+
+    fn encode(&self, str: &str) -> Result<Vec<u32>> {
+        let tokens: Vec<_> = match self {
+            Self::SentencePiece(p) => {
+                p.encode(str).map_err(E::wrap)?.iter().map(|v| v.id).collect()
+            }
+            Self::Tokenizers { inner, bos_id: _, eos_id: _ } => {
+                inner.encode(str, false)?.get_ids().to_vec()
+            }
+        };
+        Ok(tokens)
+    }
+}

 pub struct Tokenize<T> {
-    spp: Arc<SentencePieceProcessor>,
+    processor: Arc<Processor>,
     input: T,
     in_key: String,
     out_key: String,
@@ -23,14 +57,20 @@ impl<T> Tokenize<T> {
         include_bos: bool,
         include_eos: bool,
     ) -> Result<Self> {
-        let spp = SentencePieceProcessor::open(path).map_err(Error::wrap)?;
-        let nl_id = match spp.encode("\n").map_err(Error::wrap)?.last() {
+        let path = path.as_ref();
+        let processor = if path.extension().map_or(false, |v| v == "json") {
+            let inner = Tokenizer::from_file(path)?;
+            Processor::Tokenizers { inner, bos_id: None, eos_id: None }
+        } else {
+            Processor::SentencePiece(SentencePieceProcessor::open(path).map_err(E::wrap)?)
+        };
+        let nl_id = match processor.encode("\n").map_err(E::wrap)?.last() {
             None => crate::bail!("no specific token id for newline"),
-            Some(p) => p.id,
+            Some(p) => *p,
         };
         let tokens_and_chars = if report_bpb { Some(Mutex::new((0, 0))) } else { None };
         Ok(Self {
-            spp: Arc::new(spp),
+            processor: Arc::new(processor),
             input,
             in_key,
             out_key,
@@ -62,7 +102,7 @@ impl<T: Stream> Stream for Tokenize<T> {
         let text = String::from_utf8_lossy(values);
         let mut all_tokens = Vec::new();
         if self.include_bos {
-            if let Some(bos_id) = self.spp.bos_id() {
+            if let Some(bos_id) = self.processor.bos_id() {
                 all_tokens.push(bos_id)
             }
         }
@@ -72,7 +112,7 @@ impl<T: Stream> Stream for Tokenize<T> {
             if idx > 0 {
                 all_tokens.push(self.nl_id)
             }
-            let tokens = match self.spp.encode(text) {
+            let tokens = match self.processor.encode(text) {
                 Ok(tokens) => tokens,
                 Err(err) => {
                     eprintln!("tokenizer encode error {err:?}");
@@ -86,11 +126,11 @@ impl<T: Stream> Stream for Tokenize<T> {
                 bpb = Some(tokens_and_chars.0 as f64 / tokens_and_chars.1 as f64 / f64::ln(2.))
             };
             for token in tokens {
-                all_tokens.push(token.id)
+                all_tokens.push(token)
             }
         }
         if self.include_eos {
-            if let Some(eos_id) = self.spp.eos_id() {
+            if let Some(eos_id) = self.processor.eos_id() {
                 all_tokens.push(eos_id)
             }
         }

From 87ed1e8945e8dc6c6e9bd6bd213afa149ab950b6 Mon Sep 17 00:00:00 2001
From: laurent
Date: Wed, 4 Dec 2024 13:15:46 +0100
Subject: [PATCH 2/7] Clippy fixes.

---
 yomikomi/src/strided_index.rs | 2 +-
 yomikomi/src/tokenize.rs      | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/yomikomi/src/strided_index.rs b/yomikomi/src/strided_index.rs
index d7ae08b..1aff318 100644
--- a/yomikomi/src/strided_index.rs
+++ b/yomikomi/src/strided_index.rs
@@ -27,7 +27,7 @@ impl<'a> StridedIndex<'a> {
     }
 }

-impl<'a> Iterator for StridedIndex<'a> {
+impl Iterator for StridedIndex<'_> {
     type Item = usize;

     fn next(&mut self) -> Option<Self::Item> {
diff --git a/yomikomi/src/tokenize.rs b/yomikomi/src/tokenize.rs
index 1ae1951..069392f 100644
--- a/yomikomi/src/tokenize.rs
+++ b/yomikomi/src/tokenize.rs
@@ -4,7 +4,7 @@ use std::sync::{Arc, Mutex};
 use tokenizers::tokenizer::Tokenizer;

 enum Processor {
-    Tokenizers { inner: Tokenizer, bos_id: Option<u32>, eos_id: Option<u32> },
+    Tokenizers { inner: Box<Tokenizer>, bos_id: Option<u32>, eos_id: Option<u32> },
     SentencePiece(SentencePieceProcessor),
 }

@@ -59,7 +59,7 @@ impl<T> Tokenize<T> {
     ) -> Result<Self> {
         let path = path.as_ref();
         let processor = if path.extension().map_or(false, |v| v == "json") {
-            let inner = Tokenizer::from_file(path)?;
+            let inner = Box::new(Tokenizer::from_file(path)?);
             Processor::Tokenizers { inner, bos_id: None, eos_id: None }
         } else {
             Processor::SentencePiece(SentencePieceProcessor::open(path).map_err(E::wrap)?)

From 58b9fe9d037f173f90b89881a9568c02024c4737 Mon Sep 17 00:00:00 2001
From: laurent
Date: Wed, 4 Dec 2024 13:23:23 +0100
Subject: [PATCH 3/7] Better handling of bos/eos.
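
With the tokenizers backend the processor reports no bos/eos ids of its own
(see the `Self::Tokenizers(_) => None` arms below), so the tokenize operator
now takes explicit `bos_id` / `eos_id` overrides that win over whatever the
underlying processor provides. A minimal sketch of that precedence rule
(illustrative only, not part of the diff below; the helper name is made up):

    /// Explicitly passed ids win; otherwise fall back to the processor's own
    /// defaults (SentencePiece knows them, a tokenizers JSON file yields None here).
    fn resolve_special_id(explicit: Option<u32>, from_processor: Option<u32>) -> Option<u32> {
        explicit.or(from_processor)
    }

This mirrors the `self.bos_id.or_else(|| self.processor.bos_id())` calls added
to yomikomi/src/tokenize.rs in this patch.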

---
 yomikomi-pyo3/src/lib.rs | 11 ++++++++++-
 yomikomi/src/tokenize.rs | 25 ++++++++++++++++---------
 2 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/yomikomi-pyo3/src/lib.rs b/yomikomi-pyo3/src/lib.rs
index f3d740c..59d5f57 100644
--- a/yomikomi-pyo3/src/lib.rs
+++ b/yomikomi-pyo3/src/lib.rs
@@ -214,6 +214,8 @@ struct Tokenize {
     report_bpb: bool,
     include_bos: bool,
     include_eos: bool,
+    bos_id: Option<u32>,
+    eos_id: Option<u32>,
 }

 impl Iterable for Tokenize {
@@ -227,6 +229,8 @@ impl Iterable for Tokenize {
             self.report_bpb,
             self.include_bos,
             self.include_eos,
+            self.bos_id,
+            self.eos_id,
         )
         .map_err(w)?;
         Ok(StreamIter { stream: Box::new(stream) })
@@ -409,7 +413,8 @@ impl YkIterable {

     /// Loads a sentencepiece tokenizer, and use it to tokenize the field passed as an argument of
     /// this function.
-    #[pyo3(signature = (path, *, in_field="text".to_string(), out_field=None, report_bpb=true, include_bos=true, include_eos=false))]
+    #[allow(clippy::too_many_arguments)]
+    #[pyo3(signature = (path, *, in_field="text".to_string(), out_field=None, report_bpb=true, include_bos=true, include_eos=false, bos_id=None, eos_id=None))]
     fn tokenize(
         &self,
         path: std::path::PathBuf,
@@ -418,6 +423,8 @@ impl YkIterable {
         report_bpb: bool,
         include_bos: bool,
         include_eos: bool,
+        bos_id: Option<u32>,
+        eos_id: Option<u32>,
     ) -> PyResult<Self> {
         let out_field = out_field.unwrap_or_else(|| in_field.clone());
         let inner = Tokenize {
@@ -428,6 +435,8 @@ impl YkIterable {
             report_bpb,
             include_bos,
             include_eos,
+            bos_id,
+            eos_id,
         };
         Ok(Self { inner: Arc::new(inner) })
     }
diff --git a/yomikomi/src/tokenize.rs b/yomikomi/src/tokenize.rs
index 069392f..9d9ccc9 100644
--- a/yomikomi/src/tokenize.rs
+++ b/yomikomi/src/tokenize.rs
@@ -4,7 +4,7 @@ use std::sync::{Arc, Mutex};
 use tokenizers::tokenizer::Tokenizer;

 enum Processor {
-    Tokenizers { inner: Box<Tokenizer>, bos_id: Option<u32>, eos_id: Option<u32> },
+    Tokenizers(Box<Tokenizer>),
     SentencePiece(SentencePieceProcessor),
 }

@@ -12,14 +12,14 @@ impl Processor {
     fn bos_id(&self) -> Option<u32> {
         match self {
             Self::SentencePiece(p) => p.bos_id(),
-            Self::Tokenizers { inner: _, bos_id, eos_id: _ } => bos_id.as_ref().copied(),
+            Self::Tokenizers(_) => None,
         }
     }

     fn eos_id(&self) -> Option<u32> {
         match self {
             Self::SentencePiece(p) => p.eos_id(),
-            Self::Tokenizers { inner: _, bos_id: _, eos_id } => eos_id.as_ref().copied(),
+            Self::Tokenizers(_) => None,
         }
     }

@@ -28,9 +28,7 @@ impl Processor {
             Self::SentencePiece(p) => {
                 p.encode(str).map_err(E::wrap)?.iter().map(|v| v.id).collect()
             }
-            Self::Tokenizers { inner, bos_id: _, eos_id: _ } => {
-                inner.encode(str, false)?.get_ids().to_vec()
-            }
+            Self::Tokenizers(p) => p.encode(str, false)?.get_ids().to_vec(),
         };
         Ok(tokens)
     }
@@ -45,9 +43,12 @@ pub struct Tokenize<T> {
     tokens_and_chars: Option<Mutex<(usize, usize)>>,
     include_bos: bool,
     include_eos: bool,
+    bos_id: Option<u32>,
+    eos_id: Option<u32>,
 }

 impl<T> Tokenize<T> {
+    #[allow(clippy::too_many_arguments)]
     pub fn new<P: AsRef<std::path::Path>>(
         path: P,
         input: T,
@@ -56,11 +57,13 @@ impl<T> Tokenize<T> {
         report_bpb: bool,
         include_bos: bool,
         include_eos: bool,
+        bos_id: Option<u32>,
+        eos_id: Option<u32>,
     ) -> Result<Self> {
         let path = path.as_ref();
         let processor = if path.extension().map_or(false, |v| v == "json") {
             let inner = Box::new(Tokenizer::from_file(path)?);
-            Processor::Tokenizers { inner, bos_id: None, eos_id: None }
+            Processor::Tokenizers(inner)
         } else {
             Processor::SentencePiece(SentencePieceProcessor::open(path).map_err(E::wrap)?)
         };
@@ -78,6 +81,8 @@ impl<T> Tokenize<T> {
             tokens_and_chars,
             include_bos,
             include_eos,
+            bos_id,
+            eos_id,
         })
     }
 }
@@ -102,7 +107,8 @@ impl<T: Stream> Stream for Tokenize<T> {
         let text = String::from_utf8_lossy(values);
         let mut all_tokens = Vec::new();
         if self.include_bos {
-            if let Some(bos_id) = self.processor.bos_id() {
+            let bos_id = self.bos_id.or_else(|| self.processor.bos_id());
+            if let Some(bos_id) = bos_id {
                 all_tokens.push(bos_id)
             }
         }
@@ -130,7 +136,8 @@ impl<T: Stream> Stream for Tokenize<T> {
             }
         }
         if self.include_eos {
-            if let Some(eos_id) = self.processor.eos_id() {
+            let eos_id = self.eos_id.or_else(|| self.processor.eos_id());
+            if let Some(eos_id) = eos_id {
                 all_tokens.push(eos_id)
             }
         }

From cfd7b75ab2b629678aa49c2d43e8b35093ca8d6b Mon Sep 17 00:00:00 2001
From: laurent
Date: Wed, 4 Dec 2024 13:28:12 +0100
Subject: [PATCH 4/7] Fix the CI.

---
 .github/workflows/ykpy-ci.yml              |  2 +-
 Cargo.toml                                 |  2 +-
 yomikomi-pyo3/py_src/yomikomi/__init__.pyi | 13 ++++++++++++-
 3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ykpy-ci.yml b/.github/workflows/ykpy-ci.yml
index c905d16..2edce21 100644
--- a/.github/workflows/ykpy-ci.yml
+++ b/.github/workflows/ykpy-ci.yml
@@ -23,7 +23,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        target: [x86_64, x86, aarch64, armv7, s390x, ppc64le]
+        target: [x86_64, x86, aarch64, armv7]
     steps:
       - uses: actions/checkout@v3
       - uses: actions/setup-python@v4
diff --git a/Cargo.toml b/Cargo.toml
index 87312dd..b0b6bea 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,7 +6,7 @@ members = [
 resolver = "2"

 [workspace.package]
-version = "0.2.0"
+version = "0.3.0"
 edition = "2021"
 description = "Dataloader for training large text models."
 repository = "https://github.com/kyutai-labs/yomikomi"
diff --git a/yomikomi-pyo3/py_src/yomikomi/__init__.pyi b/yomikomi-pyo3/py_src/yomikomi/__init__.pyi
index 19792a9..c3dfff8 100644
--- a/yomikomi-pyo3/py_src/yomikomi/__init__.pyi
+++ b/yomikomi-pyo3/py_src/yomikomi/__init__.pyi
@@ -103,7 +103,18 @@ class YkIterable:
         """ """
         pass

-    def tokenize(self, path, *, in_field=..., out_field=None, report_bpb=True, include_bos=True, include_eos=False):
+    def tokenize(
+        self,
+        path,
+        *,
+        in_field=...,
+        out_field=None,
+        report_bpb=True,
+        include_bos=True,
+        include_eos=False,
+        bos_id=None,
+        eos_id=None
+    ):
         """
         Loads a sentencepiece tokenizer, and use it to tokenize the field passed as an argument of
         this function.

From 098a7f11ccabf9cb1c36ce7c830015801594bc82 Mon Sep 17 00:00:00 2001
From: laurent
Date: Wed, 4 Dec 2024 13:48:44 +0100
Subject: [PATCH 5/7] Also bump the dependency version.

---
 yomikomi-pyo3/Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/yomikomi-pyo3/Cargo.toml b/yomikomi-pyo3/Cargo.toml
index 0311dbb..4ff11c7 100644
--- a/yomikomi-pyo3/Cargo.toml
+++ b/yomikomi-pyo3/Cargo.toml
@@ -15,4 +15,4 @@ crate-type = ["cdylib"]
 [dependencies]
 numpy = "0.22.0"
 pyo3 = "0.22.0"
-yomikomi = { path = "../yomikomi", version = "0.2.0" }
+yomikomi = { path = "../yomikomi", version = "0.3.0" }

From a17086c8301e3c4b34e07ec6edaaa6343b91b95f Mon Sep 17 00:00:00 2001
From: laurent
Date: Wed, 4 Dec 2024 13:52:51 +0100
Subject: [PATCH 6/7] Another CI tweak.

---
 .github/workflows/ykpy-ci.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ykpy-ci.yml b/.github/workflows/ykpy-ci.yml
index 2edce21..96ef8ac 100644
--- a/.github/workflows/ykpy-ci.yml
+++ b/.github/workflows/ykpy-ci.yml
@@ -23,7 +23,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        target: [x86_64, x86, aarch64, armv7]
+        target: [x86_64, x86, armv7]
     steps:
       - uses: actions/checkout@v3
       - uses: actions/setup-python@v4
@@ -69,7 +69,7 @@ jobs:
     runs-on: macos-latest
     strategy:
       matrix:
-        target: [x86_64, aarch64]
+        target: [x86_64]
     steps:
       - uses: actions/checkout@v3
       - uses: actions/setup-python@v4

From d96c898ac2559c2f102bdcfe936f6f43b770ad02 Mon Sep 17 00:00:00 2001
From: laurent
Date: Wed, 4 Dec 2024 13:57:46 +0100
Subject: [PATCH 7/7] Ok let's be x86 only for now...

---
 .github/workflows/ykpy-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ykpy-ci.yml b/.github/workflows/ykpy-ci.yml
index 96ef8ac..6dc4f60 100644
--- a/.github/workflows/ykpy-ci.yml
+++ b/.github/workflows/ykpy-ci.yml
@@ -23,7 +23,7 @@ jobs:
     runs-on: ubuntu-latest
    strategy:
       matrix:
-        target: [x86_64, x86, armv7]
+        target: [x86_64, x86]
     steps:
       - uses: actions/checkout@v3
       - uses: actions/setup-python@v4
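
Taken together, the series lets `tokenize` load either a SentencePiece model or
a Hugging Face tokenizers `tokenizer.json`, selected purely by file extension.
A small self-contained sketch of that dispatch rule (illustrative only; it
mirrors the extension check in `Tokenize::new` above, with a made-up helper
name):

    use std::path::Path;

    /// Files ending in `.json` go through the `tokenizers` crate,
    /// everything else through SentencePiece.
    fn uses_tokenizers_crate(path: &Path) -> bool {
        path.extension().map_or(false, |ext| ext == "json")
    }

    fn main() {
        assert!(uses_tokenizers_crate(Path::new("tokenizer.json")));
        assert!(!uses_tokenizers_crate(Path::new("spm.model")));
    }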