diff --git a/.github/workflows/ykpy-ci.yml b/.github/workflows/ykpy-ci.yml
index c905d16..6dc4f60 100644
--- a/.github/workflows/ykpy-ci.yml
+++ b/.github/workflows/ykpy-ci.yml
@@ -23,7 +23,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        target: [x86_64, x86, aarch64, armv7, s390x, ppc64le]
+        target: [x86_64, x86]
     steps:
       - uses: actions/checkout@v3
       - uses: actions/setup-python@v4
@@ -69,7 +69,7 @@ jobs:
     runs-on: macos-latest
     strategy:
       matrix:
-        target: [x86_64, aarch64]
+        target: [x86_64]
     steps:
       - uses: actions/checkout@v3
       - uses: actions/setup-python@v4
diff --git a/Cargo.toml b/Cargo.toml
index 87312dd..b0b6bea 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,7 +6,7 @@ members = [
 resolver = "2"
 
 [workspace.package]
-version = "0.2.0"
+version = "0.3.0"
 edition = "2021"
 description = "Dataloader for training large text models."
 repository = "https://github.com/kyutai-labs/yomikomi"
diff --git a/yomikomi-pyo3/Cargo.toml b/yomikomi-pyo3/Cargo.toml
index 0311dbb..4ff11c7 100644
--- a/yomikomi-pyo3/Cargo.toml
+++ b/yomikomi-pyo3/Cargo.toml
@@ -15,4 +15,4 @@ crate-type = ["cdylib"]
 [dependencies]
 numpy = "0.22.0"
 pyo3 = "0.22.0"
-yomikomi = { path = "../yomikomi", version = "0.2.0" }
+yomikomi = { path = "../yomikomi", version = "0.3.0" }
diff --git a/yomikomi-pyo3/py_src/yomikomi/__init__.pyi b/yomikomi-pyo3/py_src/yomikomi/__init__.pyi
index 19792a9..c3dfff8 100644
--- a/yomikomi-pyo3/py_src/yomikomi/__init__.pyi
+++ b/yomikomi-pyo3/py_src/yomikomi/__init__.pyi
@@ -103,7 +103,18 @@ class YkIterable:
         """ """
         pass
 
-    def tokenize(self, path, *, in_field=..., out_field=None, report_bpb=True, include_bos=True, include_eos=False):
+    def tokenize(
+        self,
+        path,
+        *,
+        in_field=...,
+        out_field=None,
+        report_bpb=True,
+        include_bos=True,
+        include_eos=False,
+        bos_id=None,
+        eos_id=None
+    ):
         """
         Loads a sentencepiece tokenizer, and use it to tokenize the field passed as an argument of
         this function.
diff --git a/yomikomi-pyo3/src/lib.rs b/yomikomi-pyo3/src/lib.rs
index f3d740c..59d5f57 100644
--- a/yomikomi-pyo3/src/lib.rs
+++ b/yomikomi-pyo3/src/lib.rs
@@ -214,6 +214,8 @@ struct Tokenize {
     report_bpb: bool,
     include_bos: bool,
     include_eos: bool,
+    bos_id: Option<u32>,
+    eos_id: Option<u32>,
 }
 
 impl Iterable for Tokenize {
@@ -227,6 +229,8 @@ impl Iterable for Tokenize {
             self.report_bpb,
             self.include_bos,
             self.include_eos,
+            self.bos_id,
+            self.eos_id,
         )
         .map_err(w)?;
         Ok(StreamIter { stream: Box::new(stream) })
@@ -409,7 +413,8 @@ impl YkIterable {
 
     /// Loads a sentencepiece tokenizer, and use it to tokenize the field passed as an argument of
     /// this function.
-    #[pyo3(signature = (path, *, in_field="text".to_string(), out_field=None, report_bpb=true, include_bos=true, include_eos=false))]
+    #[allow(clippy::too_many_arguments)]
+    #[pyo3(signature = (path, *, in_field="text".to_string(), out_field=None, report_bpb=true, include_bos=true, include_eos=false, bos_id=None, eos_id=None))]
     fn tokenize(
         &self,
         path: std::path::PathBuf,
@@ -418,6 +423,8 @@ impl YkIterable {
         report_bpb: bool,
         include_bos: bool,
         include_eos: bool,
+        bos_id: Option<u32>,
+        eos_id: Option<u32>,
     ) -> PyResult<Self> {
         let out_field = out_field.unwrap_or_else(|| in_field.clone());
         let inner = Tokenize {
@@ -428,6 +435,8 @@ impl YkIterable {
             report_bpb,
             include_bos,
             include_eos,
+            bos_id,
+            eos_id,
         };
         Ok(Self { inner: Arc::new(inner) })
     }
diff --git a/yomikomi/Cargo.toml b/yomikomi/Cargo.toml
index cbb6860..53619f0 100644
--- a/yomikomi/Cargo.toml
+++ b/yomikomi/Cargo.toml
@@ -15,4 +15,5 @@ sentencepiece = "0.11.2"
 serde_json = "1.0.108"
 symphonia = { version = "0.5.3", features = ["all-codecs"] }
 thiserror = "1.0.50"
+tokenizers = "0.21.0"
 zstd = "0.13.0"
diff --git a/yomikomi/src/error.rs b/yomikomi/src/error.rs
index 2e8de76..caa03d5 100644
--- a/yomikomi/src/error.rs
+++ b/yomikomi/src/error.rs
@@ -46,6 +46,9 @@ pub enum Error {
     #[error(transparent)]
     Io(#[from] std::io::Error),
 
+    #[error(transparent)]
+    Tokenizers(#[from] tokenizers::tokenizer::Error),
+
     /// Arbitrary errors wrapping.
     #[error(transparent)]
     Wrapped(Box<dyn std::error::Error + Send + Sync>),
diff --git a/yomikomi/src/strided_index.rs b/yomikomi/src/strided_index.rs
index d7ae08b..1aff318 100644
--- a/yomikomi/src/strided_index.rs
+++ b/yomikomi/src/strided_index.rs
@@ -27,7 +27,7 @@ impl<'a> StridedIndex<'a> {
     }
 }
 
-impl<'a> Iterator for StridedIndex<'a> {
+impl Iterator for StridedIndex<'_> {
     type Item = usize;
 
     fn next(&mut self) -> Option<Self::Item> {
diff --git a/yomikomi/src/tokenize.rs b/yomikomi/src/tokenize.rs
index 62e313c..9d9ccc9 100644
--- a/yomikomi/src/tokenize.rs
+++ b/yomikomi/src/tokenize.rs
@@ -1,9 +1,41 @@
-use crate::{Array, Error, Result, Stream};
+use crate::{Array, Error as E, Result, Stream};
 use sentencepiece::SentencePieceProcessor;
 use std::sync::{Arc, Mutex};
+use tokenizers::tokenizer::Tokenizer;
+
+enum Processor {
+    Tokenizers(Box<Tokenizer>),
+    SentencePiece(SentencePieceProcessor),
+}
+
+impl Processor {
+    fn bos_id(&self) -> Option<u32> {
+        match self {
+            Self::SentencePiece(p) => p.bos_id(),
+            Self::Tokenizers(_) => None,
+        }
+    }
+
+    fn eos_id(&self) -> Option<u32> {
+        match self {
+            Self::SentencePiece(p) => p.eos_id(),
+            Self::Tokenizers(_) => None,
+        }
+    }
+
+    fn encode(&self, str: &str) -> Result<Vec<u32>> {
+        let tokens: Vec<_> = match self {
+            Self::SentencePiece(p) => {
+                p.encode(str).map_err(E::wrap)?.iter().map(|v| v.id).collect()
+            }
+            Self::Tokenizers(p) => p.encode(str, false)?.get_ids().to_vec(),
+        };
+        Ok(tokens)
+    }
+}
 
 pub struct Tokenize<T> {
-    spp: Arc<SentencePieceProcessor>,
+    processor: Arc<Processor>,
     input: T,
     in_key: String,
     out_key: String,
@@ -11,9 +43,12 @@ pub struct Tokenize<T> {
     tokens_and_chars: Option<Mutex<(usize, usize)>>,
     include_bos: bool,
     include_eos: bool,
+    bos_id: Option<u32>,
+    eos_id: Option<u32>,
 }
 
 impl<T> Tokenize<T> {
+    #[allow(clippy::too_many_arguments)]
     pub fn new<P: AsRef<std::path::Path>>(
         path: P,
         input: T,
@@ -22,15 +57,23 @@ impl<T> Tokenize<T> {
         report_bpb: bool,
         include_bos: bool,
         include_eos: bool,
+        bos_id: Option<u32>,
+        eos_id: Option<u32>,
     ) -> Result<Self> {
-        let spp = SentencePieceProcessor::open(path).map_err(Error::wrap)?;
-        let nl_id = match spp.encode("\n").map_err(Error::wrap)?.last() {
+        let path = path.as_ref();
+        let processor = if path.extension().map_or(false, |v| v == "json") {
+            let inner = Box::new(Tokenizer::from_file(path)?);
+            Processor::Tokenizers(inner)
+        } else {
+            Processor::SentencePiece(SentencePieceProcessor::open(path).map_err(E::wrap)?)
+        };
+        let nl_id = match processor.encode("\n").map_err(E::wrap)?.last() {
             None => crate::bail!("no specific token id for newline"),
-            Some(p) => p.id,
+            Some(p) => *p,
         };
         let tokens_and_chars = if report_bpb { Some(Mutex::new((0, 0))) } else { None };
         Ok(Self {
-            spp: Arc::new(spp),
+            processor: Arc::new(processor),
             input,
             in_key,
             out_key,
@@ -38,6 +81,8 @@ impl<T> Tokenize<T> {
             tokens_and_chars,
             include_bos,
             include_eos,
+            bos_id,
+            eos_id,
         })
     }
 }
@@ -62,7 +107,8 @@ impl<T: Stream> Stream for Tokenize<T> {
         let text = String::from_utf8_lossy(values);
         let mut all_tokens = Vec::new();
         if self.include_bos {
-            if let Some(bos_id) = self.spp.bos_id() {
+            let bos_id = self.bos_id.or_else(|| self.processor.bos_id());
+            if let Some(bos_id) = bos_id {
                 all_tokens.push(bos_id)
             }
         }
@@ -72,7 +118,7 @@ impl<T: Stream> Stream for Tokenize<T> {
             if idx > 0 {
                 all_tokens.push(self.nl_id)
             }
-            let tokens = match self.spp.encode(text) {
+            let tokens = match self.processor.encode(text) {
                 Ok(tokens) => tokens,
                 Err(err) => {
                     eprintln!("tokenizer encode error {err:?}");
@@ -86,11 +132,12 @@ impl<T: Stream> Stream for Tokenize<T> {
                     bpb = Some(tokens_and_chars.0 as f64 / tokens_and_chars.1 as f64 / f64::ln(2.))
                 };
                 for token in tokens {
-                    all_tokens.push(token.id)
+                    all_tokens.push(token)
                 }
             }
             if self.include_eos {
-                if let Some(eos_id) = self.spp.eos_id() {
+                let eos_id = self.eos_id.or_else(|| self.processor.eos_id());
+                if let Some(eos_id) = eos_id {
                     all_tokens.push(eos_id)
                 }
             }
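
Usage sketch (illustrative, not part of the patch): the call below exercises the extended Python signature from __init__.pyi. The `ds` pipeline, file name, field names, and token ids are placeholder assumptions; only the keyword arguments come from the new stub. A path ending in .json routes through the Hugging Face `tokenizers` backend added here, any other path still loads a sentencepiece model, and bos_id/eos_id supply or override the special-token ids (the tokenizers backend reports none of its own).

    # Assumes `ds` is an existing yomikomi.YkIterable whose samples carry a
    # "text" field; how `ds` is built is outside the scope of this diff.
    tokenized = ds.tokenize(
        "tokenizer.json",   # .json path -> Hugging Face tokenizers backend
        in_field="text",
        out_field="tokens",
        report_bpb=True,
        include_bos=True,
        include_eos=False,
        # Explicit ids are placeholders; sentencepiece models would fall back
        # to their built-in BOS/EOS ids when these are left as None.
        bos_id=1,
        eos_id=2,
    )
    for sample in tokenized:
        print(sample["tokens"][:16])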