diff --git a/.github/workflows/ykpy-ci.yml b/.github/workflows/ykpy-ci.yml index c905d16..2edce21 100644 --- a/.github/workflows/ykpy-ci.yml +++ b/.github/workflows/ykpy-ci.yml @@ -23,7 +23,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - target: [x86_64, x86, aarch64, armv7, s390x, ppc64le] + target: [x86_64, x86, aarch64, armv7] steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 diff --git a/Cargo.toml b/Cargo.toml index 87312dd..b0b6bea 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,7 +6,7 @@ members = [ resolver = "2" [workspace.package] -version = "0.2.0" +version = "0.3.0" edition = "2021" description = "Dataloader for training large text models." repository = "https://github.com/kyutai-labs/yomikomi" diff --git a/yomikomi-pyo3/py_src/yomikomi/__init__.pyi b/yomikomi-pyo3/py_src/yomikomi/__init__.pyi index 19792a9..c3dfff8 100644 --- a/yomikomi-pyo3/py_src/yomikomi/__init__.pyi +++ b/yomikomi-pyo3/py_src/yomikomi/__init__.pyi @@ -103,7 +103,18 @@ class YkIterable: """ """ pass - def tokenize(self, path, *, in_field=..., out_field=None, report_bpb=True, include_bos=True, include_eos=False): + def tokenize( + self, + path, + *, + in_field=..., + out_field=None, + report_bpb=True, + include_bos=True, + include_eos=False, + bos_id=None, + eos_id=None + ): """ Loads a sentencepiece tokenizer, and use it to tokenize the field passed as an argument of this function.