diff --git a/CHANGELOG.md b/CHANGELOG.md index 7369aa7e2..ba363cd07 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +### Fixed + +- Fixed default value of `--tokenizer` argument to `scripts/prepare_tulu_data.py` to be an absolute path rather than a relative path, so that the script can be run from other directories. + ## [v0.2.4](https://github.com/allenai/OLMo/releases/tag/v0.2.4) - 2024-02-02 ### Fixed diff --git a/scripts/prepare_tulu_data.py b/scripts/prepare_tulu_data.py index 4eba35945..7994b406f 100644 --- a/scripts/prepare_tulu_data.py +++ b/scripts/prepare_tulu_data.py @@ -116,7 +116,7 @@ def get_parser() -> ArgumentParser: "--tokenizer", type=str, help="""Tokenizer path or identifier.""", - default="tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json", + default=Path(__file__).parent / "tokenizers" / "allenai_eleuther-ai-gpt-neox-20b-pii-special.json", ) parser.add_argument("-s", "--seq-len", type=int, help="""Max sequence length.""", default=2048) parser.add_argument("--eos", type=int, help="""EOS token ID.""", default=50279)