Skip to content

Commit

Permalink
Integrate huggingface tokenizers. (2nd PR) (#11)
Browse files Browse the repository at this point in the history
* Moving tokenizer files to a separate folder.

* Update code for downloading a tokenizer.

* Integrate huggingface tokenizers.
  • Loading branch information
Knarik1 authored Aug 29, 2024
1 parent 5cd10c7 commit 17d7d0a
Show file tree
Hide file tree
Showing 19 changed files with 101,621 additions and 103 deletions.
7 changes: 2 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,8 @@ Once you have confirmed access, you can run the following command to download th
```bash
# Get your HF token from https://huggingface.co/settings/tokens

# llama3 or 3.1 tokenizer.model
python torchtitan/datasets/download_tokenizer.py --repo_id meta-llama/Meta-Llama-3-8B --tokenizer_path "original" --hf_token=...

# llama2 tokenizer.model
python torchtitan/datasets/download_tokenizer.py --repo_id meta-llama/Llama-2-13b-hf --hf_token=...
# chemlactica-125m
python torchtitan/tokenizers/download_tokenizer.py --repo_id yerevann/chemlactica-125m
```

### Start a training run
Expand Down
2 changes: 1 addition & 1 deletion estimation.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from torch.testing._internal.distributed.fake_pg import FakeStore

from torchtitan.config_manager import JobConfig
from torchtitan.datasets import build_tokenizer
from torchtitan.tokenizers.tokenizer import build_tokenizer
from torchtitan.float8 import Float8Handler
from torchtitan.logging import init_logger, logger
from torchtitan.models import model_name_to_cls, model_name_to_tokenizer, models_config
Expand Down
4 changes: 2 additions & 2 deletions test/datasets/test_checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import torch
from torchtitan.datasets.hf_datasets import build_hf_data_loader
from torchtitan.datasets.tokenizer import build_tokenizer
from torchtitan.tokenizers.tokenizer import build_tokenizer


class TestCheckpoint:
Expand Down Expand Up @@ -42,7 +42,7 @@ def _build_dataloader(
self, dataset_name, dataset_path, batch_size, seq_len, world_size, rank
):
tokenizer_type = "tiktoken"
tokenizer = build_tokenizer("tiktoken", "./test/assets/test_tiktoken.model")
tokenizer = build_tokenizer("tiktoken", "./torchtitan/tokenizers/chemlactica-125m")
return build_hf_data_loader(
dataset_name=dataset_name,
dataset_path=dataset_path,
Expand Down
4 changes: 1 addition & 3 deletions torchtitan/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,7 @@
# LICENSE file in the root directory of this source tree.

from torchtitan.datasets.hf_datasets import build_hf_data_loader
from torchtitan.datasets.tokenizer import build_tokenizer

__all__ = [
"build_hf_data_loader",
"build_tokenizer",
"build_hf_data_loader"
]
65 changes: 0 additions & 65 deletions torchtitan/datasets/download_tokenizer.py

This file was deleted.

2 changes: 1 addition & 1 deletion torchtitan/datasets/hf_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
"pip3 install --pre torchdata --index-url https://download.pytorch.org/whl/nightly"
) from e

from torchtitan.datasets.tokenizer import Tokenizer
from torchtitan.tokenizers.tokenizer import Tokenizer
from torchtitan.logging import logger

from datasets import load_dataset
Expand Down
21 changes: 0 additions & 21 deletions torchtitan/datasets/tokenizer/__init__.py

This file was deleted.

91 changes: 91 additions & 0 deletions torchtitan/tokenizers/chemlactica-125m/special_tokens_map.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
{
"additional_special_tokens": [
"[SYNONYM]",
"[RELATED]",
"[SIMILAR]",
"[PROPERTY]",
"[SAS]",
"[WEIGHT]",
"[TPSA]",
"[CLOGP]",
"[QED]",
"[NUMHDONORS]",
"[NUMHACCEPTORS]",
"[NUMHETEROATOMS]",
"[NUMROTATABLEBONDS]",
"[NOCOUNT]",
"[NHOHCOUNT]",
"[RINGCOUNT]",
"[HEAVYATOMCOUNT]",
"[FRACTIONCSP3]",
"[NUMAROMATICRINGS]",
"[NUMSATURATEDRINGS]",
"[NUMAROMATICHETEROCYCLES]",
"[NUMAROMATICCARBOCYCLES]",
"[NUMSATURATEDHETEROCYCLES]",
"[NUMSATURATEDCARBOCYCLES]",
"[NUMALIPHATICRINGS]",
"[NUMALIPHATICHETEROCYCLES]",
"[NUMALIPHATICCARBOCYCLES]",
"[IUPAC]",
"[VAR_NAME]",
"[VAR_DESC]",
"[VAR_VAL]",
"[ASSAY_NAME]",
"[ASSAY_DESC]",
"[/SYNONYM]",
"[/RELATED]",
"[/SIMILAR]",
"[/PROPERTY]",
"[/SAS]",
"[/WEIGHT]",
"[/TPSA]",
"[/CLOGP]",
"[/QED]",
"[/NUMHDONORS]",
"[/NUMHACCEPTORS]",
"[/NUMHETEROATOMS]",
"[/NUMROTATABLEBONDS]",
"[/NOCOUNT]",
"[/NHOHCOUNT]",
"[/RINGCOUNT]",
"[/HEAVYATOMCOUNT]",
"[/FRACTIONCSP3]",
"[/NUMAROMATICRINGS]",
"[/NUMSATURATEDRINGS]",
"[/NUMAROMATICHETEROCYCLES]",
"[/NUMAROMATICCARBOCYCLES]",
"[/NUMSATURATEDHETEROCYCLES]",
"[/NUMSATURATEDCARBOCYCLES]",
"[/NUMALIPHATICRINGS]",
"[/NUMALIPHATICHETEROCYCLES]",
"[/NUMALIPHATICCARBOCYCLES]",
"[/IUPAC]",
"[/VAR_NAME]",
"[/VAR_DESC]",
"[/VAR_VAL]",
"[/ASSAY_NAME]",
"[/ASSAY_DESC]"
],
"bos_token": {
"content": "<s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"eos_token": {
"content": "</s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"pad_token": {
"content": "<pad>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
}
}
Loading

0 comments on commit 17d7d0a

Please sign in to comment.