# tokenizers_trainer.py
from datasets import load_dataset, concatenate_datasets
from tokenizers import (
    Tokenizer,
    models,
    normalizers,
    pre_tokenizers,
    decoders,
    trainers,
    processors,
    Regex,
)


def batch_iterator(dataset, dataset_size, batch_size):
    # Yield successive batches of the "text" column so the full dataset is
    # never materialized in memory at once.
    for i in range(0, dataset_size, batch_size):
        yield dataset[i : i + batch_size]["text"]


# Training recipe adapted from:
# https://github.com/huggingface/tokenizers/issues/640#issuecomment-792305076
def bpe_tokenizer_trainer(text, vocab_size, min_frequency=0, add_prefix_space=True, batch_size=50):
    # `text` is either a path to a plain-text file or a datasets Dataset with a "text" column.
    tokenizer = Tokenizer(models.BPE())
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
        [
            pre_tokenizers.Whitespace(),
            pre_tokenizers.Punctuation(),
            pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space),
        ]
    )
    tokenizer.normalizer = normalizers.Sequence(
        [normalizers.Nmt(), normalizers.NFKC(), normalizers.Replace(Regex(" {2,}"), " ")]
    )
    tokenizer.decoder = decoders.ByteLevel()
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
        min_frequency=min_frequency,
        initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
    )
    if isinstance(text, str):
        # `text` is a path to a plain-text file; train() expects a list of file paths.
        tokenizer.train([text], trainer=trainer)
    else:
        # `text` is a datasets Dataset; stream it to the trainer in batches.
        tokenizer.train_from_iterator(batch_iterator(text, len(text), batch_size), trainer=trainer)
    tokenizer.post_processor = processors.RobertaProcessing(
        sep=("</s>", tokenizer.token_to_id("</s>")), cls=("<s>", tokenizer.token_to_id("<s>"))
    )
    tokenizer.save("tokenizer.json", pretty=True)
    # tokenizer.model.save("output_dir")
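

# Example usage with a plain-text corpus file instead of a Dataset (illustrative
# addition; "corpus.txt" is a placeholder path, not part of the original script):
# bpe_tokenizer_trainer(text="corpus.txt", vocab_size=32000)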
def pretokenizer_print(text):
    # Inspect how the ByteLevel pre-tokenizer splits a string before BPE is applied.
    pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
    return pre_tokenizer.pre_tokenize_str(text)
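

# Illustrative check (added example, not part of the original script): show how
# the ByteLevel pre-tokenizer splits a sample sentence; the sentence is an
# arbitrary placeholder chosen for demonstration.
print(pretokenizer_print("Det här är ett exempel på en mening."))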
# Build the training corpus: local Wikipedia and OSCAR plain-text dumps, loaded
# as "text" datasets and concatenated into a single Dataset.
dataset = load_dataset(
    "text",
    data_files={
        "wiki": "/ceph/hpc/home/eufatonr/data/text/public/wiki.sv.docs",
        "oscar_local": "/ceph/hpc/home/eufatonr/data/text/public/oscar.sv.docs",
    },
    cache_dir="cache_dataset",
)
dataset = concatenate_datasets([dataset["wiki"], dataset["oscar_local"]])
bpe_tokenizer_trainer(text=dataset, vocab_size=50260)
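
# Sanity check (illustrative addition, not part of the original script): reload
# the saved tokenizer and encode a sample sentence; the sentence is an arbitrary
# placeholder chosen for demonstration.
trained_tokenizer = Tokenizer.from_file("tokenizer.json")
encoding = trained_tokenizer.encode("Det här är ett exempel på en mening.")
print(encoding.tokens)  # subword tokens, wrapped in <s> ... </s> by the Roberta post-processor
print(encoding.ids)  # corresponding vocabulary ids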