Switch to ICU tokenizer #939
base: main
Changes from all commits
Commits: 4041e04, 070c0d3, 4000853, 13a80bf, bb7f523, d585a63
Requirements file:

@@ -3,3 +3,4 @@ opus-fast-mosestokenizer==0.0.8.5
 tqdm
 requests==2.31.0
 zstandard
+PyICU==2.8.1
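A quick way to confirm that the PyICU pin resolved correctly in an environment (a hypothetical check, not part of the PR):

```python
import icu

print(icu.VERSION)      # PyICU version, e.g. "2.8.1"
print(icu.ICU_VERSION)  # version of the underlying ICU C++ library
```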
pipeline/alignments/tokenizer.py:

@@ -4,15 +4,22 @@
 Example:
   python pipeline/alignments/tokenizer.py --input_path=data/datasets/news.2023.en.shuffled.deduped \
-    --output_path=data/datasets/news.2023.en.shuffled.deduped.tok-moses --lang=en --chunk_size=500000
+    --output_path=data/datasets/news.2023.en.shuffled.deduped.tok-icu --lang=en --chunk_size=500000 --tokenizer=icu

 Using the C++ opus-fast-mosestokenizer sometimes requires specifying LD_LIBRARY_PATH before starting the Python process;
 see https://github.com/Helsinki-NLP/opus-fast-mosestokenizer/issues/6
 export LD_LIBRARY_PATH=.../<your-python-env>/lib/python3.10/site-packages/mosestokenizer/lib

+Using the ICU tokenizer requires installing it with `apt-get install python3-icu`;
+see more installation instructions here: https://pypi.org/project/PyICU/
+
+Whitespace is ignored by the Moses-based tokenizers, but the ICU tokenizer preserves it by replacing it with the
+special token "▁", which allows lossless reconstruction of the original text on detokenization.

 """
 import argparse
 import multiprocessing
+from enum import Enum
 from typing import List

 from tqdm import tqdm
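To make the docstring's lossless-whitespace claim concrete, here is a minimal round trip using PyICU directly, mirroring the IcuTokenizer added below (the sample string is illustrative):

```python
from icu import BreakIterator, Locale

SPACE_TOKEN = "▁"  # the same placeholder SentencePiece uses

text = "Hello  world!"  # note the double space
bi = BreakIterator.createWordInstance(Locale("en"))
bi.setText(text)

tokens = []
start = bi.first()
for end in bi:  # the iterator yields successive word-boundary offsets
    piece = text[start:end]
    if piece and piece != "\n":
        tokens.append(piece.replace(" ", SPACE_TOKEN))
    start = end

print(tokens)  # ['Hello', '▁▁', 'world', '!']
assert "".join(tokens).replace(SPACE_TOKEN, " ") == text  # whitespace survives
```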
@@ -22,6 +29,99 @@
 logger = get_logger("tokenizer")
+class TokenizerType(Enum):
+    fast_moses = "fast_moses"
+    sacre_moses = "sacre_moses"
+    icu = "icu"
+
+
+class Tokenizer:
+    def __init__(self, lang: str):
+        self.lang = lang
+
+    def tokenize(self, text: str) -> List[str]:
+        pass
+
+    def detokenize(self, tokens: List[str]) -> str:
+        pass

Review comment on `class Tokenizer:`: Should this be using Abstract Base Classes? https://docs.python.org/3/library/abc.html
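On the reviewer's question, a minimal sketch of the base class using Abstract Base Classes (an illustration of the suggestion, not what the PR currently ships):

```python
from abc import ABC, abstractmethod
from typing import List


class Tokenizer(ABC):
    def __init__(self, lang: str):
        self.lang = lang

    @abstractmethod
    def tokenize(self, text: str) -> List[str]:
        raise NotImplementedError

    @abstractmethod
    def detokenize(self, tokens: List[str]) -> str:
        raise NotImplementedError
```

With this shape, a subclass that forgets to override either method fails at instantiation with a TypeError, whereas the current `pass` bodies silently return None.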
+class FastMosesTokenizer(Tokenizer):
+    """
+    Uses Moses tokenizer https://github.com/Helsinki-NLP/opus-fast-mosestokenizer
+    """
+
+    def __init__(self, lang):
+        super().__init__(lang)
+        from mosestokenizer import MosesTokenizer
+
+        try:
+            self.tokenizer = MosesTokenizer(lang)
+        except RuntimeError as err:
+            msg = str(err)
+            if "No known abbreviations for language" in msg:
+                # Fall back to English if the language is not found
+                self.tokenizer = MosesTokenizer("en")
+            else:
+                raise err
+
+    def tokenize(self, text: str) -> List[str]:
+        return self.tokenizer.tokenize(text)
+
+    def detokenize(self, tokens: List[str]) -> str:
+        return self.tokenizer.detokenize(tokens)
+class SacreMosesTokenizer(Tokenizer):
+    """
+    Uses Moses tokenizer https://github.com/hplt-project/sacremoses
+    """
+
+    def __init__(self, lang):
+        super().__init__(lang)
+        import sacremoses
+
+        self.tokenizer = sacremoses.MosesTokenizer(lang)
+        self.detokenizer = sacremoses.MosesDetokenizer(lang)
+
+    def tokenize(self, text: str) -> List[str]:
+        return self.tokenizer.tokenize(text)
+
+    def detokenize(self, tokens: List[str]) -> str:
+        return self.detokenizer.detokenize(tokens)
+class IcuTokenizer(Tokenizer):
+    """
+    Uses the ICU-based word segmenter https://pypi.org/project/PyICU/
+    Preserves whitespace as tokens by replacing it with the special character "▁",
+    which allows lossless reconstruction of the original text on detokenization.
+    """
+
+    # The same character is used by SentencePiece
+    SPACE_TOKEN = "▁"
+
+    def tokenize(self, text: str) -> List[str]:
+        from icu import BreakIterator, Locale
+
+        bi = BreakIterator.createWordInstance(Locale(self.lang))
+        bi.setText(text)
+
+        tokens = []
+        start = bi.first()
+        for end in bi:
+            token = text[start:end]
+            # Exclude empty tokens and newlines, but keep whitespace runs,
+            # replacing spaces with the special token
+            if token and token != "\n":
+                tokens.append(token.replace(" ", self.SPACE_TOKEN))
+            start = end
+        return tokens
+
+    def detokenize(self, tokens: List[str]) -> str:
+        return "".join(tokens).replace(self.SPACE_TOKEN, " ")
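Assuming both sacremoses and PyICU are installed, the behavioral difference between the Moses and ICU classes shows up in a quick round trip (sample string is illustrative):

```python
moses = SacreMosesTokenizer("en")
icu = IcuTokenizer("en")

text = "Hello  world!"  # double space
print(moses.detokenize(moses.tokenize(text)))  # 'Hello world!' (spacing normalized)
print(icu.detokenize(icu.tokenize(text)))      # 'Hello  world!' (exact reconstruction)
```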
 def _read_file_in_chunks(file_path, chunk_size):
     with open(file_path, "r", encoding="utf-8") as file:
         while True:
@@ -32,18 +132,16 @@ def _read_file_in_chunks(file_path, chunk_size):
 def _tokenize_lines(params) -> List[str]:
-    lines, lang = params
-    from mosestokenizer import MosesTokenizer
-
-    try:
-        tokenizer = MosesTokenizer(lang)
-    except RuntimeError as err:
-        msg = str(err)
-        if "No known abbreviations for language" in msg:
-            # Fall back to English if the language is not found
-            tokenizer = MosesTokenizer("en")
-        else:
-            raise err
+    lines, lang, tok_type = params
+
+    if tok_type == TokenizerType.fast_moses:
+        tokenizer = FastMosesTokenizer(lang)
+    elif tok_type == TokenizerType.sacre_moses:
+        tokenizer = SacreMosesTokenizer(lang)
+    elif tok_type == TokenizerType.icu:
+        tokenizer = IcuTokenizer(lang)
+    else:
+        raise ValueError(f"Unknown tokenizer type: {tok_type}")
     tokenized = []
     for line in lines:

@@ -52,8 +150,12 @@ def _tokenize_lines(params) -> List[str]:
     return tokenized
-def tokenize_moses(
-    input_path: str, output_path: str, lang: str, sentences_per_chunk: int = 100000
-) -> None:
+def tokenize(
+    input_path: str,
+    output_path: str,
+    lang: str,
+    tokenizer: TokenizerType,
+    sentences_per_chunk: int = 100000,
+) -> None:
     logger.info(f"Tokenizing {input_path} with Moses tokenizer")
@@ -65,7 +167,7 @@ def tokenize_moses(
         # ~100K sentences per second on a single core
         for tokenized_chunk in pool.imap(
             _tokenize_lines,
-            ((ch, lang) for ch in chunks),
+            ((ch, lang, tokenizer) for ch in chunks),
         ):
             output_file.write("\n".join(tokenized_chunk) + "\n")
             pbar.update(len(tokenized_chunk))
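For orientation, the chunked fan-out that `pool.imap` provides here looks roughly like this as a standalone pattern (names and the toy workload are illustrative, not the pipeline's API):

```python
import multiprocessing


def _work(chunk):
    # Stand-in for _tokenize_lines: process one chunk of lines
    return [line.upper() for line in chunk]


def process_all(lines, chunk_size=2):
    chunks = [lines[i:i + chunk_size] for i in range(0, len(lines), chunk_size)]
    results = []
    with multiprocessing.Pool() as pool:
        # imap yields chunk results in input order as workers finish,
        # so output order is preserved and memory stays bounded
        for out in pool.imap(_work, chunks):
            results.extend(out)
    return results


if __name__ == "__main__":
    print(process_all(["a b", "c d", "e f"]))  # ['A B', 'C D', 'E F']
```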
@@ -104,5 +206,19 @@ def tokenize_moses(
         default=None,
         help="Number of lines to process per chunk",
     )
+    parser.add_argument(
+        "--tokenizer",
+        metavar="TOKENIZER",
+        type=TokenizerType,
+        choices=TokenizerType,
+        default=TokenizerType.icu,
+        help="Tokenization method",
+    )
     args = parser.parse_args()
-    tokenize_moses(args.input_path, args.output_path, args.lang, args.chunk_size)
+    tokenize(
+        input_path=args.input_path,
+        output_path=args.output_path,
+        lang=args.lang,
+        sentences_per_chunk=args.chunk_size,
+        tokenizer=args.tokenizer,
+    )
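One detail worth noting in the argparse wiring: `type=TokenizerType` works because argparse passes the raw string to the enum constructor, which looks members up by value; `choices=TokenizerType` then restricts the accepted members. A quick sanity check (hypothetical REPL session):

```python
assert TokenizerType("icu") is TokenizerType.icu
assert [t.value for t in TokenizerType] == ["fast_moses", "sacre_moses", "icu"]
```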
OpusTrainer requirements pin:

@@ -1,5 +1,5 @@
-# use the latest main, switch to PyPi when released
-git+https://github.com/hplt-project/OpusTrainer.git@c966d7b353d6b3c6a09d9573f1ab6ba3221c1d21
+# ICU tokenizer commit
+git+https://github.com/mozilla/OpusTrainer.git@ee534f34e2267c751f4686d7bae27673564c547b
 simalign==0.4
 mtdata==0.4.1
 psutil==6.0.0

Review comment: I have a blocking piece of feedback on the changes in the fork; I'm leaving comments in the other PR. The issue is that the BreakIterator is not being cached, which is a performance issue.

Review comment: Let's file an issue for getting back on upstream, and reference it here. I'm assuming this work will be attempted as a PR against the upstream.
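To make the caching concern concrete, one way such a fix could be shaped (a sketch only; the real change belongs in the OpusTrainer fork, and these helper names are hypothetical):

```python
from functools import lru_cache

from icu import BreakIterator, Locale


@lru_cache(maxsize=None)
def _word_break_iterator(lang: str) -> BreakIterator:
    # Build one BreakIterator per language and reuse it; constructing a new
    # iterator on every tokenize() call is the performance issue flagged above.
    return BreakIterator.createWordInstance(Locale(lang))


def tokenize_words(text: str, lang: str) -> list:
    bi = _word_break_iterator(lang)
    bi.setText(text)  # re-target the cached iterator (note: not thread-safe)
    tokens, start = [], bi.first()
    for end in bi:
        tokens.append(text[start:end])
        start = end
    return tokens
```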
Review comment: Typo: "reprenseted" (should be "represented").