test_tokenizer.py
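"""Tests for alt_eval.tokenizer.LyricsTokenizer across several languages."""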
import pytest
from alt_eval.tokenizer import LyricsTokenizer
# fmt: off
@pytest.mark.parametrize(
"language, text, expected_tokens",
[
(
"en",
"I ain't got nothin' but the blues",
["I", "ain", "'t", "got", "nothin'", "but", "the", "blues"],
),
(
"en",
"It'll be fun (ha!)",
["It", "'ll", "be", "fun", "(", "ha", "!", ")"]
),
(
"en",
"Just like 2Pac",
["Just", "like", "2Pac"],
),
(
"de",
"Sei's Melancholie",
["Sei", "'s", "Melancholie"]
),
(
"de",
"Könnt' ich dir Schmerz erspar'n",
["Könnt'", "ich", "dir", "Schmerz", "erspar'n"],
),
(
"fr",
"T'avais fait l'amour deux fois sans penser qu'avec cette fille-là",
["T'", "avais", "fait", "l'", "amour", "deux", "fois", "sans", "penser", "qu'", "avec", "cette", "fille", "-", "là"],
),
(
"ja",
"私は日本語を話せません(ラララ)",
["私", "は", "日", "本", "語", "を", "話", "せ", "ま", "せ", "ん", "(", "ラ", "ラ", "ラ", ")"],
),
(
"zh",
"我不会说中文。(哈哈)",
["我", "不", "会", "说", "中", "文", "。", "(", "哈", "哈", ")"],
),
],
)
# fmt: on
def test_lyrics_tokenizer(language, text, expected_tokens):
tokenizer = LyricsTokenizer()
tokens = [t.text for t in tokenizer(text, language=language)]
assert tokens == expected_tokens
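

# ---------------------------------------------------------------------------
# Manual usage sketch (not a test): illustrates the LyricsTokenizer call
# pattern exercised above, assuming the alt_eval package is installed. Run
# this module directly to print the tokens for one of the English cases.
if __name__ == "__main__":
    tokenizer = LyricsTokenizer()
    tokens = [t.text for t in tokenizer("I ain't got nothin' but the blues", language="en")]
    # Matches the expected tokens for the corresponding "en" case above:
    # ['I', 'ain', "'t", 'got', "nothin'", 'but', 'the', 'blues']
    print(tokens)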