-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
/
tokenizer_exceptions.py
97 lines (90 loc) · 1.37 KB
/
tokenizer_exceptions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# coding: utf8
from __future__ import unicode_literals
from ...symbols import ORTH
from .punctuation import _make_ro_variants
# Table of tokenizer exceptions, filled in below: each key is a surface
# string and each value is a one-element list holding a single {ORTH: key}
# entry, i.e. the string is registered as exactly one token.
_exc = {}

# Base forms to register. Entry order is kept exactly as originally listed.
# Source: https://en.wiktionary.org/wiki/Category:Romanian_abbreviations
for orth in [
    # digits followed by a hyphenated suffix
    "1-a", "2-a", "3-a", "4-a", "5-a", "6-a",
    "7-a", "8-a", "9-a", "10-a", "11-a", "12-a",
    "1-ul", "2-lea", "3-lea", "4-lea", "5-lea", "6-lea",
    "7-lea", "8-lea", "9-lea", "10-lea", "11-lea", "12-lea",
    # abbreviations and hyphenated forms
    "d-voastră", "dvs.", "ing.", "dr.", "Rom.", "str.",
    "nr.", "etc.", "d.p.d.v.", "dpdv", "șamd.", "ș.a.m.d.",
    # entries collected from UD_Romanian-RRT (capitalized forms first)
    "A.c.", "A.f.", "A.r.", "Al.", "Art.", "Aug.",
    "Bd.", "Dem.", "Dr.", "Fig.", "Fr.", "Gh.",
    "Gr.", "Lt.", "Nr.", "Obs.", "Prof.", "Sf.",
    "a.m.", "a.r.", "alin.", "art.", "d-l", "d-lui",
    "d-nei", "ex.", "fig.", "ian.", "lit.", "lt.",
    "p.a.", "p.m.", "pct.", "prep.", "sf.", "tel.",
    "univ.", "îngr.", "într-adevăr", "Șt.", "ș.a.",
]:
    # Every base form is passed on its own (as a one-element list) to
    # _make_ro_variants, and each string that comes back is stored as an
    # exception. The same treatment is applied to all entries, so forms
    # that only occur capitalized are not handled any differently.
    for variant in _make_ro_variants([orth]):
        _exc[variant] = [{ORTH: variant}]

# Expose the finished table under the module-level name TOKENIZER_EXCEPTIONS.
TOKENIZER_EXCEPTIONS = _exc