Skip to content

Commit

Permalink
Test languages exemplars canonical duplicates
Browse files Browse the repository at this point in the history
  • Loading branch information
moyogo committed Nov 1, 2022
1 parent 74645d9 commit b24630f
Showing 1 changed file with 20 additions and 1 deletion.
21 changes: 20 additions & 1 deletion tests/test_data_languages.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
# limitations under the License.
#
import pytest
from collections import Counter
import unicodedata
from collections import defaultdict, Counter
from gflanguages import LoadLanguages


Expand All @@ -30,3 +31,21 @@ def test_languages_exemplars_duplicates(exemplar_name):
counts = sorted(counter.most_common(), key=lambda pair:
exemplar.index(pair[0]))
assert (counts == [(v, 1) for v in exemplar])


@pytest.mark.parametrize(
    "exemplar_name",
    ["base", "auxiliary", "marks", "numerals", "punctuation", "index"]
)
def test_languages_exemplars_canonical_duplicates(exemplar_name):
    """Check that no exemplar set contains canonically equivalent entries.

    For every language returned by ``LoadLanguages()``, the whitespace
    separated entries of the ``exemplar_name`` attribute of
    ``lang.exemplar_chars`` are grouped by their NFC normalization. The test
    fails if two or more distinct spellings share the same NFC form (for
    example a precomposed letter and its decomposed base + combining mark
    sequence).

    Args:
        exemplar_name: Name of the exemplar attribute to inspect.
    """
    for code, lang in LoadLanguages().items():
        exemplar = getattr(lang.exemplar_chars, exemplar_name).split()
        # Maps an NFC-normalized entry to every distinct spelling seen for it.
        normalized = defaultdict(set)

        for g in exemplar:
            # Entries wrapped in braces are unwrapped before comparison
            # (presumably multi-character sequences in CLDR-style notation;
            # confirm against the data format).
            if g[0] == "{" and g[-1] == "}":
                g = g.lstrip("{").rstrip("}")
            normalized[unicodedata.normalize("NFC", g)].add(g)

        # Keep only the NFC forms reached by more than one spelling so that
        # a failure reports exactly which entries collide.
        duplicates = {
            nfc: sorted(spellings)
            for nfc, spellings in normalized.items()
            if len(spellings) > 1
        }
        assert not duplicates, (
            "{}: canonically equivalent entries in {} exemplars: {!r}".format(
                code, exemplar_name, duplicates
            )
        )

0 comments on commit b24630f

Please sign in to comment.