Skip to content

Commit

Permalink
Test that the Latin ITTB lemmatizer is marked as caseless. Check that…
Browse files Browse the repository at this point in the history
… the results for capitalized text are as expected
  • Loading branch information
AngledLuffa committed Jan 14, 2024
1 parent 36781b6 commit 6ca20bf
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 0 deletions.
14 changes: 14 additions & 0 deletions stanza/tests/pipeline/test_lemmatizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,3 +120,17 @@ def test_caseless_lemmatizer():
doc = nlp("Jennifer has nice Antennae")
assert doc.sentences[0].words[-1].lemma == 'antenna'

def test_latin_caseless_lemmatizer():
    """
    Check the Latin ITTB lemmatizer: it should be flagged as caseless, and
    fully capitalized input should still produce the expected lemmas
    """
    pipeline = stanza.Pipeline(
        'la',
        package='ittb',
        processors='tokenize,pos,lemma',
        model_dir=TEST_MODELS_DIR,
        download_method=None,
    )

    # the loaded lemma processor must report itself as caseless in its config
    lemma_processor = pipeline.processors['lemma']
    assert lemma_processor.config['caseless']

    # every word of the input is capitalized; the expected lemmas are lowercase
    expected_lemmas = "qui sum demonstro".split()
    doc = pipeline("Quod Erat Demonstrandum")

    assert len(doc.sentences) == 1
    words = doc.sentences[0].words
    assert len(words) == 3
    assert [word.lemma for word in words] == expected_lemmas
2 changes: 2 additions & 0 deletions stanza/tests/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@
# English models, no explicit package requested
stanza.download(lang='en', model_dir=models_dir, logging_level='info')
# English NER model "ncbi_disease"; package=None presumably restricts this call
# to just the listed processor -- TODO confirm against the stanza download docs
stanza.download(lang="en", model_dir=models_dir, package=None, processors={"ner":"ncbi_disease"})
# French models, no explicit package requested
stanza.download(lang='fr', model_dir=models_dir, logging_level='info')
# Latin ITTB has no case information for the lemmatizer, which makes it the
# package used by the Latin caseless lemmatizer test in tests/pipeline/test_lemmatizer.py
stanza.download(lang='la', model_dir=models_dir, package='ittb', logging_level='info')
# Chinese models, no explicit package requested
stanza.download(lang='zh', model_dir=models_dir, logging_level='info')
# useful not just for verifying RtL, but because the default Arabic has a unique style of xpos tags
stanza.download(lang='ar', model_dir=models_dir, logging_level='info')
Expand Down

0 comments on commit 6ca20bf

Please sign in to comment.