diff --git a/stanza/tests/pipeline/test_lemmatizer.py b/stanza/tests/pipeline/test_lemmatizer.py
index 2c75be0ce2..7c53d81340 100644
--- a/stanza/tests/pipeline/test_lemmatizer.py
+++ b/stanza/tests/pipeline/test_lemmatizer.py
@@ -120,3 +120,17 @@ def test_caseless_lemmatizer():
     doc = nlp("Jennifer has nice Antennae")
     assert doc.sentences[0].words[-1].lemma == 'antenna'
 
+def test_latin_caseless_lemmatizer():
+    """
+    Test the Latin caseless lemmatizer
+    """
+    nlp = stanza.Pipeline('la', package='ittb', processors='tokenize,pos,lemma', model_dir=TEST_MODELS_DIR, download_method=None)
+    lemmatizer = nlp.processors['lemma']
+    assert lemmatizer.config['caseless']
+
+    doc = nlp("Quod Erat Demonstrandum")
+    expected_lemmas = "qui sum demonstro".split()
+    assert len(doc.sentences) == 1
+    assert len(doc.sentences[0].words) == 3
+    for word, expected in zip(doc.sentences[0].words, expected_lemmas):
+        assert word.lemma == expected
diff --git a/stanza/tests/setup.py b/stanza/tests/setup.py
index 607d63a88a..444f979be2 100644
--- a/stanza/tests/setup.py
+++ b/stanza/tests/setup.py
@@ -40,6 +40,8 @@ stanza.download(lang='en', model_dir=models_dir, logging_level='info')
 
 stanza.download(lang="en", model_dir=models_dir, package=None, processors={"ner":"ncbi_disease"})
 stanza.download(lang='fr', model_dir=models_dir, logging_level='info')
+# Latin ITTB has no case information for the lemmatizer
+stanza.download(lang='la', model_dir=models_dir, package='ittb', logging_level='info')
 stanza.download(lang='zh', model_dir=models_dir, logging_level='info')
 # useful not just for verifying RtL, but because the default Arabic has a unique style of xpos tags
 stanza.download(lang='ar', model_dir=models_dir, logging_level='info')