diff --git a/spacy_stanza/about.py b/spacy_stanza/about.py index ce10944..62feb97 100644 --- a/spacy_stanza/about.py +++ b/spacy_stanza/about.py @@ -1,5 +1,5 @@ __title__ = "spacy-stanza" -__version__ = "0.2.0" +__version__ = "0.2.1" __summary__ = "Use the latest Stanza (StanfordNLP) research models directly in spaCy" __uri__ = "https://explosion.ai" __author__ = "Ines Montani" diff --git a/spacy_stanza/language.py b/spacy_stanza/language.py index 2f35efc..795787b 100644 --- a/spacy_stanza/language.py +++ b/spacy_stanza/language.py @@ -10,6 +10,7 @@ import numpy import re +import warnings class StanzaLanguage(Language): @@ -171,9 +172,17 @@ def __call__(self, text): ents = [] for ent in snlp_doc.entities: ent_span = doc.char_span(ent.start_char, ent.end_char, ent.type) - if ent_span: - ents.append(ent_span) - doc.ents = ents + ents.append(ent_span) + if not all(ents): + warnings.warn( + f"Can't set named entities because the character offsets don't " + f"map to valid tokens produced by the Stanza tokenizer:\n" + f"Words: {words}\n" + f"Entities: {[(e.text, e.type, e.start_char, e.end_char) for e in snlp_doc.entities]}", + stacklevel=4, + ) + else: + doc.ents = ents # Overwrite lemmas separately to prevent them from being overwritten by spaCy lemma_array = numpy.array([[lemma] for lemma in lemmas], dtype="uint64") doc.from_array([LEMMA], lemma_array) diff --git a/tests/test_language.py b/tests/test_language.py index d92b621..1da36d4 100644 --- a/tests/test_language.py +++ b/tests/test_language.py @@ -6,18 +6,14 @@ import pytest -@pytest.fixture -def lang(): - return "en" - - def tags_equal(act, exp): """Check if each actual tag in act is equal to one or more expected tags in exp.""" return all(a == e if isinstance(e, str) else a in e for a, e in zip(act, exp)) -def test_spacy_stanza(lang): - stanza.download(lang) +def test_spacy_stanza_english(): + lang = "en" + stanza.download() snlp = stanza.Pipeline(lang=lang) nlp = StanzaLanguage(snlp) assert nlp.lang == "stanza_" + lang @@ -61,6 +57,15 @@ def test_spacy_stanza(lang): assert doc.ents[1].label_ == "GPE" +def test_spacy_stanza_german(): + lang = "de" + stanza.download(lang) + snlp = stanza.Pipeline(lang=lang) + nlp = StanzaLanguage(snlp) + with pytest.warns(UserWarning): + doc = nlp("Auf dem Friedhof an der Straße Am Rosengarten") + + def test_get_defaults(): assert get_defaults("en") == EnglishDefaults assert get_defaults("xvkfokdfo") == BaseDefaults