Commit
Merge pull request #8787 from adrianeboyd/chore/backport-v3.0.7
Backport bug fixes to v3.0.x
adrianeboyd authored Jul 21, 2021
2 parents 02e1892 + 0080454 commit 034ac0a
Showing 33 changed files with 209 additions and 105 deletions.
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -8,3 +8,4 @@ recursive-exclude spacy/lang *.json
recursive-include spacy/lang *.json.gz
recursive-include spacy/cli *.json *.yml
recursive-include licenses *
recursive-exclude spacy *.cpp
2 changes: 1 addition & 1 deletion spacy/about.py
@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
__version__ = "3.0.6"
__version__ = "3.0.7"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"
3 changes: 2 additions & 1 deletion spacy/cli/convert.py
@@ -115,7 +115,8 @@ def convert(
ner_map = srsly.read_json(ner_map) if ner_map is not None else None
doc_files = []
for input_loc in walk_directory(Path(input_path), converter):
input_data = input_loc.open("r", encoding="utf-8").read()
with input_loc.open("r", encoding="utf-8") as infile:
input_data = infile.read()
# Use converter function to convert data
func = CONVERTERS[converter]
docs = func(
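Note (illustrative, not part of the diff): the fix above reads the input through a with-block so the file handle is always closed, even if reading fails. A minimal sketch of the pattern, using a hypothetical path:

from pathlib import Path

input_loc = Path("corpus.json")  # hypothetical input file
with input_loc.open("r", encoding="utf-8") as infile:
    input_data = infile.read()  # handle is closed when the block exits
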
2 changes: 1 addition & 1 deletion spacy/cli/package.py
@@ -18,7 +18,7 @@ def package_cli(
output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python file with additional code (registered functions) to be included in the package"),
meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"),
create_meta: bool = Opt(False, "--create-meta", "-C", help="Create meta.json, even if one exists"),
name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"),
version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"),
build: str = Opt("sdist", "--build", "-b", help="Comma-separated formats to build: sdist and/or wheel, or none."),
2 changes: 1 addition & 1 deletion spacy/cli/templates/quickstart_training.jinja
@@ -418,7 +418,7 @@ compound = 1.001

[initialize]
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
vectors = null
vectors = ${paths.vectors}
{% else -%}
vectors = "{{ word_vectors }}"
{% endif -%}
5 changes: 5 additions & 0 deletions spacy/errors.py
@@ -518,6 +518,11 @@ class Errors:
E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")

# New errors added in v3.x
E867 = ("The 'textcat' component requires at least two labels because it "
"uses mutually exclusive classes where exactly one label is True "
"for each doc. For binary classification tasks, you can use two "
"labels with 'textcat' (LABEL / NOT_LABEL) or alternatively, you "
"can use the 'textcat_multilabel' component with one label.")
E870 = ("Could not serialize the DocBin because it is too large. Consider "
"splitting up your documents into several doc bins and serializing "
"each separately. spacy.Corpus.v1 will search recursively for all "
5 changes: 0 additions & 5 deletions spacy/lang/az/__init__.py
@@ -1,16 +1,11 @@
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .lex_attrs import LEX_ATTRS
from ...language import Language


class AzerbaijaniDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
token_match = TOKEN_MATCH
syntax_iterators = SYNTAX_ITERATORS


class Azerbaijani(Language):
2 changes: 1 addition & 1 deletion spacy/lang/el/lemmatizer.py
@@ -57,6 +57,6 @@ def rule_lemmatize(self, token: Token) -> List[str]:
forms.extend(oov_forms)
if not forms:
forms.append(string)
forms = list(set(forms))
forms = list(dict.fromkeys(forms))
self.cache[cache_key] = forms
return forms
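
Note (illustrative, not part of the diff): dict.fromkeys deduplicates while keeping the order in which candidate forms were generated, whereas set gives no order guarantee. A minimal sketch:

forms = ["b", "a", "b", "c"]
print(list(set(forms)))            # deduplicated, but order not guaranteed
print(list(dict.fromkeys(forms)))  # ['b', 'a', 'c'] -- keeps first-seen order
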
5 changes: 2 additions & 3 deletions spacy/lang/ru/lemmatizer.py
@@ -12,7 +12,6 @@


class RussianLemmatizer(Lemmatizer):
_morph = None

def __init__(
self,
@@ -31,8 +30,8 @@ def __init__(
"The Russian lemmatizer mode 'pymorphy2' requires the "
"pymorphy2 library. Install it with: pip install pymorphy2"
) from None
if RussianLemmatizer._morph is None:
RussianLemmatizer._morph = MorphAnalyzer()
if getattr(self, "_morph", None) is None:
self._morph = MorphAnalyzer()
super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)

def pymorphy2_lemmatize(self, token: Token) -> List[str]:
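Note (illustrative, not part of the diff): the change above replaces the class-level cache with lazy per-instance initialization, so each lemmatizer owns its analyzer rather than sharing one through the class and its subclasses. A small sketch of the getattr pattern, with a stand-in for MorphAnalyzer:

class LazyLemmatizer:
    def __init__(self):
        # build the expensive analyzer only if this instance lacks one
        if getattr(self, "_morph", None) is None:
            self._morph = object()  # stand-in for MorphAnalyzer()

a, b = LazyLemmatizer(), LazyLemmatizer()
assert a._morph is not b._morph  # each instance gets its own analyzer
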
6 changes: 2 additions & 4 deletions spacy/lang/uk/lemmatizer.py
@@ -7,8 +7,6 @@


class UkrainianLemmatizer(RussianLemmatizer):
_morph = None

def __init__(
self,
vocab: Vocab,
@@ -27,6 +25,6 @@ def __init__(
"pymorphy2 library and dictionaries. Install them with: "
"pip install pymorphy2 pymorphy2-dicts-uk"
) from None
if UkrainianLemmatizer._morph is None:
UkrainianLemmatizer._morph = MorphAnalyzer(lang="uk")
if getattr(self, "_morph", None) is None:
self._morph = MorphAnalyzer(lang="uk")
super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
2 changes: 2 additions & 0 deletions spacy/matcher/phrasematcher.pyx
@@ -50,6 +50,8 @@ cdef class PhraseMatcher:
if isinstance(attr, (int, long)):
self.attr = attr
else:
if attr is None:
attr = "ORTH"
attr = attr.upper()
if attr == "TEXT":
attr = "ORTH"
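Note (illustrative, not part of the diff): with the guard above, attr=None falls back to the default "ORTH" attribute instead of failing on attr.upper(). A hedged usage sketch:

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab, attr=None)  # now behaves like the default "ORTH"
matcher.add("OBAMA", [nlp("Barack Obama")])
print(matcher(nlp("Barack Obama was the 44th president.")))
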
3 changes: 2 additions & 1 deletion spacy/ml/models/multi_task.py
@@ -3,7 +3,7 @@
from thinc.api import MultiSoftmax, list2array
from thinc.api import to_categorical, CosineDistance, L2Distance

from ...util import registry
from ...util import registry, OOV_RANK
from ...errors import Errors
from ...attrs import ID

@@ -70,6 +70,7 @@ def get_vectors_loss(ops, docs, prediction, distance):
# and look them up all at once. This prevents data copying.
ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
target = docs[0].vocab.vectors.data[ids]
target[ids == OOV_RANK] = 0
d_target, loss = distance(prediction, target)
return loss, d_target

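Note (illustrative, not part of the diff): the added line zeroes the target vectors of out-of-vocabulary tokens so they do not contribute a meaningless gradient to the vectors loss. A minimal numpy sketch, using a stand-in value for spacy.util.OOV_RANK:

import numpy

OOV_RANK = numpy.iinfo(numpy.uint64).max  # stand-in for spacy.util.OOV_RANK
ids = numpy.array([3, OOV_RANK, 7], dtype="uint64")
target = numpy.ones((3, 4), dtype="float32")  # pretend per-token target vectors

target[ids == OOV_RANK] = 0  # OOV rows get an all-zero target
print(target)
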
3 changes: 2 additions & 1 deletion spacy/pipeline/entity_linker.py
@@ -481,7 +481,8 @@ def from_disk(

def load_model(p):
try:
self.model.from_bytes(p.open("rb").read())
with p.open("rb") as infile:
self.model.from_bytes(infile.read())
except AttributeError:
raise ValueError(Errors.E149) from None

46 changes: 22 additions & 24 deletions spacy/pipeline/entityruler.py
@@ -3,6 +3,7 @@
from collections import defaultdict
from pathlib import Path
import srsly
import warnings

from .pipe import Pipe
from ..training import Example
@@ -102,17 +103,12 @@ def __init__(
self.overwrite = overwrite_ents
self.token_patterns = defaultdict(list)
self.phrase_patterns = defaultdict(list)
self._validate = validate
self.matcher = Matcher(nlp.vocab, validate=validate)
if phrase_matcher_attr is not None:
if phrase_matcher_attr.upper() == "TEXT":
phrase_matcher_attr = "ORTH"
self.phrase_matcher_attr = phrase_matcher_attr
self.phrase_matcher = PhraseMatcher(
nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
)
else:
self.phrase_matcher_attr = None
self.phrase_matcher = PhraseMatcher(nlp.vocab, validate=validate)
self.phrase_matcher_attr = phrase_matcher_attr
self.phrase_matcher = PhraseMatcher(
nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
)
self.ent_id_sep = ent_id_sep
self._ent_ids = defaultdict(dict)
if patterns is not None:
@@ -146,7 +142,9 @@ def __call__(self, doc: Doc) -> Doc:

def match(self, doc: Doc):
self._require_patterns()
matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message="\\[W036")
matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
matches = set(
[(m_id, start, end) for m_id, start, end in matches if start != end]
)
@@ -281,7 +279,7 @@ def add_patterns(self, patterns: List[PatternType]) -> None:
current_index = i
break
subsequent_pipes = [
pipe for pipe in self.nlp.pipe_names[current_index + 1 :]
pipe for pipe in self.nlp.pipe_names[current_index :]
]
except ValueError:
subsequent_pipes = []
@@ -317,20 +315,22 @@ def add_patterns(self, patterns: List[PatternType]) -> None:
pattern = entry["pattern"]
if isinstance(pattern, Doc):
self.phrase_patterns[label].append(pattern)
self.phrase_matcher.add(label, [pattern])
elif isinstance(pattern, list):
self.token_patterns[label].append(pattern)
self.matcher.add(label, [pattern])
else:
raise ValueError(Errors.E097.format(pattern=pattern))
for label, patterns in self.token_patterns.items():
self.matcher.add(label, patterns)
for label, patterns in self.phrase_patterns.items():
self.phrase_matcher.add(label, patterns)

def clear(self) -> None:
"""Reset all patterns."""
self.token_patterns = defaultdict(list)
self.phrase_patterns = defaultdict(list)
self._ent_ids = defaultdict(dict)
self.matcher = Matcher(self.nlp.vocab, validate=self._validate)
self.phrase_matcher = PhraseMatcher(
self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate
)

def _require_patterns(self) -> None:
"""Raise a warning if this component has no patterns defined."""
@@ -381,10 +381,9 @@ def from_bytes(
self.add_patterns(cfg.get("patterns", cfg))
self.overwrite = cfg.get("overwrite", False)
self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None)
if self.phrase_matcher_attr is not None:
self.phrase_matcher = PhraseMatcher(
self.nlp.vocab, attr=self.phrase_matcher_attr
)
self.phrase_matcher = PhraseMatcher(
self.nlp.vocab, attr=self.phrase_matcher_attr
)
self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
else:
self.add_patterns(cfg)
@@ -435,10 +434,9 @@ def from_disk(
self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)

if self.phrase_matcher_attr is not None:
self.phrase_matcher = PhraseMatcher(
self.nlp.vocab, attr=self.phrase_matcher_attr
)
self.phrase_matcher = PhraseMatcher(
self.nlp.vocab, attr=self.phrase_matcher_attr
)
from_disk(path, deserializers_patterns, {})
return self

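Note (illustrative, not part of the diff): with these changes the EntityRuler always builds its PhraseMatcher from phrase_matcher_attr, and clear() rebuilds both matchers with the stored settings instead of leaving stale ones behind. A hedged usage sketch:

import spacy

nlp = spacy.blank("en")
# phrase_matcher_attr="LOWER" makes phrase patterns case-insensitive
ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"})
ruler.add_patterns([{"label": "ORG", "pattern": "openai"}])
print([(e.text, e.label_) for e in nlp("She joined OpenAI.").ents])

ruler.clear()  # drops the patterns but keeps the configured attr
ruler.add_patterns([{"label": "GPE", "pattern": "berlin"}])
print([(e.text, e.label_) for e in nlp("He moved to Berlin.").ents])
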
2 changes: 2 additions & 0 deletions spacy/pipeline/textcat.py
@@ -332,6 +332,8 @@ def initialize(
else:
for label in labels:
self.add_label(label)
if len(self.labels) < 2:
raise ValueError(Errors.E867)
if positive_label is not None:
if positive_label not in self.labels:
err = Errors.E920.format(pos_label=positive_label, labels=self.labels)
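Note (illustrative, not part of the diff): the new check raises E867 when the mutually exclusive 'textcat' component is initialized with fewer than two labels. A hedged sketch of the two supported setups:

import spacy

# Mutually exclusive classes: at least two labels are required.
nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")

# Binary yes/no with a single label: use the multilabel component instead.
nlp_multi = spacy.blank("en")
textcat_multi = nlp_multi.add_pipe("textcat_multilabel")
textcat_multi.add_label("SPAM")
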
3 changes: 2 additions & 1 deletion spacy/pipeline/trainable_pipe.pyx
@@ -324,7 +324,8 @@ cdef class TrainablePipe(Pipe):
def load_model(p):
try:
self.model.from_bytes(p.open("rb").read())
with open(p, "rb") as mfile:
self.model.from_bytes(mfile.read())
except AttributeError:
raise ValueError(Errors.E149) from None
40 changes: 30 additions & 10 deletions spacy/tests/doc/test_doc_api.py
@@ -351,17 +351,25 @@ def test_doc_from_array_morph(en_vocab):

@pytest.mark.usefixtures("clean_underscore")
def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
en_texts = ["Merging the docs is fun.", "", "They don't think alike."]
en_texts = [
"Merging the docs is fun.",
"",
"They don't think alike. ",
"Another doc.",
]
en_texts_without_empty = [t for t in en_texts if len(t)]
de_text = "Wie war die Frage?"
en_docs = [en_tokenizer(text) for text in en_texts]
en_docs[0].spans["group"] = [en_docs[0][1:4]]
en_docs[2].spans["group"] = [en_docs[2][1:4]]
span_group_texts = sorted([en_docs[0][1:4].text, en_docs[2][1:4].text])
en_docs[3].spans["group"] = [en_docs[3][0:1]]
span_group_texts = sorted(
[en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text]
)
de_doc = de_tokenizer(de_text)
Token.set_extension("is_ambiguous", default=False)
en_docs[0][2]._.is_ambiguous = True # docs
en_docs[2][3]._.is_ambiguous = True # think
en_docs[0][2]._.is_ambiguous = True # docs
en_docs[2][3]._.is_ambiguous = True # think
assert Doc.from_docs([]) is None
assert de_doc is not Doc.from_docs([de_doc])
assert str(de_doc) == str(Doc.from_docs([de_doc]))
@@ -371,8 +379,8 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):

m_doc = Doc.from_docs(en_docs)
assert len(en_texts_without_empty) == len(list(m_doc.sents))
assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
assert str(m_doc) == " ".join(en_texts_without_empty)
assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
p_token = m_doc[len(en_docs[0]) - 1]
assert p_token.text == "." and bool(p_token.whitespace_)
en_docs_tokens = [t for doc in en_docs for t in doc]
Expand All @@ -384,11 +392,12 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
assert not any([t._.is_ambiguous for t in m_doc[3:8]])
assert "group" in m_doc.spans
assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
assert bool(m_doc[11].whitespace_)

m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
assert len(en_texts_without_empty) == len(list(m_doc.sents))
assert len(str(m_doc)) == sum(len(t) for t in en_texts)
assert str(m_doc) == "".join(en_texts)
assert len(m_doc.text) == sum(len(t) for t in en_texts)
assert m_doc.text == "".join(en_texts_without_empty)
p_token = m_doc[len(en_docs[0]) - 1]
assert p_token.text == "." and not bool(p_token.whitespace_)
en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -397,11 +406,12 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
assert m_doc[9].idx == think_idx
assert "group" in m_doc.spans
assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
assert bool(m_doc[11].whitespace_)

m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
# space delimiter considered, although spacy attribute was missing
assert str(m_doc) == " ".join(en_texts_without_empty)
assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
p_token = m_doc[len(en_docs[0]) - 1]
assert p_token.text == "." and bool(p_token.whitespace_)
en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -414,6 +424,16 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
# can merge empty docs
doc = Doc.from_docs([en_tokenizer("")] * 10)

# empty but set spans keys are preserved
en_docs = [en_tokenizer(text) for text in en_texts]
m_doc = Doc.from_docs(en_docs)
assert "group" not in m_doc.spans
for doc in en_docs:
doc.spans["group"] = []
m_doc = Doc.from_docs(en_docs)
assert "group" in m_doc.spans
assert len(m_doc.spans["group"]) == 0


def test_doc_api_from_docs_ents(en_tokenizer):
texts = ["Merging the docs is fun.", "They don't think alike."]
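Note (illustrative, not part of the diff): the new assertions cover Doc.from_docs, including that span group keys which are set but empty on the source docs are carried over to the merged doc. A small sketch mirroring the test:

import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
docs = [nlp("First doc."), nlp("Second doc.")]
for doc in docs:
    doc.spans["group"] = []  # key is set, but empty

merged = Doc.from_docs(docs)
assert "group" in merged.spans and len(merged.spans["group"]) == 0
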
13 changes: 7 additions & 6 deletions spacy/tests/lang/test_initialize.py
@@ -4,12 +4,13 @@

# fmt: off
# Only include languages with no external dependencies
# excluded: ja, ru, th, uk, vi, zh
LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
"et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is",
"it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk",
"sl", "sq", "sr", "sv", "ta", "te", "tl", "tn", "tr", "tt", "ur",
"yo"]
# excluded: ja, ko, th, vi, zh
LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el",
"en", "es", "et", "eu", "fa", "fi", "fr", "ga", "gu", "he", "hi",
"hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv",
"mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
"si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn",
"tr", "tt", "uk", "ur", "xx", "yo"]
# fmt: on


1 change: 1 addition & 0 deletions spacy/tests/matcher/test_matcher_api.py
@@ -481,6 +481,7 @@ def test_matcher_schema_token_attributes(en_vocab, pattern, text):
assert len(matches) == 1


@pytest.mark.filterwarnings("ignore:\\[W036")
def test_matcher_valid_callback(en_vocab):
"""Test that on_match can only be None or callable."""
matcher = Matcher(en_vocab)
1 change: 1 addition & 0 deletions spacy/tests/matcher/test_matcher_logic.py
@@ -180,6 +180,7 @@ def test_matcher_sets_return_correct_tokens(en_vocab):
assert texts == ["zero", "one", "two"]


@pytest.mark.filterwarnings("ignore:\\[W036")
def test_matcher_remove():
nlp = English()
matcher = Matcher(nlp.vocab)
(Remaining changed files not shown.)
