
Merge pull request #11956 from adrianeboyd/backport/v3.4.4
Backport bug fixes to v3.4.x
adrianeboyd authored Dec 14, 2022
2 parents 63673a7 + 39ccd67 commit 77833bf
Showing 19 changed files with 134 additions and 34 deletions.
2 changes: 1 addition & 1 deletion .github/azure-steps.yml
@@ -107,7 +107,7 @@ steps:
     displayName: "Run CPU tests"

   - script: |
-      python -m pip install --pre thinc-apple-ops
+      python -m pip install 'spacy[apple]'
       python -m pytest --pyargs spacy
     displayName: "Run CPU tests with thinc-apple-ops"
     condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
4 changes: 2 additions & 2 deletions azure-pipelines.yml
@@ -41,7 +41,7 @@ jobs:
       matrix:
         # We're only running one platform per Python version to speed up builds
         Python36Linux:
-          imageName: "ubuntu-latest"
+          imageName: "ubuntu-20.04"
           python.version: "3.6"
         # Python36Windows:
         #   imageName: "windows-latest"
@@ -50,7 +50,7 @@
         #   imageName: "macos-latest"
         #   python.version: "3.6"
         # Python37Linux:
-        #   imageName: "ubuntu-latest"
+        #   imageName: "ubuntu-20.04"
         #   python.version: "3.7"
         Python37Windows:
           imageName: "windows-latest"
1 change: 1 addition & 0 deletions requirements.txt
@@ -11,6 +11,7 @@ srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
 typer>=0.3.0,<0.8.0
 pathy>=0.3.5
+smart-open>=5.2.1,<7.0.0
 # Third party dependencies
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
1 change: 1 addition & 0 deletions setup.cfg
@@ -53,6 +53,7 @@ install_requires =
     # Third-party dependencies
     typer>=0.3.0,<0.8.0
     pathy>=0.3.5
+    smart-open>=5.2.1,<7.0.0
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0
     requests>=2.13.0,<3.0.0
2 changes: 1 addition & 1 deletion spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.4.3"
+__version__ = "3.4.4"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
2 changes: 1 addition & 1 deletion spacy/cli/_util.py
@@ -358,7 +358,7 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False)
     if dest.exists() and not force:
         return None
     src = str(src)
-    with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
+    with smart_open.open(src, mode="rb", compression="disable") as input_file:
         with dest.open(mode="wb") as output_file:
             shutil.copyfileobj(input_file, output_file)
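Context for this one-liner: smart-open 6.0 removed the deprecated ignore_ext flag in favor of the compression keyword, which is why requirements.txt and setup.cfg above pin smart-open>=5.2.1,<7.0.0. A minimal sketch of the new call, assuming smart-open >= 5.2 and a hypothetical file name:

import smart_open

# compression="disable" reads the stream verbatim; nothing is inferred
# from the ".gz" extension (equivalent to the old ignore_ext=True).
with smart_open.open("model.tar.gz", mode="rb", compression="disable") as f:
    data = f.read()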
2 changes: 1 addition & 1 deletion spacy/cli/templates/quickstart_training.jinja
@@ -1,7 +1,7 @@
 {# This is a template for training configs used for the quickstart widget in
 the docs and the init config command. It encodes various best practices and
 can help generate the best possible configuration, given a user's requirements. #}
-{%- set use_transformer = hardware != "cpu" -%}
+{%- set use_transformer = hardware != "cpu" and transformer_data -%}
 {%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
 {%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%}
 [paths]
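The guard above matters when no transformer recommendations exist for the selected language, for example when spacy-transformers is not installed; previously transformer_data[optimize] was evaluated on an empty mapping. A Python sketch of the intended logic, with names mirroring the template's jinja variables (assumptions, not spaCy API):

hardware = "gpu"
optimize = "efficiency"
transformer_data = {}  # no transformer metadata available

# With the fix, an empty transformer_data forces the non-transformer branch:
use_transformer = hardware != "cpu" and bool(transformer_data)
transformer = transformer_data[optimize] if use_transformer else {}
assert transformer == {}  # falls back to the CPU (tok2vec) configuration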
5 changes: 3 additions & 2 deletions spacy/displacy/__init__.py
@@ -228,12 +228,13 @@ def parse_spans(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
             "kb_id": span.kb_id_ if span.kb_id_ else "",
             "kb_url": kb_url_template.format(span.kb_id_) if kb_url_template else "#",
         }
-        for span in doc.spans[spans_key]
+        for span in doc.spans.get(spans_key, [])
     ]
     tokens = [token.text for token in doc]

     if not spans:
-        warnings.warn(Warnings.W117.format(spans_key=spans_key))
+        keys = list(doc.spans.keys())
+        warnings.warn(Warnings.W117.format(spans_key=spans_key, keys=keys))
     title = doc.user_data.get("title", None) if hasattr(doc, "user_data") else None
     settings = get_doc_settings(doc)
     return {
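The effect of the two changes above, as a hedged repro sketch (a blank English pipeline should be enough): a Doc whose spans live under a non-default key no longer raises a KeyError in parse_spans; it emits W117 and lists the available keys instead.

import warnings

import spacy
from spacy import displacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("Welcome to the Bank of China")
doc.spans["custom"] = [Span(doc, 3, 6, "BANK")]

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    parsed = displacy.parse_spans(doc)  # default spans_key "sc" is unset
assert any("W117" in str(w.message) for w in caught)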
7 changes: 6 additions & 1 deletion spacy/errors.py
@@ -199,7 +199,7 @@ class Warnings(metaclass=ErrorsWithCodes):
     W117 = ("No spans to visualize found in Doc object with spans_key: '{spans_key}'. If this is "
             "surprising to you, make sure the Doc was processed using a model "
             "that supports span categorization, and check the `doc.spans[spans_key]` "
-            "property manually if necessary.")
+            "property manually if necessary.\n\nAvailable keys: {keys}")
     W118 = ("Term '{term}' not found in glossary. It may however be explained in documentation "
             "for the corpora used to train the language. Please check "
             "`nlp.meta[\"sources\"]` for any relevant links.")
@@ -345,6 +345,11 @@ class Errors(metaclass=ErrorsWithCodes):
             "clear the existing vectors and resize the table.")
     E074 = ("Error interpreting compiled match pattern: patterns are expected "
             "to end with the attribute {attr}. Got: {bad_attr}.")
+    E079 = ("Error computing states in beam: number of predicted beams "
+            "({pbeams}) does not equal number of gold beams ({gbeams}).")
+    E080 = ("Duplicate state found in beam: {key}.")
+    E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
+            "does not equal number of losses ({losses}).")
     E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
             "projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
             "match.")
4 changes: 2 additions & 2 deletions spacy/pipeline/edit_tree_lemmatizer.py
@@ -328,9 +328,9 @@ def _add_labels(self, labels: Dict):

             tree = dict(tree)
             if "orig" in tree:
-                tree["orig"] = self.vocab.strings[tree["orig"]]
+                tree["orig"] = self.vocab.strings.add(tree["orig"])
             if "orig" in tree:
-                tree["subst"] = self.vocab.strings[tree["subst"]]
+                tree["subst"] = self.vocab.strings.add(tree["subst"])

             trees.append(tree)
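Why .add rather than item lookup: indexing spaCy's StringStore with a string computes its hash without interning the string, so a later reverse lookup by hash fails. A small sketch ("custom-suffix" is a made-up string):

from spacy.strings import StringStore

strings = StringStore()
h = strings["custom-suffix"]      # hashes the string but does not store it
# strings[h] here would raise KeyError: the string was never interned
h = strings.add("custom-suffix")  # interns it, enabling reverse lookup
assert strings[h] == "custom-suffix"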
5 changes: 4 additions & 1 deletion spacy/pipeline/spancat.py
@@ -272,7 +272,10 @@ def predict(self, docs: Iterable[Doc]):
         DOCS: https://spacy.io/api/spancategorizer#predict
         """
         indices = self.suggester(docs, ops=self.model.ops)
-        scores = self.model.predict((docs, indices))  # type: ignore
+        if indices.lengths.sum() == 0:
+            scores = self.model.ops.alloc2f(0, 0)
+        else:
+            scores = self.model.predict((docs, indices))  # type: ignore
         return indices, scores

     def set_candidates(
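On the guard itself: a suggester returns a Ragged whose data is an (N, 2) array of span boundaries and whose lengths give per-doc candidate counts; when every count is zero there is nothing to score, and calling the model on zero-length input is what previously failed. A sketch of the fast-path condition, assuming thinc's public Ragged and ops API:

from thinc.api import get_current_ops
from thinc.types import Ragged

ops = get_current_ops()
# Three docs, no suggested spans in any of them:
empty = Ragged(ops.xp.zeros((0, 0), dtype="i"), ops.asarray1i([0, 0, 0]))
assert empty.lengths.sum() == 0  # triggers the alloc2f(0, 0) branch above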
4 changes: 2 additions & 2 deletions spacy/tests/doc/test_array.py
@@ -123,14 +123,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):

     # head before start
     arr = doc.to_array(["HEAD"])
-    arr[0] = -1
+    arr[0] = numpy.int32(-1).astype(numpy.uint64)
     doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)

     # head after end
     arr = doc.to_array(["HEAD"])
-    arr[0] = 5
+    arr[0] = numpy.int32(5).astype(numpy.uint64)
     doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)
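The casts in this test (and the matching changes in doc.pyx, span.pyx, and example.pyx below) are most likely about NumPy 1.24's stricter casting rules: assigning a negative Python int into an unsigned array now raises OverflowError, so the wrap-around is made explicit. A sketch:

import numpy

arr = numpy.zeros(3, dtype=numpy.uint64)
arr[0] = numpy.int32(-1).astype(numpy.uint64)  # deliberate modular wrap
assert arr[0] == numpy.uint64(2**64 - 1)
# arr[1] = -1  # OverflowError on NumPy >= 1.24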
37 changes: 36 additions & 1 deletion spacy/tests/pipeline/test_edit_tree_lemmatizer.py
@@ -60,10 +60,45 @@ def test_initialize_from_labels():
     nlp2 = Language()
     lemmatizer2 = nlp2.add_pipe("trainable_lemmatizer")
     lemmatizer2.initialize(
-        get_examples=lambda: train_examples,
+        # We want to check that the strings in replacement nodes are
+        # added to the string store. Avoid that they get added through
+        # the examples.
+        get_examples=lambda: train_examples[:1],
         labels=lemmatizer.label_data,
     )
     assert lemmatizer2.tree2label == {1: 0, 3: 1, 4: 2, 6: 3}
+    assert lemmatizer2.label_data == {
+        "trees": [
+            {"orig": "S", "subst": "s"},
+            {
+                "prefix_len": 1,
+                "suffix_len": 0,
+                "prefix_tree": 0,
+                "suffix_tree": 4294967295,
+            },
+            {"orig": "s", "subst": ""},
+            {
+                "prefix_len": 0,
+                "suffix_len": 1,
+                "prefix_tree": 4294967295,
+                "suffix_tree": 2,
+            },
+            {
+                "prefix_len": 0,
+                "suffix_len": 0,
+                "prefix_tree": 4294967295,
+                "suffix_tree": 4294967295,
+            },
+            {"orig": "E", "subst": "e"},
+            {
+                "prefix_len": 1,
+                "suffix_len": 0,
+                "prefix_tree": 5,
+                "suffix_tree": 4294967295,
+            },
+        ],
+        "labels": (1, 3, 4, 6),
+    }


 def test_no_data():
45 changes: 35 additions & 10 deletions spacy/tests/pipeline/test_spancat.py
@@ -372,31 +372,56 @@ def test_overfitting_IO_overlapping():


 def test_zero_suggestions():
-    # Test with a suggester that returns 0 suggestions
+    # Test with a suggester that can return 0 suggestions

-    @registry.misc("test_zero_suggester")
-    def make_zero_suggester():
-        def zero_suggester(docs, *, ops=None):
+    @registry.misc("test_mixed_zero_suggester")
+    def make_mixed_zero_suggester():
+        def mixed_zero_suggester(docs, *, ops=None):
             if ops is None:
                 ops = get_current_ops()
-            return Ragged(
-                ops.xp.zeros((0, 0), dtype="i"), ops.xp.zeros((len(docs),), dtype="i")
-            )
+            spans = []
+            lengths = []
+            for doc in docs:
+                if len(doc) > 0 and len(doc) % 2 == 0:
+                    spans.append((0, 1))
+                    lengths.append(1)
+                else:
+                    lengths.append(0)
+            spans = ops.asarray2i(spans)
+            lengths_array = ops.asarray1i(lengths)
+            if len(spans) > 0:
+                output = Ragged(ops.xp.vstack(spans), lengths_array)
+            else:
+                output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array)
+            return output

-        return zero_suggester
+        return mixed_zero_suggester

     fix_random_seed(0)
     nlp = English()
     spancat = nlp.add_pipe(
         "spancat",
-        config={"suggester": {"@misc": "test_zero_suggester"}, "spans_key": SPAN_KEY},
+        config={
+            "suggester": {"@misc": "test_mixed_zero_suggester"},
+            "spans_key": SPAN_KEY,
+        },
     )
     train_examples = make_examples(nlp)
     optimizer = nlp.initialize(get_examples=lambda: train_examples)
     assert spancat.model.get_dim("nO") == 2
     assert set(spancat.labels) == {"LOC", "PERSON"}

     nlp.update(train_examples, sgd=optimizer)
+    # empty doc
+    nlp("")
+    # single doc with zero suggestions
+    nlp("one")
+    # single doc with one suggestion
+    nlp("two two")
+    # batch with mixed zero/one suggestions
+    list(nlp.pipe(["one", "two two", "three three three", "", "four four four four"]))
+    # batch with no suggestions
+    list(nlp.pipe(["", "one", "three three three"]))


 def test_set_candidates():
16 changes: 16 additions & 0 deletions spacy/tests/test_cli.py
@@ -16,6 +16,7 @@
 from spacy.cli._util import parse_config_overrides, string_to_list
 from spacy.cli._util import substitute_project_variables
 from spacy.cli._util import validate_project_commands
+from spacy.cli._util import upload_file, download_file
 from spacy.cli.debug_data import _compile_gold, _get_labels_from_model
 from spacy.cli.debug_data import _get_labels_from_spancat
 from spacy.cli.debug_data import _get_distribution, _get_kl_divergence
@@ -896,3 +897,18 @@ def test_project_check_requirements(reqs, output):
         pkg_resources.require("spacyunknowndoesnotexist12345")
     except pkg_resources.DistributionNotFound:
         assert output == _check_requirements([req.strip() for req in reqs.split("\n")])
+
+
+def test_upload_download_local_file():
+    with make_tempdir() as d1, make_tempdir() as d2:
+        filename = "f.txt"
+        content = "content"
+        local_file = d1 / filename
+        remote_file = d2 / filename
+        with local_file.open(mode="w") as file_:
+            file_.write(content)
+        upload_file(local_file, remote_file)
+        local_file.unlink()
+        download_file(remote_file, local_file)
+        with local_file.open(mode="r") as file_:
+            assert file_.read() == content
10 changes: 10 additions & 0 deletions spacy/tests/test_displacy.py
@@ -203,6 +203,16 @@ def test_displacy_parse_spans_different_spans_key(en_vocab):
     ]


+def test_displacy_parse_empty_spans_key(en_vocab):
+    """Test that having an unset spans key doesn't raise an error"""
+    doc = Doc(en_vocab, words=["Welcome", "to", "the", "Bank", "of", "China"])
+    doc.spans["custom"] = [Span(doc, 3, 6, "BANK")]
+    with pytest.warns(UserWarning, match="W117"):
+        spans = displacy.parse_spans(doc)
+
+    assert isinstance(spans, dict)
+
+
 def test_displacy_parse_ents(en_vocab):
     """Test that named entities on a Doc are converted into displaCy's format."""
     doc = Doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
2 changes: 2 additions & 0 deletions spacy/tokens/doc.pyx
@@ -359,6 +359,7 @@ cdef class Doc:
         for annot in annotations:
             if annot:
                 if annot is heads or annot is sent_starts or annot is ent_iobs:
+                    annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
                     for i in range(len(words)):
                         if attrs.ndim == 1:
                             attrs[i] = annot[i]
@@ -1558,6 +1559,7 @@ cdef class Doc:

         for j, (attr, annot) in enumerate(token_annotations.items()):
             if attr is HEAD:
+                annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
                 for i in range(len(words)):
                     array[i, j] = annot[i]
             elif attr is MORPH:
4 changes: 2 additions & 2 deletions spacy/tokens/span.pyx
@@ -299,15 +299,15 @@ cdef class Span:
             for ancestor in ancestors:
                 ancestor_i = ancestor.i - self.c.start
                 if ancestor_i in range(length):
-                    array[i, head_col] = ancestor_i - i
+                    array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64)

             # if there is no appropriate ancestor, define a new artificial root
             value = array[i, head_col]
             if (i+value) not in range(length):
                 new_root = old_to_new_root.get(ancestor_i, None)
                 if new_root is not None:
                     # take the same artificial root as a previous token from the same sentence
-                    array[i, head_col] = new_root - i
+                    array[i, head_col] = numpy.int32(new_root - i).astype(numpy.uint64)
                 else:
                     # set this token as the new artificial root
                     array[i, head_col] = 0
15 changes: 8 additions & 7 deletions spacy/training/example.pyx
@@ -443,26 +443,27 @@ def _annot2array(vocab, tok_annot, doc_annot):
         if key not in IDS:
             raise ValueError(Errors.E974.format(obj="token", key=key))
         elif key in ["ORTH", "SPACY"]:
-            pass
+            continue
         elif key == "HEAD":
             attrs.append(key)
-            values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
+            row = [h-i if h is not None else 0 for i, h in enumerate(value)]
         elif key == "DEP":
             attrs.append(key)
-            values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
+            row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]
         elif key == "SENT_START":
             attrs.append(key)
-            values.append([to_ternary_int(v) for v in value])
+            row = [to_ternary_int(v) for v in value]
         elif key == "MORPH":
             attrs.append(key)
-            values.append([vocab.morphology.add(v) for v in value])
+            row = [vocab.morphology.add(v) for v in value]
         else:
             attrs.append(key)
             if not all(isinstance(v, str) for v in value):
                 types = set([type(v) for v in value])
                 raise TypeError(Errors.E969.format(field=key, types=types)) from None
-            values.append([vocab.strings.add(v) for v in value])
-    array = numpy.asarray(values, dtype="uint64")
+            row = [vocab.strings.add(v) for v in value]
+        values.append([numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v for v in row])
+    array = numpy.array(values, dtype=numpy.uint64)
     return attrs, array.T
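A usage sketch for the refactor above: relative heads (h - i) are negative whenever a head lies to the left of its token, and such rows previously broke the direct uint64 conversion on newer NumPy. The annotations here are illustrative only:

from spacy.lang.en import English
from spacy.training import Example

nlp = English()
doc = nlp.make_doc("eggs ham")
annots = {"words": ["eggs", "ham"], "heads": [0, 0], "deps": ["ROOT", "conj"]}
example = Example.from_dict(doc, annots)  # token 1's relative head is -1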
