
Merge pull request #11956 from adrianeboyd/backport/v3.4.4
Backport bug fixes to v3.4.x
adrianeboyd authored Dec 14, 2022
2 parents 63673a7 + 39ccd67 commit 77833bf
Showing 19 changed files with 134 additions and 34 deletions.
2 changes: 1 addition & 1 deletion .github/azure-steps.yml
@@ -107,7 +107,7 @@ steps:
     displayName: "Run CPU tests"

   - script: |
-      python -m pip install --pre thinc-apple-ops
+      python -m pip install 'spacy[apple]'
       python -m pytest --pyargs spacy
     displayName: "Run CPU tests with thinc-apple-ops"
     condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
4 changes: 2 additions & 2 deletions azure-pipelines.yml
@@ -41,7 +41,7 @@ jobs:
       matrix:
         # We're only running one platform per Python version to speed up builds
         Python36Linux:
-          imageName: "ubuntu-latest"
+          imageName: "ubuntu-20.04"
           python.version: "3.6"
         # Python36Windows:
         #   imageName: "windows-latest"
@@ -50,7 +50,7 @@
         #   imageName: "macos-latest"
         #   python.version: "3.6"
         # Python37Linux:
-        #   imageName: "ubuntu-latest"
+        #   imageName: "ubuntu-20.04"
         #   python.version: "3.7"
         Python37Windows:
           imageName: "windows-latest"
1 change: 1 addition & 0 deletions requirements.txt
@@ -11,6 +11,7 @@ srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
 typer>=0.3.0,<0.8.0
 pathy>=0.3.5
+smart-open>=5.2.1,<7.0.0
 # Third party dependencies
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
1 change: 1 addition & 0 deletions setup.cfg
@@ -53,6 +53,7 @@ install_requires =
     # Third-party dependencies
     typer>=0.3.0,<0.8.0
     pathy>=0.3.5
+    smart-open>=5.2.1,<7.0.0
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0
     requests>=2.13.0,<3.0.0
2 changes: 1 addition & 1 deletion spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.4.3"
+__version__ = "3.4.4"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
2 changes: 1 addition & 1 deletion spacy/cli/_util.py
@@ -358,7 +358,7 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False)
     if dest.exists() and not force:
         return None
     src = str(src)
-    with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
+    with smart_open.open(src, mode="rb", compression="disable") as input_file:
         with dest.open(mode="wb") as output_file:
             shutil.copyfileobj(input_file, output_file)
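Context for this one-liner: smart-open 6.0 removed the deprecated ignore_ext flag in favor of the compression keyword, which is why requirements.txt and setup.cfg above pin smart-open>=5.2.1,<7.0.0. A minimal sketch of the new call, assuming smart-open >= 5.2 and a hypothetical file name:

import smart_open

# compression="disable" reads the stream verbatim; nothing is inferred
# from the ".gz" extension (equivalent to the old ignore_ext=True).
with smart_open.open("model.tar.gz", mode="rb", compression="disable") as f:
    data = f.read()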
2 changes: 1 addition & 1 deletion spacy/cli/templates/quickstart_training.jinja
@@ -1,7 +1,7 @@
 {# This is a template for training configs used for the quickstart widget in
 the docs and the init config command. It encodes various best practices and
 can help generate the best possible configuration, given a user's requirements. #}
-{%- set use_transformer = hardware != "cpu" -%}
+{%- set use_transformer = hardware != "cpu" and transformer_data -%}
 {%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
 {%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%}
 [paths]
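The guard above matters when no transformer recommendations exist for the selected language, for example when spacy-transformers is not installed; previously transformer_data[optimize] was evaluated on an empty mapping. A Python sketch of the intended logic, with names mirroring the template's jinja variables (assumptions, not spaCy API):

hardware = "gpu"
optimize = "efficiency"
transformer_data = {}  # no transformer metadata available

# With the fix, an empty transformer_data forces the non-transformer branch:
use_transformer = hardware != "cpu" and bool(transformer_data)
transformer = transformer_data[optimize] if use_transformer else {}
assert transformer == {}  # falls back to the CPU (tok2vec) configuration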
5 changes: 3 additions & 2 deletions spacy/displacy/__init__.py
@@ -228,12 +228,13 @@ def parse_spans(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
             "kb_id": span.kb_id_ if span.kb_id_ else "",
             "kb_url": kb_url_template.format(span.kb_id_) if kb_url_template else "#",
         }
-        for span in doc.spans[spans_key]
+        for span in doc.spans.get(spans_key, [])
     ]
     tokens = [token.text for token in doc]

     if not spans:
-        warnings.warn(Warnings.W117.format(spans_key=spans_key))
+        keys = list(doc.spans.keys())
+        warnings.warn(Warnings.W117.format(spans_key=spans_key, keys=keys))
     title = doc.user_data.get("title", None) if hasattr(doc, "user_data") else None
     settings = get_doc_settings(doc)
     return {
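The effect of the two changes above, as a hedged repro sketch (a blank English pipeline should be enough): a Doc whose spans live under a non-default key no longer raises a KeyError in parse_spans; it emits W117 and lists the available keys instead.

import warnings

import spacy
from spacy import displacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("Welcome to the Bank of China")
doc.spans["custom"] = [Span(doc, 3, 6, "BANK")]

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    parsed = displacy.parse_spans(doc)  # default spans_key "sc" is unset
assert any("W117" in str(w.message) for w in caught)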
7 changes: 6 additions & 1 deletion spacy/errors.py
@@ -199,7 +199,7 @@ class Warnings(metaclass=ErrorsWithCodes):
     W117 = ("No spans to visualize found in Doc object with spans_key: '{spans_key}'. If this is "
             "surprising to you, make sure the Doc was processed using a model "
             "that supports span categorization, and check the `doc.spans[spans_key]` "
-            "property manually if necessary.")
+            "property manually if necessary.\n\nAvailable keys: {keys}")
     W118 = ("Term '{term}' not found in glossary. It may however be explained in documentation "
             "for the corpora used to train the language. Please check "
             "`nlp.meta[\"sources\"]` for any relevant links.")
@@ -345,6 +345,11 @@ class Errors(metaclass=ErrorsWithCodes):
             "clear the existing vectors and resize the table.")
     E074 = ("Error interpreting compiled match pattern: patterns are expected "
             "to end with the attribute {attr}. Got: {bad_attr}.")
+    E079 = ("Error computing states in beam: number of predicted beams "
+            "({pbeams}) does not equal number of gold beams ({gbeams}).")
+    E080 = ("Duplicate state found in beam: {key}.")
+    E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
+            "does not equal number of losses ({losses}).")
     E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
             "projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
             "match.")
4 changes: 2 additions & 2 deletions spacy/pipeline/edit_tree_lemmatizer.py
@@ -328,9 +328,9 @@ def _add_labels(self, labels: Dict):

             tree = dict(tree)
             if "orig" in tree:
-                tree["orig"] = self.vocab.strings[tree["orig"]]
+                tree["orig"] = self.vocab.strings.add(tree["orig"])
             if "orig" in tree:
-                tree["subst"] = self.vocab.strings[tree["subst"]]
+                tree["subst"] = self.vocab.strings.add(tree["subst"])

             trees.append(tree)
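Why .add rather than item lookup: indexing spaCy's StringStore with a string computes its hash without interning the string, so a later reverse lookup by hash fails. A small sketch ("custom-suffix" is a made-up string):

from spacy.strings import StringStore

strings = StringStore()
h = strings["custom-suffix"]      # hashes the string but does not store it
# strings[h] here would raise KeyError: the string was never interned
h = strings.add("custom-suffix")  # interns it, enabling reverse lookup
assert strings[h] == "custom-suffix"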
5 changes: 4 additions & 1 deletion spacy/pipeline/spancat.py
@@ -272,7 +272,10 @@ def predict(self, docs: Iterable[Doc]):
         DOCS: https://spacy.io/api/spancategorizer#predict
         """
         indices = self.suggester(docs, ops=self.model.ops)
-        scores = self.model.predict((docs, indices))  # type: ignore
+        if indices.lengths.sum() == 0:
+            scores = self.model.ops.alloc2f(0, 0)
+        else:
+            scores = self.model.predict((docs, indices))  # type: ignore
         return indices, scores

     def set_candidates(
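On the guard itself: a suggester returns a Ragged whose data is an (N, 2) array of span boundaries and whose lengths give per-doc candidate counts; when every count is zero there is nothing to score, and calling the model on zero-length input is what previously failed. A sketch of the fast-path condition, assuming thinc's public Ragged and ops API:

from thinc.api import get_current_ops
from thinc.types import Ragged

ops = get_current_ops()
# Three docs, no suggested spans in any of them:
empty = Ragged(ops.xp.zeros((0, 0), dtype="i"), ops.asarray1i([0, 0, 0]))
assert empty.lengths.sum() == 0  # triggers the alloc2f(0, 0) branch above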
4 changes: 2 additions & 2 deletions spacy/tests/doc/test_array.py
@@ -123,14 +123,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):

     # head before start
     arr = doc.to_array(["HEAD"])
-    arr[0] = -1
+    arr[0] = numpy.int32(-1).astype(numpy.uint64)
     doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)

     # head after end
     arr = doc.to_array(["HEAD"])
-    arr[0] = 5
+    arr[0] = numpy.int32(5).astype(numpy.uint64)
     doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)
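The casts in this test (and the matching changes in doc.pyx, span.pyx, and example.pyx below) are most likely about NumPy 1.24's stricter casting rules: assigning a negative Python int into an unsigned array now raises OverflowError, so the wrap-around is made explicit. A sketch:

import numpy

arr = numpy.zeros(3, dtype=numpy.uint64)
arr[0] = numpy.int32(-1).astype(numpy.uint64)  # deliberate modular wrap
assert arr[0] == numpy.uint64(2**64 - 1)
# arr[1] = -1  # OverflowError on NumPy >= 1.24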
37 changes: 36 additions & 1 deletion spacy/tests/pipeline/test_edit_tree_lemmatizer.py
@@ -60,10 +60,45 @@ def test_initialize_from_labels():
     nlp2 = Language()
     lemmatizer2 = nlp2.add_pipe("trainable_lemmatizer")
     lemmatizer2.initialize(
-        get_examples=lambda: train_examples,
+        # We want to check that the strings in replacement nodes are
+        # added to the string store. Avoid that they get added through
+        # the examples.
+        get_examples=lambda: train_examples[:1],
         labels=lemmatizer.label_data,
     )
     assert lemmatizer2.tree2label == {1: 0, 3: 1, 4: 2, 6: 3}
+    assert lemmatizer2.label_data == {
+        "trees": [
+            {"orig": "S", "subst": "s"},
+            {
+                "prefix_len": 1,
+                "suffix_len": 0,
+                "prefix_tree": 0,
+                "suffix_tree": 4294967295,
+            },
+            {"orig": "s", "subst": ""},
+            {
+                "prefix_len": 0,
+                "suffix_len": 1,
+                "prefix_tree": 4294967295,
+                "suffix_tree": 2,
+            },
+            {
+                "prefix_len": 0,
+                "suffix_len": 0,
+                "prefix_tree": 4294967295,
+                "suffix_tree": 4294967295,
+            },
+            {"orig": "E", "subst": "e"},
+            {
+                "prefix_len": 1,
+                "suffix_len": 0,
+                "prefix_tree": 5,
+                "suffix_tree": 4294967295,
+            },
+        ],
+        "labels": (1, 3, 4, 6),
+    }


 def test_no_data():
45 changes: 35 additions & 10 deletions spacy/tests/pipeline/test_spancat.py
@@ -372,31 +372,56 @@ def test_overfitting_IO_overlapping():


 def test_zero_suggestions():
-    # Test with a suggester that returns 0 suggestions
+    # Test with a suggester that can return 0 suggestions

-    @registry.misc("test_zero_suggester")
-    def make_zero_suggester():
-        def zero_suggester(docs, *, ops=None):
+    @registry.misc("test_mixed_zero_suggester")
+    def make_mixed_zero_suggester():
+        def mixed_zero_suggester(docs, *, ops=None):
             if ops is None:
                 ops = get_current_ops()
-            return Ragged(
-                ops.xp.zeros((0, 0), dtype="i"), ops.xp.zeros((len(docs),), dtype="i")
-            )
+            spans = []
+            lengths = []
+            for doc in docs:
+                if len(doc) > 0 and len(doc) % 2 == 0:
+                    spans.append((0, 1))
+                    lengths.append(1)
+                else:
+                    lengths.append(0)
+            spans = ops.asarray2i(spans)
+            lengths_array = ops.asarray1i(lengths)
+            if len(spans) > 0:
+                output = Ragged(ops.xp.vstack(spans), lengths_array)
+            else:
+                output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array)
+            return output

-        return zero_suggester
+        return mixed_zero_suggester

     fix_random_seed(0)
     nlp = English()
     spancat = nlp.add_pipe(
         "spancat",
-        config={"suggester": {"@misc": "test_zero_suggester"}, "spans_key": SPAN_KEY},
+        config={
+            "suggester": {"@misc": "test_mixed_zero_suggester"},
+            "spans_key": SPAN_KEY,
+        },
     )
     train_examples = make_examples(nlp)
     optimizer = nlp.initialize(get_examples=lambda: train_examples)
     assert spancat.model.get_dim("nO") == 2
     assert set(spancat.labels) == {"LOC", "PERSON"}

     nlp.update(train_examples, sgd=optimizer)
+    # empty doc
+    nlp("")
+    # single doc with zero suggestions
+    nlp("one")
+    # single doc with one suggestion
+    nlp("two two")
+    # batch with mixed zero/one suggestions
+    list(nlp.pipe(["one", "two two", "three three three", "", "four four four four"]))
+    # batch with no suggestions
+    list(nlp.pipe(["", "one", "three three three"]))


 def test_set_candidates():
16 changes: 16 additions & 0 deletions spacy/tests/test_cli.py
@@ -16,6 +16,7 @@
 from spacy.cli._util import parse_config_overrides, string_to_list
 from spacy.cli._util import substitute_project_variables
 from spacy.cli._util import validate_project_commands
+from spacy.cli._util import upload_file, download_file
 from spacy.cli.debug_data import _compile_gold, _get_labels_from_model
 from spacy.cli.debug_data import _get_labels_from_spancat
 from spacy.cli.debug_data import _get_distribution, _get_kl_divergence
@@ -896,3 +897,18 @@ def test_project_check_requirements(reqs, output):
         pkg_resources.require("spacyunknowndoesnotexist12345")
     except pkg_resources.DistributionNotFound:
         assert output == _check_requirements([req.strip() for req in reqs.split("\n")])
+
+
+def test_upload_download_local_file():
+    with make_tempdir() as d1, make_tempdir() as d2:
+        filename = "f.txt"
+        content = "content"
+        local_file = d1 / filename
+        remote_file = d2 / filename
+        with local_file.open(mode="w") as file_:
+            file_.write(content)
+        upload_file(local_file, remote_file)
+        local_file.unlink()
+        download_file(remote_file, local_file)
+        with local_file.open(mode="r") as file_:
+            assert file_.read() == content
10 changes: 10 additions & 0 deletions spacy/tests/test_displacy.py
@@ -203,6 +203,16 @@ def test_displacy_parse_spans_different_spans_key(en_vocab):
     ]


+def test_displacy_parse_empty_spans_key(en_vocab):
+    """Test that having an unset spans key doesn't raise an error"""
+    doc = Doc(en_vocab, words=["Welcome", "to", "the", "Bank", "of", "China"])
+    doc.spans["custom"] = [Span(doc, 3, 6, "BANK")]
+    with pytest.warns(UserWarning, match="W117"):
+        spans = displacy.parse_spans(doc)
+
+    assert isinstance(spans, dict)
+
+
 def test_displacy_parse_ents(en_vocab):
     """Test that named entities on a Doc are converted into displaCy's format."""
     doc = Doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
2 changes: 2 additions & 0 deletions spacy/tokens/doc.pyx
@@ -359,6 +359,7 @@ cdef class Doc:
         for annot in annotations:
             if annot:
                 if annot is heads or annot is sent_starts or annot is ent_iobs:
+                    annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
                     for i in range(len(words)):
                         if attrs.ndim == 1:
                             attrs[i] = annot[i]
@@ -1558,6 +1559,7 @@ cdef class Doc:

         for j, (attr, annot) in enumerate(token_annotations.items()):
             if attr is HEAD:
+                annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
                 for i in range(len(words)):
                     array[i, j] = annot[i]
             elif attr is MORPH:
4 changes: 2 additions & 2 deletions spacy/tokens/span.pyx
@@ -299,15 +299,15 @@ cdef class Span:
             for ancestor in ancestors:
                 ancestor_i = ancestor.i - self.c.start
                 if ancestor_i in range(length):
-                    array[i, head_col] = ancestor_i - i
+                    array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64)

             # if there is no appropriate ancestor, define a new artificial root
             value = array[i, head_col]
             if (i+value) not in range(length):
                 new_root = old_to_new_root.get(ancestor_i, None)
                 if new_root is not None:
                     # take the same artificial root as a previous token from the same sentence
-                    array[i, head_col] = new_root - i
+                    array[i, head_col] = numpy.int32(new_root - i).astype(numpy.uint64)
                 else:
                     # set this token as the new artificial root
                     array[i, head_col] = 0
15 changes: 8 additions & 7 deletions spacy/training/example.pyx
@@ -443,26 +443,27 @@ def _annot2array(vocab, tok_annot, doc_annot):
         if key not in IDS:
             raise ValueError(Errors.E974.format(obj="token", key=key))
         elif key in ["ORTH", "SPACY"]:
-            pass
+            continue
         elif key == "HEAD":
             attrs.append(key)
-            values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
+            row = [h-i if h is not None else 0 for i, h in enumerate(value)]
         elif key == "DEP":
             attrs.append(key)
-            values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
+            row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]
         elif key == "SENT_START":
             attrs.append(key)
-            values.append([to_ternary_int(v) for v in value])
+            row = [to_ternary_int(v) for v in value]
         elif key == "MORPH":
             attrs.append(key)
-            values.append([vocab.morphology.add(v) for v in value])
+            row = [vocab.morphology.add(v) for v in value]
         else:
             attrs.append(key)
             if not all(isinstance(v, str) for v in value):
                 types = set([type(v) for v in value])
                 raise TypeError(Errors.E969.format(field=key, types=types)) from None
-            values.append([vocab.strings.add(v) for v in value])
-    array = numpy.asarray(values, dtype="uint64")
+            row = [vocab.strings.add(v) for v in value]
+        values.append([numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v for v in row])
+    array = numpy.array(values, dtype=numpy.uint64)
     return attrs, array.T
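A usage sketch for the refactor above: relative heads (h - i) are negative whenever a head lies to the left of its token, and such rows previously broke the direct uint64 conversion on newer NumPy. The annotations here are illustrative only:

from spacy.lang.en import English
from spacy.training import Example

nlp = English()
doc = nlp.make_doc("eggs ham")
annots = {"words": ["eggs", "ham"], "heads": [0, 0], "deps": ["ROOT", "conj"]}
example = Example.from_dict(doc, annots)  # token 1's relative head is -1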
