Commit
Merge pull request #8787 from adrianeboyd/chore/backport-v3.0.7
Backport bug fixes to v3.0.x
adrianeboyd authored Jul 21, 2021
2 parents 02e1892 + 0080454 commit 034ac0a
Showing 33 changed files with 209 additions and 105 deletions.
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -8,3 +8,4 @@ recursive-exclude spacy/lang *.json
recursive-include spacy/lang *.json.gz
recursive-include spacy/cli *.json *.yml
recursive-include licenses *
recursive-exclude spacy *.cpp
2 changes: 1 addition & 1 deletion spacy/about.py
@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
__version__ = "3.0.6"
__version__ = "3.0.7"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"
3 changes: 2 additions & 1 deletion spacy/cli/convert.py
@@ -115,7 +115,8 @@ def convert(
ner_map = srsly.read_json(ner_map) if ner_map is not None else None
doc_files = []
for input_loc in walk_directory(Path(input_path), converter):
input_data = input_loc.open("r", encoding="utf-8").read()
with input_loc.open("r", encoding="utf-8") as infile:
input_data = infile.read()
# Use converter function to convert data
func = CONVERTERS[converter]
docs = func(
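Note (illustrative, not part of the diff): the fix above reads the input through a with-block so the file handle is always closed, even if reading fails. A minimal sketch of the pattern, using a hypothetical path:

from pathlib import Path

input_loc = Path("corpus.json")  # hypothetical input file
with input_loc.open("r", encoding="utf-8") as infile:
    input_data = infile.read()  # handle is closed when the block exits
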
2 changes: 1 addition & 1 deletion spacy/cli/package.py
@@ -18,7 +18,7 @@ def package_cli(
output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python file with additional code (registered functions) to be included in the package"),
meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"),
create_meta: bool = Opt(False, "--create-meta", "-C", help="Create meta.json, even if one exists"),
name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"),
version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"),
build: str = Opt("sdist", "--build", "-b", help="Comma-separated formats to build: sdist and/or wheel, or none."),
2 changes: 1 addition & 1 deletion spacy/cli/templates/quickstart_training.jinja
@@ -418,7 +418,7 @@ compound = 1.001

[initialize]
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
vectors = null
vectors = ${paths.vectors}
{% else -%}
vectors = "{{ word_vectors }}"
{% endif -%}
5 changes: 5 additions & 0 deletions spacy/errors.py
@@ -518,6 +518,11 @@ class Errors:
E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")

# New errors added in v3.x
E867 = ("The 'textcat' component requires at least two labels because it "
"uses mutually exclusive classes where exactly one label is True "
"for each doc. For binary classification tasks, you can use two "
"labels with 'textcat' (LABEL / NOT_LABEL) or alternatively, you "
"can use the 'textcat_multilabel' component with one label.")
E870 = ("Could not serialize the DocBin because it is too large. Consider "
"splitting up your documents into several doc bins and serializing "
"each separately. spacy.Corpus.v1 will search recursively for all "
5 changes: 0 additions & 5 deletions spacy/lang/az/__init__.py
@@ -1,16 +1,11 @@
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .lex_attrs import LEX_ATTRS
from ...language import Language


class AzerbaijaniDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
token_match = TOKEN_MATCH
syntax_iterators = SYNTAX_ITERATORS


class Azerbaijani(Language):
2 changes: 1 addition & 1 deletion spacy/lang/el/lemmatizer.py
@@ -57,6 +57,6 @@ def rule_lemmatize(self, token: Token) -> List[str]:
forms.extend(oov_forms)
if not forms:
forms.append(string)
forms = list(set(forms))
forms = list(dict.fromkeys(forms))
self.cache[cache_key] = forms
return forms
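
Note (illustrative, not part of the diff): dict.fromkeys deduplicates while keeping the order in which candidate forms were generated, whereas set gives no order guarantee. A minimal sketch:

forms = ["b", "a", "b", "c"]
print(list(set(forms)))            # deduplicated, but order not guaranteed
print(list(dict.fromkeys(forms)))  # ['b', 'a', 'c'] -- keeps first-seen order
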
5 changes: 2 additions & 3 deletions spacy/lang/ru/lemmatizer.py
@@ -12,7 +12,6 @@


class RussianLemmatizer(Lemmatizer):
_morph = None

def __init__(
self,
@@ -31,8 +30,8 @@ def __init__(
"The Russian lemmatizer mode 'pymorphy2' requires the "
"pymorphy2 library. Install it with: pip install pymorphy2"
) from None
if RussianLemmatizer._morph is None:
RussianLemmatizer._morph = MorphAnalyzer()
if getattr(self, "_morph", None) is None:
self._morph = MorphAnalyzer()
super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)

def pymorphy2_lemmatize(self, token: Token) -> List[str]:
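Note (illustrative, not part of the diff): the change above replaces the class-level cache with lazy per-instance initialization, so each lemmatizer owns its analyzer rather than sharing one through the class and its subclasses. A small sketch of the getattr pattern, with a stand-in for MorphAnalyzer:

class LazyLemmatizer:
    def __init__(self):
        # build the expensive analyzer only if this instance lacks one
        if getattr(self, "_morph", None) is None:
            self._morph = object()  # stand-in for MorphAnalyzer()

a, b = LazyLemmatizer(), LazyLemmatizer()
assert a._morph is not b._morph  # each instance gets its own analyzer
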
6 changes: 2 additions & 4 deletions spacy/lang/uk/lemmatizer.py
@@ -7,8 +7,6 @@


class UkrainianLemmatizer(RussianLemmatizer):
_morph = None

def __init__(
self,
vocab: Vocab,
@@ -27,6 +25,6 @@ def __init__(
"pymorphy2 library and dictionaries. Install them with: "
"pip install pymorphy2 pymorphy2-dicts-uk"
) from None
if UkrainianLemmatizer._morph is None:
UkrainianLemmatizer._morph = MorphAnalyzer(lang="uk")
if getattr(self, "_morph", None) is None:
self._morph = MorphAnalyzer(lang="uk")
super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
2 changes: 2 additions & 0 deletions spacy/matcher/phrasematcher.pyx
@@ -50,6 +50,8 @@ cdef class PhraseMatcher:
if isinstance(attr, (int, long)):
self.attr = attr
else:
if attr is None:
attr = "ORTH"
attr = attr.upper()
if attr == "TEXT":
attr = "ORTH"
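Note (illustrative, not part of the diff): with the guard above, attr=None falls back to the default "ORTH" attribute instead of failing on attr.upper(). A hedged usage sketch:

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab, attr=None)  # now behaves like the default "ORTH"
matcher.add("OBAMA", [nlp("Barack Obama")])
print(matcher(nlp("Barack Obama was the 44th president.")))
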
3 changes: 2 additions & 1 deletion spacy/ml/models/multi_task.py
@@ -3,7 +3,7 @@
from thinc.api import MultiSoftmax, list2array
from thinc.api import to_categorical, CosineDistance, L2Distance

from ...util import registry
from ...util import registry, OOV_RANK
from ...errors import Errors
from ...attrs import ID

@@ -70,6 +70,7 @@ def get_vectors_loss(ops, docs, prediction, distance):
# and look them up all at once. This prevents data copying.
ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
target = docs[0].vocab.vectors.data[ids]
target[ids == OOV_RANK] = 0
d_target, loss = distance(prediction, target)
return loss, d_target

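Note (illustrative, not part of the diff): the added line zeroes the target vectors of out-of-vocabulary tokens so they do not contribute a meaningless gradient to the vectors loss. A minimal numpy sketch, using a stand-in value for spacy.util.OOV_RANK:

import numpy

OOV_RANK = numpy.iinfo(numpy.uint64).max  # stand-in for spacy.util.OOV_RANK
ids = numpy.array([3, OOV_RANK, 7], dtype="uint64")
target = numpy.ones((3, 4), dtype="float32")  # pretend per-token target vectors

target[ids == OOV_RANK] = 0  # OOV rows get an all-zero target
print(target)
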
3 changes: 2 additions & 1 deletion spacy/pipeline/entity_linker.py
@@ -481,7 +481,8 @@ def from_disk(

def load_model(p):
try:
self.model.from_bytes(p.open("rb").read())
with p.open("rb") as infile:
self.model.from_bytes(infile.read())
except AttributeError:
raise ValueError(Errors.E149) from None

46 changes: 22 additions & 24 deletions spacy/pipeline/entityruler.py
@@ -3,6 +3,7 @@
from collections import defaultdict
from pathlib import Path
import srsly
import warnings

from .pipe import Pipe
from ..training import Example
@@ -102,17 +103,12 @@ def __init__(
self.overwrite = overwrite_ents
self.token_patterns = defaultdict(list)
self.phrase_patterns = defaultdict(list)
self._validate = validate
self.matcher = Matcher(nlp.vocab, validate=validate)
if phrase_matcher_attr is not None:
if phrase_matcher_attr.upper() == "TEXT":
phrase_matcher_attr = "ORTH"
self.phrase_matcher_attr = phrase_matcher_attr
self.phrase_matcher = PhraseMatcher(
nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
)
else:
self.phrase_matcher_attr = None
self.phrase_matcher = PhraseMatcher(nlp.vocab, validate=validate)
self.phrase_matcher_attr = phrase_matcher_attr
self.phrase_matcher = PhraseMatcher(
nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
)
self.ent_id_sep = ent_id_sep
self._ent_ids = defaultdict(dict)
if patterns is not None:
@@ -146,7 +142,9 @@ def __call__(self, doc: Doc) -> Doc:

def match(self, doc: Doc):
self._require_patterns()
matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message="\\[W036")
matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
matches = set(
[(m_id, start, end) for m_id, start, end in matches if start != end]
)
@@ -281,7 +279,7 @@ def add_patterns(self, patterns: List[PatternType]) -> None:
current_index = i
break
subsequent_pipes = [
pipe for pipe in self.nlp.pipe_names[current_index + 1 :]
pipe for pipe in self.nlp.pipe_names[current_index :]
]
except ValueError:
subsequent_pipes = []
@@ -317,20 +315,22 @@ def add_patterns(self, patterns: List[PatternType]) -> None:
pattern = entry["pattern"]
if isinstance(pattern, Doc):
self.phrase_patterns[label].append(pattern)
self.phrase_matcher.add(label, [pattern])
elif isinstance(pattern, list):
self.token_patterns[label].append(pattern)
self.matcher.add(label, [pattern])
else:
raise ValueError(Errors.E097.format(pattern=pattern))
for label, patterns in self.token_patterns.items():
self.matcher.add(label, patterns)
for label, patterns in self.phrase_patterns.items():
self.phrase_matcher.add(label, patterns)

def clear(self) -> None:
"""Reset all patterns."""
self.token_patterns = defaultdict(list)
self.phrase_patterns = defaultdict(list)
self._ent_ids = defaultdict(dict)
self.matcher = Matcher(self.nlp.vocab, validate=self._validate)
self.phrase_matcher = PhraseMatcher(
self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate
)

def _require_patterns(self) -> None:
"""Raise a warning if this component has no patterns defined."""
@@ -381,10 +381,9 @@ def from_bytes(
self.add_patterns(cfg.get("patterns", cfg))
self.overwrite = cfg.get("overwrite", False)
self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None)
if self.phrase_matcher_attr is not None:
self.phrase_matcher = PhraseMatcher(
self.nlp.vocab, attr=self.phrase_matcher_attr
)
self.phrase_matcher = PhraseMatcher(
self.nlp.vocab, attr=self.phrase_matcher_attr
)
self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
else:
self.add_patterns(cfg)
@@ -435,10 +434,9 @@ def from_disk(
self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)

if self.phrase_matcher_attr is not None:
self.phrase_matcher = PhraseMatcher(
self.nlp.vocab, attr=self.phrase_matcher_attr
)
self.phrase_matcher = PhraseMatcher(
self.nlp.vocab, attr=self.phrase_matcher_attr
)
from_disk(path, deserializers_patterns, {})
return self

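Note (illustrative, not part of the diff): with these changes the EntityRuler always builds its PhraseMatcher from phrase_matcher_attr, and clear() rebuilds both matchers with the stored settings instead of leaving stale ones behind. A hedged usage sketch:

import spacy

nlp = spacy.blank("en")
# phrase_matcher_attr="LOWER" makes phrase patterns case-insensitive
ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"})
ruler.add_patterns([{"label": "ORG", "pattern": "openai"}])
print([(e.text, e.label_) for e in nlp("She joined OpenAI.").ents])

ruler.clear()  # drops the patterns but keeps the configured attr
ruler.add_patterns([{"label": "GPE", "pattern": "berlin"}])
print([(e.text, e.label_) for e in nlp("He moved to Berlin.").ents])
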
2 changes: 2 additions & 0 deletions spacy/pipeline/textcat.py
@@ -332,6 +332,8 @@ def initialize(
else:
for label in labels:
self.add_label(label)
if len(self.labels) < 2:
raise ValueError(Errors.E867)
if positive_label is not None:
if positive_label not in self.labels:
err = Errors.E920.format(pos_label=positive_label, labels=self.labels)
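Note (illustrative, not part of the diff): the new check raises E867 when the mutually exclusive 'textcat' component is initialized with fewer than two labels. A hedged sketch of the two supported setups:

import spacy

# Mutually exclusive classes: at least two labels are required.
nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")

# Binary yes/no with a single label: use the multilabel component instead.
nlp_multi = spacy.blank("en")
textcat_multi = nlp_multi.add_pipe("textcat_multilabel")
textcat_multi.add_label("SPAM")
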
3 changes: 2 additions & 1 deletion spacy/pipeline/trainable_pipe.pyx
@@ -324,7 +324,8 @@ cdef class TrainablePipe(Pipe):
def load_model(p):
try:
self.model.from_bytes(p.open("rb").read())
with open(p, "rb") as mfile:
self.model.from_bytes(mfile.read())
except AttributeError:
raise ValueError(Errors.E149) from None
40 changes: 30 additions & 10 deletions spacy/tests/doc/test_doc_api.py
@@ -351,17 +351,25 @@ def test_doc_from_array_morph(en_vocab):

@pytest.mark.usefixtures("clean_underscore")
def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
en_texts = ["Merging the docs is fun.", "", "They don't think alike."]
en_texts = [
"Merging the docs is fun.",
"",
"They don't think alike. ",
"Another doc.",
]
en_texts_without_empty = [t for t in en_texts if len(t)]
de_text = "Wie war die Frage?"
en_docs = [en_tokenizer(text) for text in en_texts]
en_docs[0].spans["group"] = [en_docs[0][1:4]]
en_docs[2].spans["group"] = [en_docs[2][1:4]]
span_group_texts = sorted([en_docs[0][1:4].text, en_docs[2][1:4].text])
en_docs[3].spans["group"] = [en_docs[3][0:1]]
span_group_texts = sorted(
[en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text]
)
de_doc = de_tokenizer(de_text)
Token.set_extension("is_ambiguous", default=False)
en_docs[0][2]._.is_ambiguous = True # docs
en_docs[2][3]._.is_ambiguous = True # think
en_docs[0][2]._.is_ambiguous = True # docs
en_docs[2][3]._.is_ambiguous = True # think
assert Doc.from_docs([]) is None
assert de_doc is not Doc.from_docs([de_doc])
assert str(de_doc) == str(Doc.from_docs([de_doc]))
@@ -371,8 +379,8 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):

m_doc = Doc.from_docs(en_docs)
assert len(en_texts_without_empty) == len(list(m_doc.sents))
assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
assert str(m_doc) == " ".join(en_texts_without_empty)
assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
p_token = m_doc[len(en_docs[0]) - 1]
assert p_token.text == "." and bool(p_token.whitespace_)
en_docs_tokens = [t for doc in en_docs for t in doc]
Expand All @@ -384,11 +392,12 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
assert not any([t._.is_ambiguous for t in m_doc[3:8]])
assert "group" in m_doc.spans
assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
assert bool(m_doc[11].whitespace_)

m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
assert len(en_texts_without_empty) == len(list(m_doc.sents))
assert len(str(m_doc)) == sum(len(t) for t in en_texts)
assert str(m_doc) == "".join(en_texts)
assert len(m_doc.text) == sum(len(t) for t in en_texts)
assert m_doc.text == "".join(en_texts_without_empty)
p_token = m_doc[len(en_docs[0]) - 1]
assert p_token.text == "." and not bool(p_token.whitespace_)
en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -397,11 +406,12 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
assert m_doc[9].idx == think_idx
assert "group" in m_doc.spans
assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
assert bool(m_doc[11].whitespace_)

m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
# space delimiter considered, although spacy attribute was missing
assert str(m_doc) == " ".join(en_texts_without_empty)
assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
p_token = m_doc[len(en_docs[0]) - 1]
assert p_token.text == "." and bool(p_token.whitespace_)
en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -414,6 +424,16 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
# can merge empty docs
doc = Doc.from_docs([en_tokenizer("")] * 10)

# empty but set spans keys are preserved
en_docs = [en_tokenizer(text) for text in en_texts]
m_doc = Doc.from_docs(en_docs)
assert "group" not in m_doc.spans
for doc in en_docs:
doc.spans["group"] = []
m_doc = Doc.from_docs(en_docs)
assert "group" in m_doc.spans
assert len(m_doc.spans["group"]) == 0


def test_doc_api_from_docs_ents(en_tokenizer):
texts = ["Merging the docs is fun.", "They don't think alike."]
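Note (illustrative, not part of the diff): the new assertions cover Doc.from_docs, including that span group keys which are set but empty on the source docs are carried over to the merged doc. A small sketch mirroring the test:

import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
docs = [nlp("First doc."), nlp("Second doc.")]
for doc in docs:
    doc.spans["group"] = []  # key is set, but empty

merged = Doc.from_docs(docs)
assert "group" in merged.spans and len(merged.spans["group"]) == 0
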
13 changes: 7 additions & 6 deletions spacy/tests/lang/test_initialize.py
@@ -4,12 +4,13 @@

# fmt: off
# Only include languages with no external dependencies
# excluded: ja, ru, th, uk, vi, zh
LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
"et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is",
"it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk",
"sl", "sq", "sr", "sv", "ta", "te", "tl", "tn", "tr", "tt", "ur",
"yo"]
# excluded: ja, ko, th, vi, zh
LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el",
"en", "es", "et", "eu", "fa", "fi", "fr", "ga", "gu", "he", "hi",
"hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv",
"mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
"si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn",
"tr", "tt", "uk", "ur", "xx", "yo"]
# fmt: on


1 change: 1 addition & 0 deletions spacy/tests/matcher/test_matcher_api.py
@@ -481,6 +481,7 @@ def test_matcher_schema_token_attributes(en_vocab, pattern, text):
assert len(matches) == 1


@pytest.mark.filterwarnings("ignore:\\[W036")
def test_matcher_valid_callback(en_vocab):
"""Test that on_match can only be None or callable."""
matcher = Matcher(en_vocab)
1 change: 1 addition & 0 deletions spacy/tests/matcher/test_matcher_logic.py
@@ -180,6 +180,7 @@ def test_matcher_sets_return_correct_tokens(en_vocab):
assert texts == ["zero", "one", "two"]


@pytest.mark.filterwarnings("ignore:\\[W036")
def test_matcher_remove():
nlp = English()
matcher = Matcher(nlp.vocab)
(Remaining changed files not shown.)
