-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Add custom model loading
* Improve edge case condition checks
* Use __all__ for imports
- Loading branch information
1 parent
781dcb5
commit e683079
Showing
3 changed files
with
43 additions
and
20 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,5 @@ | ||
from .language import UDPipeLanguage, UDPipeModel, load | ||
from .language import UDPipeLanguage, UDPipeModel, load, load_from_path | ||
from .util import download | ||
|
||
__all__ = ["UDPipeLanguage", "UDPipeModel", | ||
"load", "load_from_path", "download"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,13 +18,27 @@ def load(lang): | |
mimics spacy.load. | ||
lang (unicode): ISO 639-1 language code or shorthand UDPipe model name. | ||
RETURNS (spacy.language.Language): The UDPipeLanguage object. | ||
RETURNS (spacy.language.Language): The UDPipeLanguage object. | ||
""" | ||
model = UDPipeModel(lang) | ||
nlp = UDPipeLanguage(model) | ||
return nlp | ||
|
||
|
||
def load_from_path(lang, path, meta=None): | ||
"""Convenience function for initializing the Language class and loading | ||
a custom UDPipe model via the path argument. | ||
lang (unicode): ISO 639-1 language code. | ||
path (unicode): Path to the UDPipe model. | ||
meta (dict): Meta-information about the UDPipe model. | ||
RETURNS (spacy.language.Language): The UDPipeLanguage object. | ||
""" | ||
model = UDPipeModel(lang, path, meta) | ||
nlp = UDPipeLanguage(model) | ||
return nlp | ||
|
||
|
||
class UDPipeLanguage(Language): | ||
|
||
def __init__(self, udpipe_model, meta=None, **kwargs): | ||
|
@@ -93,7 +107,7 @@ def __call__(self, text): | |
udpipe_sents = self.model(text) if text else [Sentence()] | ||
text = " ".join(s.getText() for s in udpipe_sents) | ||
tokens, heads = self.get_tokens_with_heads(udpipe_sents) | ||
if not len(tokens): | ||
if not tokens: | ||
return Doc(self.vocab) | ||
|
||
words = [] | ||
|
@@ -186,32 +200,38 @@ def check_aligned(self, text, tokens): | |
|
||
class UDPipeModel: | ||
|
||
def __init__(self, lang): | ||
def __init__(self, lang, path=None, meta=None): | ||
"""Load UDPipe model for given language. | ||
lang (unicode): ISO 639-1 language code or shorthand UDPipe model name. | ||
path (unicode): Path to UDPipe model. | ||
meta (dict): Meta-information about the UDPipe model. | ||
RETURNS (UDPipeModel): Language specific UDPipeModel. | ||
""" | ||
path = get_path(lang) | ||
if path is None: | ||
path = get_path(lang) | ||
self.model = Model.load(path) | ||
if not self.model: | ||
if self.model is None: | ||
msg = "Cannot load UDPipe model from " \ | ||
"file '{}'".format(path) | ||
raise Exception(msg) | ||
self._lang = lang.split('-')[0] | ||
self._meta = {'authors': ("Milan Straka, " | ||
"Jana Straková"), | ||
'description': "UDPipe pretrained model.", | ||
'email': 'straka@ufal.mff.cuni.cz', | ||
'lang': 'udpipe_' + self._lang, | ||
'license': 'CC BY-NC-SA 4.0', | ||
'name': path.split('/')[-1], | ||
'parent_package': 'spacy_udpipe', | ||
'pipeline': 'Tokenizer, POS Tagger, Lemmatizer, Parser', | ||
'source': 'Universal Dependencies 2.4', | ||
'url': 'http://ufal.mff.cuni.cz/udpipe', | ||
'version': '1.2.0' | ||
} | ||
if meta is None: | ||
self._meta = {'authors': ("Milan Straka, " | ||
"Jana Straková"), | ||
'description': "UDPipe pretrained model.", | ||
'email': 'straka@ufal.mff.cuni.cz', | ||
'lang': 'udpipe_' + self._lang, | ||
'license': 'CC BY-NC-SA 4.0', | ||
'name': path.split('/')[-1], | ||
'parent_package': 'spacy_udpipe', | ||
'pipeline': 'Tokenizer, POS Tagger, Lemmatizer, Parser', | ||
'source': 'Universal Dependencies 2.4', | ||
'url': 'http://ufal.mff.cuni.cz/udpipe', | ||
'version': '1.2.0' | ||
} | ||
else: | ||
self._meta = meta | ||
|
||
def __call__(self, text): | ||
"""Tokenize, tag and parse the text and return it in an UDPipe | ||
|