From e6830795587def1e18972ddeb28ce30a556e083f Mon Sep 17 00:00:00 2001 From: asajatovic Date: Thu, 8 Aug 2019 17:03:29 +0200 Subject: [PATCH] Add custom model loading (#2) * Add custom model loading * Improve edge case condition checks * Use __all__ for imports --- setup.py | 2 +- spacy_udpipe/__init__.py | 5 +++- spacy_udpipe/language.py | 56 +++++++++++++++++++++++++++------------- 3 files changed, 43 insertions(+), 20 deletions(-) diff --git a/setup.py b/setup.py index b8f9899..c690e30 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ setuptools.setup( name="spacy-udpipe", - version="0.0.1", + version="0.0.2", description="Use fast UDPipe models directly in spaCy", long_description=long_description, long_description_content_type="text/markdown", diff --git a/spacy_udpipe/__init__.py b/spacy_udpipe/__init__.py index 1f8eff5..d483ca5 100644 --- a/spacy_udpipe/__init__.py +++ b/spacy_udpipe/__init__.py @@ -1,2 +1,5 @@ -from .language import UDPipeLanguage, UDPipeModel, load +from .language import UDPipeLanguage, UDPipeModel, load, load_from_path from .util import download + +__all__ = ["UDPipeLanguage", "UDPipeModel", + "load", "load_from_path", "download"] diff --git a/spacy_udpipe/language.py b/spacy_udpipe/language.py index af7be94..201272c 100644 --- a/spacy_udpipe/language.py +++ b/spacy_udpipe/language.py @@ -18,13 +18,27 @@ def load(lang): mimicks spacy.load. lang (unicode): ISO 639-1 language code or shorthand UDPipe model name. - RETURNS (spacy.language.Language): The UDPipeLanguage object. + RETURNS (spacy.language.Language): The UDPipeLanguage object. """ model = UDPipeModel(lang) nlp = UDPipeLanguage(model) return nlp +def load_from_path(lang, path, meta=None): + """Convenience function for initializing the Language class and loading + a custom UDPipe model via the path argument. + + lang (unicode): ISO 639-1 language code. + path (unicode): Path to the UDPipe model. + meta (dict): Meta-information about the UDPipe model. + RETURNS (spacy.language.Language): The UDPipeLanguage object. + """ + model = UDPipeModel(lang, path, meta) + nlp = UDPipeLanguage(model) + return nlp + + class UDPipeLanguage(Language): def __init__(self, udpipe_model, meta=None, **kwargs): @@ -93,7 +107,7 @@ def __call__(self, text): udpipe_sents = self.model(text) if text else [Sentence()] text = " ".join(s.getText() for s in udpipe_sents) tokens, heads = self.get_tokens_with_heads(udpipe_sents) - if not len(tokens): + if not tokens: return Doc(self.vocab) words = [] @@ -186,32 +200,38 @@ def check_aligned(self, text, tokens): class UDPipeModel: - def __init__(self, lang): + def __init__(self, lang, path=None, meta=None): """Load UDPipe model for given language. lang (unicode): ISO 639-1 language code or shorthand UDPipe model name. + path (unicode): Path to UDPipe model. + meta (dict): Meta-information about the UDPipe model. RETURNS (UDPipeModel): Language specific UDPipeModel. """ - path = get_path(lang) + if path is None: + path = get_path(lang) self.model = Model.load(path) - if not self.model: + if self.model is None: msg = "Cannot load UDPipe model from " \ "file '{}'".format(path) raise Exception(msg) self._lang = lang.split('-')[0] - self._meta = {'authors': ("Milan Straka, " - "Jana Straková"), - 'description': "UDPipe pretrained model.", - 'email': 'straka@ufal.mff.cuni.cz', - 'lang': 'udpipe_' + self._lang, - 'license': 'CC BY-NC-SA 4.0', - 'name': path.split('/')[-1], - 'parent_package': 'spacy_udpipe', - 'pipeline': 'Tokenizer, POS Tagger, Lemmatizer, Parser', - 'source': 'Universal Dependencies 2.4', - 'url': 'http://ufal.mff.cuni.cz/udpipe', - 'version': '1.2.0' - } + if meta is None: + self._meta = {'authors': ("Milan Straka, " + "Jana Straková"), + 'description': "UDPipe pretrained model.", + 'email': 'straka@ufal.mff.cuni.cz', + 'lang': 'udpipe_' + self._lang, + 'license': 'CC BY-NC-SA 4.0', + 'name': path.split('/')[-1], + 'parent_package': 'spacy_udpipe', + 'pipeline': 'Tokenizer, POS Tagger, Lemmatizer, Parser', + 'source': 'Universal Dependencies 2.4', + 'url': 'http://ufal.mff.cuni.cz/udpipe', + 'version': '1.2.0' + } + else: + self._meta = meta def __call__(self, text): """Tokenize, tag and parse the text and return it in an UDPipe