From e12287fce111335b2bbe2f3e7c8429e6b1f62385 Mon Sep 17 00:00:00 2001 From: asajatovic Date: Mon, 14 Oct 2019 10:50:41 +0200 Subject: [PATCH] Whitespaces fix (#5) * Fix DE tokenization * Update version --- setup.py | 2 +- spacy_udpipe/language.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index f74d3de..3115bd5 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ setuptools.setup( name="spacy-udpipe", - version="0.0.3", + version="0.0.4", description="Use fast UDPipe models directly in spaCy", long_description=long_description, long_description_content_type="text/markdown", diff --git a/spacy_udpipe/language.py b/spacy_udpipe/language.py index 201272c..7cc24e0 100644 --- a/spacy_udpipe/language.py +++ b/spacy_udpipe/language.py @@ -131,11 +131,11 @@ def __call__(self, text): pos.append(self.vocab.strings.add(token.upostag or "")) # CoNNL xpostag-s, custom for each UD treebank tags.append(self.vocab.strings.add(token.xpostag or "")) - deps.append(self.vocab.strings.add(self.__dep(token.deprel) or "")) + deps.append(self.vocab.strings.add(self._dep(token.deprel) or "")) lemmas.append(self.vocab.strings.add(token.lemma or "")) offset += len(token.form) span = text[offset:] - if i == len(tokens) - 1: + if i == len(tokens) - 1 or "SpaceAfter=No" in token.misc: spaces.append(False) elif not is_aligned: spaces.append(True) @@ -156,7 +156,7 @@ def __call__(self, text): doc.is_parsed = True return doc - def __dep(self, dep): + def _dep(self, dep): # Ensure labels match with SpaCy return 'ROOT' if dep == 'root' else dep