From d408d738b51aee13edd1b3c5bcd5b7af39f49c94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=BB=D1=8C=D1=8F=20=D0=9B=D0=B5=D0=B1=D0=B5=D0=B4?= =?UTF-8?q?=D0=B5=D0=B2?= Date: Mon, 29 Jul 2019 20:02:18 +0300 Subject: [PATCH] tune regexp to match hyphenated compound words --- rozental_as_a_service/__init__.py | 2 +- rozental_as_a_service/rozental.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/rozental_as_a_service/__init__.py b/rozental_as_a_service/__init__.py index 9123cf0..9d1ffab 100644 --- a/rozental_as_a_service/__init__.py +++ b/rozental_as_a_service/__init__.py @@ -1 +1 @@ -__version__ = '0.0.8' +__version__ = '0.0.9' diff --git a/rozental_as_a_service/rozental.py b/rozental_as_a_service/rozental.py index 1bdbeda..7ac9701 100644 --- a/rozental_as_a_service/rozental.py +++ b/rozental_as_a_service/rozental.py @@ -107,7 +107,7 @@ def extract_words(raw_constants: List[str], min_word_length: int = 3, only_russi processed_words: List[str] = [] for constant in raw_constants: processed_words += list({ - w.strip().lower() for w in re.findall(r'\w+', constant) + w.strip().lower() for w in re.findall(r'[\w-]+', constant) if len(w.strip()) >= min_word_length }) processed_words = list(set(processed_words))