unicode version of word_splitter and stop_word_regex (#34)

* Unicode version of word_splitter and stop_word_regex * Added missing whitespace; fixes flake warning
fabianvf · Dec 4, 2017 · 8aa4a82 · 8aa4a82
1 parent 04fd01a
commit 8aa4a82
Showing 1 changed file with 2 additions and 2 deletions.
diff --git a/RAKE/RAKE.py b/RAKE/RAKE.py
@@ -64,7 +64,7 @@ def separate_words(text):
     @param text The text that must be split in to words.
     @param min_word_return_size The minimum no of characters a word must have to be included.
     """
-    splitter = re.compile('\W+')
+    splitter = re.compile('(?u)\W+')
     words = []
     for single_word in splitter.split(text):
         current_word = single_word.strip().lower()
@@ -89,7 +89,7 @@ def build_stop_word_regex(stop_word_list):
     for word in stop_word_list:
         word_regex = r'\b' + word + r'(?![\w-])'
         stop_word_regex_list.append(word_regex)
-    return re.compile('|'.join(stop_word_regex_list), re.IGNORECASE)
+    return re.compile('(?u)' + '|'.join(stop_word_regex_list), re.IGNORECASE)
 
 
 def generate_candidate_keywords(sentence_list, stop_word_pattern, minCharacters, maxWords):