Skip to content

Commit

Permalink
unicode version of word_splitter and stop_word_regex (#34)
Browse files Browse the repository at this point in the history
* unicode version of word_splitter and stop_word_regex

* added missing whitespace. fixes flake warning
  • Loading branch information
jbernau authored and fabianvf committed Dec 4, 2017
1 parent 04fd01a commit 8aa4a82
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions RAKE/RAKE.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def separate_words(text):
@param text The text that must be split into words.
@param min_word_return_size The minimum number of characters a word must have to be included.
"""
- splitter = re.compile('\W+')
+ splitter = re.compile('(?u)\W+')
words = []
for single_word in splitter.split(text):
current_word = single_word.strip().lower()
Expand All @@ -89,7 +89,7 @@ def build_stop_word_regex(stop_word_list):
for word in stop_word_list:
word_regex = r'\b' + word + r'(?![\w-])'
stop_word_regex_list.append(word_regex)
- return re.compile('|'.join(stop_word_regex_list), re.IGNORECASE)
+ return re.compile('(?u)' + '|'.join(stop_word_regex_list), re.IGNORECASE)


def generate_candidate_keywords(sentence_list, stop_word_pattern, minCharacters, maxWords):
Expand Down

0 comments on commit 8aa4a82

Please sign in to comment.