moving word normalization to this repo

seperman · Jan 30, 2019 · 7f1cedf · 7f1cedf
1 parent 547b6c3
commit 7f1cedf
Show file tree

Hide file tree

Showing 3 changed files with 43 additions and 4 deletions.
diff --git a/fast_autocomplete/__init__.py b/fast_autocomplete/__init__.py
@@ -1,5 +1,5 @@
 # flake8: noqa
-__version__ = '0.2.3'
+__version__ = '0.2.4'
 import sys
 pyversion = float(sys.version[:3])
 if pyversion < 3.6:
@@ -8,3 +8,4 @@
 from fast_autocomplete.dwg import AutoComplete
 from fast_autocomplete.draw import DrawGraphMixin
 from fast_autocomplete.demo import demo
+from fast_autocomplete.normalize import normalize_node_name
diff --git a/fast_autocomplete/dwg.py b/fast_autocomplete/dwg.py
@@ -4,6 +4,7 @@
 from collections import defaultdict
 from fast_autocomplete.lfucache import LFUCache
 from fast_autocomplete.misc import _extend_and_repeat
+from fast_autocomplete.normalize import normalize_node_name
 from Levenshtein import distance as levenshtein_distance
 
 DELIMITER = '__'
@@ -174,7 +175,7 @@ def search(self, word, max_cost=2, size=5):
         - max_cost: Maximum Levenshtein edit distance to be considered when calculating results
         - size: The max number of results to return
         """
-        word = word.lower().strip()
+        word = normalize_node_name(word)
         if not word:
             return []
         key = f'{word}-{max_cost}-{size}'
@@ -203,10 +204,11 @@ def _find(self, word, max_cost, size, call_count=0):
         fuzzy_matches_len = 0
 
         fuzzy_min_distance = min_distance = INF
-        # if word == 'mercedes s':
-        #     import pytest; pytest.set_trace()
         matched_prefix_of_last_word, rest_of_word, new_node, matched_words = self._prefix_autofill(word=word)
 
+        if matched_words and matched_words[-1] == 'bmw' and not rest_of_word:
+            print('!!!!!!')
+
         last_word = matched_prefix_of_last_word + rest_of_word
 
         if matched_words:

diff --git a/fast_autocomplete/normalize.py b/fast_autocomplete/normalize.py
@@ -0,0 +1,36 @@
+import string
+from fast_autocomplete.lfucache import LFUCache
+
+# Character classes used by the normalizer below: lowercase ASCII letters,
+# decimal digits, and the only two separators that are kept (space, hyphen).
+valid_chars_for_string = set(string.ascii_lowercase)
+valid_chars_for_integer = set(string.digits)
+valid_chars_for_node_name = valid_chars_for_string | valid_chars_for_integer | {' ', '-'}
+
+# Input longer than MAX_WORD_LENGTH is truncated before it is normalized.
+MAX_WORD_LENGTH = 40
+NORMALIZED_CACHE_SIZE = 2048
+
+# Module-level cache mapping truncated raw names to their normalized form.
+_normalized_lfu_cache = LFUCache(NORMALIZED_CACHE_SIZE)
+
+
+def normalize_node_name(name):
+    """
+    Return the normalized form of `name`, using the module-level cache.
+
+    Only the first MAX_WORD_LENGTH characters of `name` are considered.
+    The truncated text is the cache key; a value of -1 coming back from
+    the cache is treated as "not cached yet", in which case the name is
+    normalized and the result stored before it is returned.
+    """
+    truncated = name[:MAX_WORD_LENGTH]
+    cached = _normalized_lfu_cache.get(truncated)
+    if cached != -1:
+        return cached
+    normalized = _get_normalized_node_name(truncated)
+    _normalized_lfu_cache.set(truncated, normalized)
+    return normalized
+
+
+def _get_normalized_node_name(name):
+    """
+    Normalize `name` without consulting the cache.
+
+    - The text is lowercased.
+    - Characters outside valid_chars_for_node_name are dropped.
+    - A hyphen is turned into a space.
+    - A space is inserted where a letter is directly followed by a digit
+      (or a digit by a letter), comparing against the last kept character.
+    - Consecutive spaces collapse into one; leading and trailing
+      whitespace is stripped.
+
+    For example 'BMW-320i' becomes 'bmw 320 i'.
+    """
+    pieces = []
+    previous = None  # last character that was actually kept
+    for char in name.lower():
+        if char not in valid_chars_for_node_name:
+            continue
+        if char == '-':
+            char = ' '
+        else:
+            digit_after_letter = char in valid_chars_for_integer and previous in valid_chars_for_string
+            letter_after_digit = char in valid_chars_for_string and previous in valid_chars_for_integer
+            if digit_after_letter or letter_after_digit:
+                # Split tokens such as '320i' into '320 i'.
+                pieces.append(' ')
+        if char == ' ' and previous == ' ':
+            # Never emit two spaces in a row.
+            continue
+        pieces.append(char)
+        previous = char
+    return ''.join(pieces).strip()