diff --git a/README.md b/README.md index 56010e6..c9d1d89 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -# Fast Autocomplete 0.1.4 +# Fast Autocomplete 0.1.5 -Fast autocomplete using Directed Acyclic Word Graph (DAWG) and Levenshtein Edit Distance. +Fast autocomplete using Directed Word Graph (DWG) and Levenshtein Edit Distance. The results are cached via LFU (Least Frequently Used). @@ -21,7 +21,7 @@ You might say: In a nutshell, what the fast Autocomplete does is: -1. Populate the DAWG with your words. +1. Populate the DWG with your words. 2. Follow the graph nodes letter by letter until it finds nodes that have words in them. 3. Continue after words are found on the graph until it reaches the leaf node. 4. Restart from the root node again until it reaches a letter that doesn't exist on the graph. @@ -46,15 +46,15 @@ Are you still on Python 2? TIME TO UPGRADE. MIT -# DAWG +# DWG The data structure we use in this library is called Dawg. -DAWG stands for Directed Acyclic Word Graph. Here is an example DAWG based on the "makes_models_short.csv" that is provided in the tests: +DWG stands for Directed Word Graph. Here is an example DWG based on the "makes_models_short.csv" that is provided in the tests: -![dawg](tests/animation/short.gif) +![dwg](tests/animation/short.gif) -![dawg](tests/AutoCompleteWithSynonymsShort_Graph.svg) +![dwg](tests/AutoCompleteWithSynonymsShort_Graph.svg) # Usage @@ -152,7 +152,7 @@ from fast_autocomplete import AutoComplete autocomplete = AutoComplete(words=words, synonyms=synonyms) ``` -At this point, AutoComplete has created a [dawg](#DAWG) structure. +At this point, AutoComplete has created a [dwg](#DWG) structure. Now you can search! @@ -228,11 +228,11 @@ converted to contexts: ## Draw -This package can actually draw the dawgs as it is populating them or just once the dawg is populated for you! 
-Here is the animation of populating the dawg with words from "makes_models_short.csv": +This package can actually draw the dwgs as it is populating them or just once the dwg is populated for you! +Here is the animation of populating the dwg with words from "makes_models_short.csv": -### Draw animation of dawg populating +### Draw animation of dwg populating ```py from fast_autocomplete import AutoComplete, DrawGraphMixin @@ -247,14 +247,14 @@ class AutoCompleteDraw(DrawGraphMixin, AutoComplete): autocomplete = AutoCompleteDraw(words=words, synonyms=synonyms) ``` -As soon as you initialize the above AutoCompleteDraw class, it will populate the dawg and generate the animation! -For an example of this code properly setup, take a look at the tests. In fact the animation in the [dawg](#dawg) section is generated the same way via unit tests! +As soon as you initialize the above AutoCompleteDraw class, it will populate the dwg and generate the animation! +For an example of this code properly setup, take a look at the tests. In fact the animation in the [dwg](#dwg) section is generated the same way via unit tests! -Note that if you have many words, the graph file will be big. Instead of drawing all frames as the dawg is being populated, you can just draw the final stage: +Note that if you have many words, the graph file will be big. Instead of drawing all frames as the dwg is being populated, you can just draw the final stage: ### Draw the final graph -To draw just one graph that shows the final stage of the dawg, use the draw mixin and run the draw_graph function: +To draw just one graph that shows the final stage of the dwg, use the draw mixin and run the draw_graph function: ```py from fast_autocomplete import AutoComplete, DrawGraphMixin @@ -289,7 +289,7 @@ demo(autocomplete, max_cost=3, size=5) `pytest` -We try to maintain high standard in code coverage. Currently the `dawg` module's coverage is around 99%! +We try to maintain high standard in code coverage. 
Currently the `dwg` module's coverage is around 99%! # Authors @@ -306,8 +306,8 @@ We try to maintain high standard in code coverage. Currently the `dawg` module's # FAQ -## Why DAWG -DAWG stands for Directed Acyclic Word Graph. Originally we were using Trie-Tree structure. But soon it was obvious that some branches needed to merge back to other branches. Such as `beemer` and `bmw` branches both need to end in the same node since they are synonyms. Thus we used DAWG. +## Why DWG +DWG stands for Directed Word Graph. Originally we were using Trie-Tree structure. But soon it was obvious that some branches needed to merge back to other branches. Such as `beemer` and `bmw` branches both need to end in the same node since they are synonyms. Thus we used DWG. ## What are synonyms, clean synonyms and partial synonyms Synonyms are words that should produce the same results. @@ -324,10 +324,10 @@ Internally these 2 types of synonyms are treated differently but as a user of th ## Why do you have a whole subtree for partial synonyms Q: Partial synonym means the synonym is a part of the original word. Such as `alfa` is a partial synonym for `alfa romeo`. -In that case you are inserting both `alfa` and `alfa romeo` in the dawg. `alfa` will have `alfa 4c` and `alpha romeo` will have `alfa romeo 4c` branches. Why not just have `alfa` branches to be `alfa romeo` and from there you will have automatically all the sub branches of `alfa romeo`. +In that case you are inserting both `alfa` and `alfa romeo` in the dwg. `alfa` will have `alfa 4c` and `alfa romeo` will have `alfa romeo 4c` branches. Why not just have `alfa` branches to be `alfa romeo` and from there you will have automatically all the sub branches of `alfa romeo`. -Answer: We use letters for edges. So `alfa` can have only one edge coming out of it that is space (` `). And that edge is going to a node that has sub-branches to `alfa romoe`, `alfa 4c` etc. 
It can't have a ` ` going to that node and another ` ` going to `alfa romeo`'s immediate child. That way when we are traversing the dawg for the input of `alfa 4` we get to the correct node. +Answer: We use letters for edges. So `alfa` can have only one edge coming out of it that is space (` `). And that edge is going to a node that has sub-branches to `alfa romoe`, `alfa 4c` etc. It can't have a ` ` going to that node and another ` ` going to `alfa romeo`'s immediate child. That way when we are traversing the dwg for the input of `alfa 4` we get to the correct node. ## I put Toyota in the Dawg but when I type `toy`, it doesn't show up. -Answer: If you put `Toyota` with capital T in the dawg, it expects the search word to start with capital T too. We suggest that you lower case everything before putting them in dawg. Fast-autocomplete does not automatically do that for you since it assumes the `words` dictionary is what you want to be put in the dawg. It is up to you to clean your own data before putting it in the dawg. +Answer: If you put `Toyota` with capital T in the dwg, it expects the search word to start with capital T too. We suggest that you lower case everything before putting them in dwg. Fast-autocomplete does not automatically do that for you since it assumes the `words` dictionary is what you want to be put in the dwg. It is up to you to clean your own data before putting it in the dwg. diff --git a/README.rst b/README.rst deleted file mode 100644 index 3350a3e..0000000 --- a/README.rst +++ /dev/null @@ -1,436 +0,0 @@ -Fast Autocomplete 0.1.2 -======================= - -Fast autocomplete using Directed Acyclic Word Graph (DAWG) and -Levenshtein Edit Distance. - -The results are cached via LFU (Least Frequently Used). - -Why -=== - -This library was written when we came to the conclusion that -Elasticsearch’s Autocomplete suggestor is not fast enough and doesn’t do -everything that we need: - -1. 
Once we switched to Fast Autocomplete, our average latency went from - 120ms to 30ms so an improvement of 3-4x in performance and errors - went down to zero. -2. Elasticsearch’s Autocomplete suggestor does not handle any sort of - combination of the words you have put in. For example Fast - Autocomplete can handle ``2018 Toyota Camry in Los Angeles`` when the - words ``2018``, ``Toyota Camry``, ``Los Angeles`` are seperately fed - into it. While Elasticsearch’s autocomplete needs that whole sentence - to be fed to it to show it in Autocomplete results. - -You might say: - -1. Regarding #1: Yes, but you are using caching. Answer: shhh Yes, keep - it quiet. We are also doing Levenshtein Edit distance using a C - library so it improves there too. -2. Regarding #2: I’m speechless. Answer: Ok, now we are talking. - -How -=== - -In a nutshell, what the fast Autocomplete does is: - -1. Populate the DAWG with your words. -2. Follow the graph nodes letter by letter until it finds nodes that - have words in them. -3. Continue after words are found on the graph until it reaches the leaf - node. -4. Restart from the root node again until it reaches a letter that - doesn’t exist on the graph. -5. Depending on how much is left from the rest of the word, return all - the descendant words from where it got stuck -6. Or run Levenshtein edit distance to find closes words to what is left - and the continue from there. - -By doing so, it can tokenize a text such as: - -``2018 Toyota Camry in Los Angeles`` into [``2018``, ``toyota camry``, -``in``, ``los angeles``] - -And return Autocomplete results as you type. - -Install -======= - -``pip install fast-autocomplete`` - -**Note: Fast Autocomplete only works with Python 3.6 and newer.** - -Are you still on Python 2? TIME TO UPGRADE. - -Licence -======= - -MIT - -DAWG -==== - -The data structure we use in this library is called Dawg. - -DAWG stands for Directed Acyclic Word Graph. 
Here is an example DAWG -based on the “makes_models_short.csv” that is provided in the tests: - -.. figure:: tests/animation/short.gif - :alt: dawg - - dawg - -.. figure:: tests/AutoCompleteWithSynonymsShort_Graph.svg - :alt: dawg - - dawg - -Usage -===== - -First of all lets start from your data. The library leaves it up to you -how to prepare your data. Imagine that we have a csv with the following -content from vehicles’ make and models: - -.. code:: csv - - make,model - acura,zdx - alfa romeo,4c - alfa romeo,4c coupe - alfa romeo,giulia - bmw,1 series - bmw,2 series - 2007,2007 - 2017,2017 - 2018,2018 - -What we want to do is to convert this to a dictionary of words and their -context. - -.. code:: py - - import csv - from fast_autocomplete.misc import read_csv_gen - - - def get_words(path): - - csv_gen = read_csv_gen(path, csv_func=csv.DictReader) - - words = {} - - for line in csv_gen: - make = line['make'] - model = line['model'] - if make != model: - local_words = [model, '{} {}'.format(make, model)] - while local_words: - word = local_words.pop() - if word not in words: - words[word] = {} - if make not in words: - words[make] = {} - return words - -the ``read_csv_gen`` is just a helper function. You don’t really need -it. The whole point is that we are converting that csv to a dictionary -that looks like this: - -.. code:: py - - >>> words = get_words('path to the csv') - >>> words - {'acura zdx': {}, - 'zdx': {}, - 'acura': {}, - 'alfa romeo 4c': {}, - '4c': {}, - 'alfa romeo': {}, - 'alfa romeo 4c coupe': {}, - '4c coupe': {}, - 'alfa romeo giulia': {}, - 'giulia': {}, - 'bmw 1 series': {}, - '1 series': {}, - 'bmw': {}, - 'bmw 2 series': {}, - '2 series': {}, - '2007': {}, - '2017': {}, - '2018': {}} - -This is a dictionary of words to their context. We have decided that we -don’t want any context for the words in this example so all the contexts -are empty. However generally you will want some context around the words -for more complicated logics. 
The context is used to convert the words -“keys” into their context which is the value of the key in the words -dictionary. - -In addition to words, we usually want a dictionary of synonyms. -Something like this: - -.. code:: py - - synonyms = { - "alfa romeo": ["alfa"], - "bmw": ["beemer", "bimmer"], - "mercedes-benz": ["mercedes", "benz"], - "volkswagen": ["vw"] - } - -Note that synonyms are optional. Maybe in your use case you don’t need -synonyms. - -Now we can use the above to initialize Autocomplete - -.. code:: py - - - from fast_autocomplete import AutoComplete - - autocomplete = AutoComplete(words=words, synonyms=synonyms) - -At this point, AutoComplete has created a `dawg <#DAWG>`__ structure. - -Now you can search! - -- word: the word to return autocomplete results for -- max_cost: Maximum Levenshtein edit distance to be considered when - calculating results -- size: The max number of results to return - -.. code:: py - - >>> autocomplete.search(word='2018 bmw 1', max_cost=3, size=3) - [['2018', 'bmw'], ['2018', 'bmw 1 series']] - -Now what if we pressed a by mistake then? It still works. No problem. - -.. code:: py - - >>> autocomplete.search(word='2018 bmw 1a', max_cost=3, size=3) - [['2018', 'bmw'], ['2018', 'bmw 1 series']] - -Ok let’s search for Alfa now: - -.. code:: py - - >>> autocomplete.search(word='alfa', max_cost=3, size=3) - [['alfa romeo'], ['alfa romeo 4c'], ['alfa romeo giulia']] - -What if we don’t know how to pronounce alfa and we type ``alpha`` ? - -.. code:: py - - >>> autocomplete.search(word='alpha', max_cost=3, size=3) - [['alfa romeo'], ['alfa romeo 4c'], ['alfa romeo giulia']] - -It still works! - -Fast-Autocomplete makes sure the results make sense! - -Ok lets add the word ``Los Angeles`` there to the words: - -.. 
code:: py - - >>> words['los angeles'] = {} - >>> words['in'] = {} - >>> autocomplete.search(word='2007 alfa in los', max_cost=3, size=3) - [['2007', 'alfa romeo', 'in'], ['2007', 'alfa romeo', 'in', 'los angeles']] - -So far we have not used the context. And this library leaves it up to -you how to use the context. But basically if we giving a context to each -one of those words, then the above response could easly be translated to -the list of those contexts. - -context -------- - -If our words dictionary was: - -.. code:: py - - words = { - 'in': {}, - 'alfa romeo': {'type': 'make'}, - '2007': {'type': 'year'}, - 'los angeles': {'type': 'location'}, - } - -Then the ``autocomplete.words`` can be used to map the results into -their context: - -:: - - [['2007', 'alfa romeo', 'in'], ['2007', 'alfa romeo', 'in', 'los angeles']] - - converted to contexts: - - [[{'year': '2007'}, {'make': alfa romeo'}], [{'year': '2007'}, {'make': alfa romeo'}, {'location': 'los angeles'}]] - -Draw ----- - -This package can actually draw the dawgs as it is populating them or -just once the dawg is populated for you! Here is the animation of -populating the dawg with words from “makes_models_short.csv”: - -Draw animation of dawg populating -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code:: py - - from fast_autocomplete import AutoComplete, DrawGraphMixin - - - class AutoCompleteDraw(DrawGraphMixin, AutoComplete): - DRAW_POPULATION_ANIMATION = True - DRAW_POPULATION_ANIMATION_PATH = 'animation/short_.svg' - DRAW_POPULATION_ANIMATION_FILENO_PADDING = 6 - - - autocomplete = AutoCompleteDraw(words=words, synonyms=synonyms) - -As soon as you initialize the above AutoCompleteDraw class, it will -populate the dawg and generate the animation! For an example of this -code properly setup, take a look at the tests. In fact the animation in -the `dawg <#dawg>`__ section is generated the same way via unit tests! - -Note that if you have many words, the graph file will be big. 
Instead of -drawing all frames as the dawg is being populated, you can just draw the -final stage: - -Draw the final graph -~~~~~~~~~~~~~~~~~~~~ - -To draw just one graph that shows the final stage of the dawg, use the -draw mixin and run the draw_graph function: - -.. code:: py - - from fast_autocomplete import AutoComplete, DrawGraphMixin - - - class AutoCompleteDraw(DrawGraphMixin, AutoComplete): - pass - - autocomplete = AutoCompleteDraw(words=words, synonyms=synonyms) - autocomplete.draw_graph('path to file') - -Demo ----- - -If you want to have a real-time interaction with Autocomplete results in -your terminal, you can use the demo module: - -Just pass it an instance of the autocomplete and the search configs: - -.. code:: py - - from fast_autocomplete import demo - - demo(autocomplete, max_cost=3, size=5) - -Develop -======= - -1. Clone the repo -2. Make a virtualenv with Python 3.6 or newer -3. ``pip install -r requirements-dev.txt`` - -Run tests ---------- - -``pytest`` - -We try to maintain high standard in code coverage. Currently the -``dawg`` module’s coverage is around 99%! - -Authors -======= - -- Autocomplete by `Sep Dehpour `__ at `Fair - Financial Corp `__. -- LFU Cache by `Shane Wang `__ - -Other ways of doing AutoComplete -================================ - -1. Elastic search. Yes, Elasticsearch generally is a *better* - Autocomplete solution than this library. I said generally. In our - specific use case, we wanted Autocomplete to be faster than - Elasticsearch and handle combination of words. Otherwise - Elasticsearch would have been perfect. Behind the scene Elasticsearch - uses Finite State Transducer (FST) in Lucene to achive AutoComplete. - FST is more complicated than what we have used in fast-autocomplete. - -2. If your autocomplete is supposed to return results based on a big - blog of text (for example based on some book contents), then a better - solution is to go with Markov chains and conditional probability. 
- Yes, there is already a library out there for it! - https://github.com/rodricios/autocomplete and it looks great. - Disclaimer: we have not actually used it since it doesn’t fit our - specific use-case. - -FAQ -=== - -Why DAWG --------- - -DAWG stands for Directed Acyclic Word Graph. Originally we were using -Trie-Tree structure. But soon it was obvious that some branches needed -to merge back to other branches. Such as ``beemer`` and ``bmw`` branches -both need to end in the same node since they are synonyms. Thus we used -DAWG. - -What are synonyms, clean synonyms and partial synonyms ------------------------------------------------------- - -Synonyms are words that should produce the same results. - -- For example ``beemer`` and ``bmw`` should both give you ``bmw``. -- ``alfa`` and ``alfa romeo`` should both give you ``alfa romeo`` - -The synonyms get divided into 2 groups: - -1. clean synonyms: The 2 words share little or no words. For example - ``beemer`` vs. ``bmw``. -2. partial synonyms: One of the 2 words is a substring of the other one. - For example ``alfa`` and ``alfa romeo`` or ``gm`` vs. ``gmc``. - -Internally these 2 types of synonyms are treated differently but as a -user of the library, you don’t need to really care about it. You just -provide the synonyms dictionary via defining the ``get_synonyms`` -method. - -Why do you have a whole subtree for partial synonyms ----------------------------------------------------- - -Q: Partial synonym means the synonym is a part of the original word. -Such as ``alfa`` is a partial synonym for ``alfa romeo``. In that case -you are inserting both ``alfa`` and ``alfa romeo`` in the dawg. ``alfa`` -will have ``alfa 4c`` and ``alpha romeo`` will have ``alfa romeo 4c`` -branches. Why not just have ``alfa`` branches to be ``alfa romeo`` and -from there you will have automatically all the sub branches of -``alfa romeo``. - -Answer: We use letters for edges. 
So ``alfa`` can have only one edge -coming out of it that is space (\` -``). And that edge is going to a node that has sub-branches to``\ alfa -romoe\ ``,``\ alfa 4c\ ``etc. It can't have a`` -``going to that node and another`` ``going to``\ alfa -romeo\ ``'s immediate child. That way when we are traversing the dawg for the input of``\ alfa -4\` we get to the correct node. - -I put Toyota in the Dawg but when I type ``toy``, it doesn’t show up. ---------------------------------------------------------------------- - -Answer: If you put ``Toyota`` with capital T in the dawg, it expects the -search word to start with capital T too. We suggest that you lower case -everything before putting them in dawg. Fast-autocomplete does not -automatically do that for you since it assumes the ``words`` dictionary -is what you want to be put in the dawg. It is up to you to clean your -own data before putting it in the dawg. diff --git a/fast_autocomplete/__init__.py b/fast_autocomplete/__init__.py index a8e2330..f0a4db4 100644 --- a/fast_autocomplete/__init__.py +++ b/fast_autocomplete/__init__.py @@ -1,10 +1,10 @@ # flake8: noqa -__version__ = '0.1.4' +__version__ = '0.1.5' import sys pyversion = float(sys.version[:3]) if pyversion < 3.6: sys.exit('fast-autocomplete requires Python 3.6 or later.') -from fast_autocomplete.dawg import AutoComplete +from fast_autocomplete.dwg import AutoComplete from fast_autocomplete.draw import DrawGraphMixin from fast_autocomplete.demo import demo diff --git a/fast_autocomplete/draw.py b/fast_autocomplete/draw.py index 4365d3a..ba0f1da 100644 --- a/fast_autocomplete/draw.py +++ b/fast_autocomplete/draw.py @@ -26,7 +26,7 @@ def draw_graph(self, file_path): edges = set() que = collections.deque() - que.append(('root', self._dawg, '')) + que.append(('root', self._dwg, '')) node_alternative_names = {} while que: parent_name, node, edge_name = que.popleft() diff --git a/fast_autocomplete/dawg.py b/fast_autocomplete/dwg.py similarity index 94% rename 
from fast_autocomplete/dawg.py rename to fast_autocomplete/dwg.py index f14b0ed..7a44086 100644 --- a/fast_autocomplete/dawg.py +++ b/fast_autocomplete/dwg.py @@ -33,7 +33,7 @@ def __init__(self, words, synonyms=None): The synonym words should only be here and not repeated in words parameter. """ self._lock = Lock() - self._dawg = None + self._dwg = None self._raw_synonyms = synonyms or {} self._lfu_cache = LFUCache(self.CACHE_SIZE) self._clean_synonyms, self._partial_synonyms = self._get_clean_and_partial_synonyms() @@ -41,7 +41,7 @@ def __init__(self, words, synonyms=None): self.words = words new_words = self._get_partial_synonyms_to_words() self.words.update(new_words) - self._populate_dawg() + self._populate_dwg() def _get_clean_and_partial_synonyms(self): """ @@ -104,11 +104,11 @@ def _get_partial_synonyms_to_words(self): new_words[new_key] = value return new_words - def _populate_dawg(self): - if not self._dawg: + def _populate_dwg(self): + if not self._dwg: with self._lock: - if not self._dawg: - self._dawg = _DawgNode() + if not self._dwg: + self._dwg = _DawgNode() for word, value in self.words.items(): original_key = value.get(ORIGINAL_KEY) word = word.strip().lower() @@ -129,8 +129,8 @@ def insert_word_branch(self, word, leaf_node=None, add_word=True, original_key=N """ Inserts a word into the Dawg. - :param word: The word to be inserted as a branch of dawg - :param leaf_node: (optional) The leaf node for the node to merge into in the dawg. + :param word: The word to be inserted as a branch of dwg + :param leaf_node: (optional) The leaf node for the node to merge into in the dwg. :param add_word: (Boolean, default: True) Add the word itself at the end of the branch. Usually this is set to False if we are merging into a leaf node and do not want to add the actual word there. 
@@ -140,10 +140,10 @@ def insert_word_branch(self, word, leaf_node=None, add_word=True, original_key=N """ if leaf_node: - temp_leaf_node = self._dawg.insert(word[:-1], add_word=add_word, original_key=original_key) + temp_leaf_node = self._dwg.insert(word[:-1], add_word=add_word, original_key=original_key) temp_leaf_node.children[word[-1]] = leaf_node else: - leaf_node = self._dawg.insert(word, original_key=original_key) + leaf_node = self._dwg.insert(word, original_key=original_key) self.insert_word_callback(word) return leaf_node @@ -281,7 +281,7 @@ def _add_words(words): word = matched_prefix_of_last_word + rest_of_word word = word.strip() len_prev_rest_of_last_word = len_rest_of_last_word - matched_prefix_of_last_word, rest_of_word, node, matched_words_part = self._prefix_autofill_part(word, node=self._dawg) + matched_prefix_of_last_word, rest_of_word, node, matched_words_part = self._prefix_autofill_part(word, node=self._dwg) is_added = _add_words(matched_words_part) if is_added is False: break @@ -291,7 +291,7 @@ def _add_words(words): return result def _prefix_autofill_part(self, word, node=None): - node = node or self._dawg + node = node or self._dwg que = collections.deque(word) matched_prefix_of_last_word = '' @@ -316,7 +316,7 @@ def _prefix_autofill_part(self, word, node=None): matched_words.append(node.value) else: if char == ' ': - node = self._dawg + node = self._dwg else: que.appendleft(char) break @@ -377,10 +377,13 @@ def __repr__(self): def get_descendants_nodes(self, size): que = collections.deque() + unique_nodes = {self} found_words_set = set() for letter, child_node in self.children.items(): - que.append((letter, child_node)) + if child_node not in unique_nodes: + unique_nodes.add(child_node) + que.append((letter, child_node)) while que: letter, child_node = que.popleft() @@ -393,7 +396,9 @@ def get_descendants_nodes(self, size): break for letter, grand_child_node in child_node.children.items(): - que.append((letter, grand_child_node)) + if 
grand_child_node not in unique_nodes: + unique_nodes.add(grand_child_node) + que.append((letter, grand_child_node)) def get_descendants_words(self, size): found_words_gen = self.get_descendants_nodes(size) diff --git a/get_readme_rst.sh b/get_readme_rst.sh deleted file mode 100755 index 401be51..0000000 --- a/get_readme_rst.sh +++ /dev/null @@ -1 +0,0 @@ -pandoc --from=markdown --to=rst --output=README.rst README.md diff --git a/setup.py b/setup.py index 4df942b..b64e2ec 100644 --- a/setup.py +++ b/setup.py @@ -22,14 +22,16 @@ def get_reqs(filename): reqs = get_reqs("requirements.txt") try: - with open('README.rst') as file: + with open('README.md') as file: long_description = file.read() except Exception: long_description = "Autocomplete" setup( name='fast-autocomplete', - description=long_description, + description='Fast Autocomplete using Directed Word Graph', + long_description=long_description, + long_description_content_type='text/markdown', author='Sep Dehpour', url='https://github.com/wearefair/fast-autocomplete', download_url='https://github.com/wearefair/fast-autocomplete/tarball/master', diff --git a/tests/AutoCompleteWithSynonymsShort_Graph.svg b/tests/AutoCompleteWithSynonymsShort_Graph.svg index 6e737fc..ad01d46 100644 --- a/tests/AutoCompleteWithSynonymsShort_Graph.svg +++ b/tests/AutoCompleteWithSynonymsShort_Graph.svg @@ -4,1317 +4,1378 @@ - + - + .0 - + .1 - + .0->.1 - - -a + + +a .2 - + .0->.2 - - -z + + +z .3 - + .0->.3 - - -4 + + +4 .4 - + .0->.4 - - -g + + +g .5 - + .0->.5 - - -b + + +b .6 - + .0->.6 - - -1 + + +1 .7 - + .0->.7 - - -2 + + +2 - + .8 - + - - -.1->.8 - - -c + + +.0->.8 + + +t .9 - + .1->.9 - - -l + + +c .10 - + - + -.2->.10 - - -d +.1->.10 + + +l - + -4c - -4c +.11 + - + -.3->4c - - -c +.2->.11 + + +d - + -.12 - +4c + +4c - + -.4->.12 - - -i +.3->4c + + +c .13 - + - + -.5->.13 - - -m +.4->.13 + + +i .14 - + - + -.6->.14 - - -' ' +.5->.14 + + +m .15 - + - + -.7->.15 - - -' ' +.6->.15 + + +' ' .16 - + .7->.16 - - -0 - - - -root - 
-root - - - -root->.0 - - + + +' ' .17 - + - + -.8->.17 - - -u +.7->.17 + + +0 .18 - + - + -.9->.18 - - -f +.8->.18 + + +r - + + +root + +root + + + +root->.0 + + + + -zdx - -zdx +.19 + - + -.10->zdx - - -x +.9->.19 + + +u .20 - + - + -4c->.20 - - -' ' +.10->.20 + + +f - + -.21 - +zdx + +zdx - + -.12->.21 - - -u +.11->zdx + + +x - + -bmw - -bmw +.22 + - + -.13->bmw - - -w +4c->.22 + + +' ' .23 - + - + -.14->.23 - - -s +.13->.23 + + +u - + -.24 - +bmw + +bmw - + -.15->.24 - - -s +.14->bmw + + +w .25 - + - + -.16->.25 - - -0 +.15->.25 + + +s .26 - + .16->.26 - - -1 + + +s .27 - + .17->.27 - - -r + + +0 - + -alfa - -alfa +.28 + - + -.18->alfa - - -a +.17->.28 + + +1 .29 - + - + -.20->.29 - - -c +.18->.29 + + +u .30 - + - + -.21->.30 - - -l +.19->.30 + + +r - + -.31 - +alfa + +alfa - + -bmw->.31 - - -' ' +.20->alfa + + +a .32 - + - + -.23->.32 - - -e +.22->.32 + + +c .33 - + - + -.24->.33 - - -e +.23->.33 + + +l - + -2007 - -2007 +.34 + - + -.25->2007 - - -7 +bmw->.34 + + +' ' - + -2017 - -2017 +.35 + - + -.26->2017 - - -7 +.25->.35 + + +e - + -2018 - -2018 +.36 + - + -.26->2018 - - -8 +.26->.36 + + +e - + -acura - -acura +2007 + +2007 - + -.27->acura - - -a +.27->2007 + + +7 - + -.38 - +2017 + +2017 - + -alfa->.38 - - -' ' +.28->2017 + + +7 - + -.39 - +2018 + +2018 - + -.29->.39 - - -o +.28->2018 + + +8 .40 - + - + -.30->.40 - - -i +.29->.40 + + +c - + -.41 - +acura + +acura - + -.31->.41 - - -1 +.30->acura + + +a .42 - + - + -.31->.42 - - -2 +alfa->.42 + + +' ' .43 - + .32->.43 - - -r + + +o .44 - + .33->.44 - - -r + + +i .45 - + - + -acura->.45 - - -' ' +.34->.45 + + +1 .46 - + - + -.38->.46 - - -r +.34->.46 + + +2 .47 - + - + -.38->.47 - - -4 +.35->.47 + + +r .48 - + - + -.38->.48 - - -g +.36->.48 + + +r - + -.49 - +truck + +truck - + -.39->.49 - - -u +.40->truck + + +k - + -giulia - -giulia +.50 + - + -.40->giulia - - -a +acura->.50 + + +' ' .51 - + - + -.41->.51 - - -' ' +.42->.51 + + +r .52 - + .42->.52 - - -' ' + + +4 .53 - + - + -.43->.53 - - -i +.42->.53 + + 
+g .54 - + - + -.44->.54 - - -i +.43->.54 + + +u - + -.55 - +giulia + +giulia - + -.45->.55 - - -z +.44->giulia + + +a .56 - + - + -.46->.56 - - -o +.45->.56 + + +' ' - + -alfa 4c - -alfa 4c +.57 + - + -.47->alfa 4c - - -c +.46->.57 + + +' ' .58 - + - + -.48->.58 - - -i +.47->.58 + + +i .59 - + - + -.49->.59 - - -p +.48->.59 + + +i .60 - + - + -.51->.60 - - -s +.50->.60 + + +z .61 - + - + -.52->.61 - - -s +.51->.61 + + +o - + -.62 - +alfa 4c + +alfa 4c - + -.53->.62 - - -e +.52->alfa 4c + + +c .63 - + - + -.54->.63 - - -e +.53->.63 + + +i .64 - + - + -.55->.64 - - -d +.54->.64 + + +p .65 - + .56->.65 - - -m + + +s .66 - + - + -alfa 4c->.66 - - -' ' +.57->.66 + + +s .67 - + .58->.67 - - -u + + +e - + -4c coupe - -4c coupe +.68 + - + -.59->4c coupe - - -e +.59->.68 + + +e .69 - + .60->.69 - - -e + + +d .70 - + .61->.70 - - -e + + +m - + -1 series - -1 series +.71 + - + -.62->1 series - - -s +alfa 4c->.71 + + +' ' - + -2 series - -2 series +.72 + - + -.63->2 series - - -s +.63->.72 + + +u - + -acura zdx - -acura zdx +4c coupe + +4c coupe - + -.64->acura zdx - - -x +.64->4c coupe + + +e .74 - + .65->.74 - - -e + + +e .75 - + .66->.75 - - -c + + +e - + -.76 - +1 series + +1 series - + -.67->.76 - - -l +.67->1 series + + +s - + -.77 - +2 series + +2 series - + -.69->.77 - - -r +.68->2 series + + +s - + -.78 - +acura zdx + +acura zdx - + -.70->.78 - - -r +.69->acura zdx + + +x - + -alfa romeo - -alfa romeo +.79 + - + -.74->alfa romeo - - -o +.70->.79 + + +e .80 - + - + -.75->.80 - - -o +.71->.80 + + +c .81 - + - + -.76->.81 - - -i +.72->.81 + + +l .82 - + - + -.77->.82 - - -i +.74->.82 + + +r .83 - + - + -.78->.83 - - -i +.75->.83 + + +r - + -.84 - +alfa romeo + +alfa romeo - + -alfa romeo->.84 - - -' ' +.79->alfa romeo + + +o .85 - + .80->.85 - - -u + + +o - + -alfa giulia - -alfa giulia +.86 + - + -.81->alfa giulia - - -a +.81->.86 + + +i .87 - + .82->.87 - - -e + + +i .88 - + .83->.88 - - -e + + +i .89 - + - + -.84->.89 - - -4 +alfa romeo->.89 + + +' ' .90 - + - + 
-.84->.90 - - -g +.85->.90 + + +u - + -.91 - +alfa giulia + +alfa giulia - + -.85->.91 - - -p +.86->alfa giulia + + +a - + -bmw 1 series - -bmw 1 series +.92 + - + -.87->bmw 1 series - - -s +.87->.92 + + +e - + -bmw 2 series - -bmw 2 series +.93 + - + -.88->bmw 2 series - - -s +.88->.93 + + +e - + -alfa romeo 4c - -alfa romeo 4c +.94 + - + -.89->alfa romeo 4c - - -c +.89->.94 + + +4 .95 - + - + -.90->.95 - - -i +.89->.95 + + +g - + -alfa 4c coupe - -alfa 4c coupe +.96 + - + -.91->alfa 4c coupe - - -e +.90->.96 + + +p - + -.97 - +bmw 1 series + +bmw 1 series - + -alfa romeo 4c->.97 +.92->bmw 1 series + + +s + + + +bmw 2 series + +bmw 2 series + + + +.93->bmw 2 series + + +s + + + +alfa romeo 4c + +alfa romeo 4c + + + +.94->alfa romeo 4c + + +c + + + +.100 + + + + +.95->.100 + + +i + + + +alfa 4c coupe + +alfa 4c coupe + + + +.96->alfa 4c coupe + + +e + + + +.102 + + + + +alfa romeo 4c->.102 ' ' - - -.98 + + +.103 - - -.95->.98 - - + + +.100->.103 + + u - - -.99 + + +.104 - - -.97->.99 + + +.102->.104 c - - -.100 + + +.105 - - -.98->.100 + + +.103->.105 l - - -.101 + + +.106 - - -.99->.101 + + +.104->.106 o - - -.102 + + +.107 - - -.100->.102 + + +.105->.107 i - - -.103 + + +.108 - - -.101->.103 + + +.106->.108 u - + alfa romeo giulia alfa romeo giulia - - -.102->alfa romeo giulia + + +.107->alfa romeo giulia a - - -.105 + + +.110 - - -.103->.105 + + +.108->.110 p - + alfa romeo 4c coupe alfa romeo 4c coupe - - -.105->alfa romeo 4c coupe + + +.110->alfa romeo 4c coupe e diff --git a/tests/fixtures/synonyms.json b/tests/fixtures/synonyms.json index ca43e60..4d6733f 100644 --- a/tests/fixtures/synonyms.json +++ b/tests/fixtures/synonyms.json @@ -2,5 +2,6 @@ "alfa romeo": ["alfa"], "bmw": ["beemer", "bimmer"], "mercedes-benz": ["mercedes", "benz"], - "volkswagen": ["vw"] + "volkswagen": ["vw"], + "truck": ["trucks"] } diff --git a/tests/test_autocomplete.py b/tests/test_autocomplete.py index 5d949d2..0c12262 100644 --- a/tests/test_autocomplete.py +++ 
b/tests/test_autocomplete.py @@ -7,7 +7,7 @@ from pprint import pprint from fast_autocomplete.misc import read_csv_gen from fast_autocomplete import AutoComplete, DrawGraphMixin -from fast_autocomplete.dawg import FindStep +from fast_autocomplete.dwg import FindStep current_dir = os.path.dirname(os.path.abspath(__file__)) @@ -56,6 +56,8 @@ def get_words(path): words[word] = dict(line) if make not in words: words[make] = {"make": make} + + words['truck'] = {'make': 'truck'} return words @@ -266,6 +268,20 @@ def test_search_without_synonyms(self, word, max_cost, size, expected_results): 'expected_steps': STEP_DESCENDANTS_ONLY, 'expected_find_and_sort_results': [['type', 'type r']], }, + {'word': 'truck', + 'max_cost': 3, + 'size': 5, + 'expected_find_results': {0: [['truck']]}, + 'expected_steps': STEP_DESCENDANTS_ONLY, + 'expected_find_and_sort_results': [['truck']], + }, + {'word': 'trucks', + 'max_cost': 3, + 'size': 5, + 'expected_find_results': {0: [['truck']]}, + 'expected_steps': STEP_DESCENDANTS_ONLY, + 'expected_find_and_sort_results': [['truck']], + }, ] @@ -358,7 +374,7 @@ def test_prefix_autofill(self, word, expected_matched_prefix_of_last_word, print(f'node: {node}') print(f'expected_matched_words: {expected_matched_words}') print(f'matched_words: {matched_words}') - expected_node = auto_complete._dawg + expected_node = auto_complete._dwg for k in expected_node_path.split(','): expected_node = expected_node[k] assert expected_node is node