diff --git a/README.md b/README.md
index 005e00d..153df9d 100644
--- a/README.md
+++ b/README.md
@@ -63,6 +63,7 @@ DWG stands for Directed Word Graph. Here is an example DWG based on the "makes_m
 
 # Usage
 
 First of all lets start from your data. The library leaves it up to you how to prepare your data.
+If you want to go straight to the factory function that lets you use the library in its easiest and most common form, skip all of this and jump to the [sorting](#sorting) example.
 
 ## Example 1
 
@@ -247,6 +248,88 @@
 converted to contexts:
 [[{'year': '2007'}, {'make': alfa romeo'}], [{'year': '2007'}, {'make': alfa romeo'}, {'location': 'los angeles'}]]
 ```
+## Sorting
+
+Most people who use Fast Autocomplete want to control how results are sorted. If you don't control that, the results are sorted in the order that Autocomplete found the matching nodes in the graph.
+
+The easiest way to sort is to give each item a count.
+
+For example:
+
+1. Make a json file that is a dictionary of words to their context, display value and count.
+
+The format of the file needs to be:
+
+```json
+
+{
+    word: [
+        context,
+        display value,
+        count
+    ]
+}
+```
+
+An example is included in this repo at `tests/fixtures/sample_words.json`:
+
+```json
+{
+    "acura rlx": [
+        {
+            "model": "rlx",
+            "make": "acura"
+        },
+        "Acura RLX",
+        3132
+    ],
+    "rlx": [
+        {
+            "model": "rlx",
+            "make": "acura"
+        },
+        "Acura RLX",
+        3132
+    ],
+    "acura": [
+        {
+            "make": "acura"
+        },
+        "Acura",
+        130123
+    ],
+    ...
+}
+```
+
+You might be wondering why things are in this format. It is to save space, since this json can easily become very big and the keys become repetitive. That's why we are using a list with a predefined order of fields instead of a dictionary. For your use case, for now, you can leave the context and display values as None.
+
+2. Launch Autocomplete via the factory function:
+
+```py
+from fast_autocomplete import autocomplete_factory
+
+content_files = {
+    'words': {
+        'filepath': 'path/to/sample_words.json',
+        'compress': True  # means compress the graph data in memory
+    }
+}
+
+autocomplete = autocomplete_factory(content_files=content_files)
+```
+
+3. Now you can use Autocomplete and the results are ordered by count!
+
+
+```py
+>>> autocomplete.search(word='acu', size=3)
+[['acura'], ['acura mdx'], ['acura rdx']]
+```
+
+4. How do we use the context and display value now?
+
+Great question. You need to extend the AutoComplete class to use these items. I will write a blog post about it.
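+
+Until then, here is a minimal sketch of the idea. It assumes the `words` mapping you pass to the factory is kept on the instance as `self.words` (that attribute name is an assumption about AutoComplete's internals) and that the data was loaded with `compress: True`, so each value is the `WordValue` namedtuple from `fast_autocomplete/loader.py` with `context`, `display` and `count` fields:
+
+```py
+from fast_autocomplete import AutoComplete
+
+
+class AutoCompleteWithDisplay(AutoComplete):
+    """Hypothetical subclass that exposes the stored display value of a word."""
+
+    def get_display(self, word):
+        # Assumes the word -> WordValue mapping is available as self.words.
+        word_value = self.words.get(word)
+        return word_value.display if word_value else None
+```
+
+You would hand this subclass to the factory through its `module` parameter, the same way the tests pass a custom class: `autocomplete_factory(content_files=content_files, module=AutoCompleteWithDisplay)`.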
 
 ## Draw
 
diff --git a/fast_autocomplete/__init__.py b/fast_autocomplete/__init__.py
index 84f095d..067e8cb 100644
--- a/fast_autocomplete/__init__.py
+++ b/fast_autocomplete/__init__.py
@@ -11,4 +11,5 @@
 from fast_autocomplete.dwg import AutoComplete
 from fast_autocomplete.draw import DrawGraphMixin
 from fast_autocomplete.demo import demo
+from fast_autocomplete.loader import autocomplete_factory
 from fast_autocomplete.normalize import normalize_node_name, remove_any_special_character
diff --git a/fast_autocomplete/loader.py b/fast_autocomplete/loader.py
new file mode 100644
index 0000000..7f6ed29
--- /dev/null
+++ b/fast_autocomplete/loader.py
@@ -0,0 +1,139 @@
+import os
+import gzip
+import json
+import logging
+try:
+    from redis import StrictRedis
+except ImportError:
+    StrictRedis = None
+
+from typing import Any, Dict, List, NamedTuple, Optional, Set, Tuple, Union
+from fast_autocomplete import AutoComplete
+
+
+def read_local_dump(filepath: str):
+    with open(filepath, 'r') as the_file:
+        return the_file.read()
+
+
+def _simple_compress(item: str, hash_to_val: Dict[int, str]) -> str:
+    item_hash = hash(item)
+    if item_hash in hash_to_val:
+        item = hash_to_val[item_hash]
+    else:
+        hash_to_val[item_hash] = item
+    return item
+
+
+class WordValue(NamedTuple):
+    context: Any
+    display: Any
+    count: int = 0
+    original_key: 'WordValue' = None
+
+    def get(self, key: str, default: Optional[str] = None) -> str:
+        result = getattr(self, key)
+        if result is None:
+            result = default
+        return result
+
+
+def get_all_content(content_files, redis_client=None, redis_key_prefix=None, logger=None):
+    """
+    Get all content that is needed to initialize Autocomplete.
+
+    :param: redis_client (optional) If passed, it tries to load from Redis if there is already cached data
+    """
+    kwargs = {}
+    for key, info in content_files.items():
+        kwargs[key] = get_data(
+            filepath=info['filepath'],
+            compress=info['compress'],
+            redis_client=redis_client,
+            redis_key_prefix=redis_key_prefix,
+            logger=logger
+        )
+    if logger:
+        kwargs['logger'] = logger
+    return kwargs
+
+
+def get_data(filepath: str, compress: bool = False,
+             redis_client: Optional[StrictRedis] = None,
+             redis_key_prefix: Optional[str] = None,
+             logger: Optional[logging.RootLogger] = None) -> Dict[str, List[str]]:
+    data_json = None
+    filename = os.path.basename(filepath)
+    if redis_client and redis_key_prefix:
+        key = redis_key_prefix.format(filename)
+        try:
+            data_json = redis_client.get(key)
+        except Exception:
+            if logger:
+                logger.exception('Unable to get the search graph words from Redis.')
+            else:
+                print('Unable to get the search graph words from Redis.')
+        if data_json:
+            data_json = gzip.decompress(data_json).decode('utf-8')
+    if not data_json:
+        data_json = read_local_dump(filepath)
+    data = json.loads(data_json)
+
+    if compress:
+        hash_to_val = {}
+
+        for word, value in data.items():
+            context, display, count = value
+            display = _simple_compress(item=display, hash_to_val=hash_to_val)
+            for key, val in context.items():
+                context[key] = _simple_compress(
+                    item=context[key], hash_to_val=hash_to_val
+                )
+            data[word] = WordValue(context=context, display=display, count=count)
+
+    return data
+
+
+def populate_redis(content_files, redis_client, redis_cache_prefix):
+    """
+    Populate Redis with data based on the local files
+    """
+    for key, info in content_files.items():
+        filename = os.path.basename(info['filepath'])
+        redis_key = redis_cache_prefix.format(filename)
+        data = read_local_dump(info['filepath'])
+        compressed = gzip.compress(data.encode('utf-8'))
+        redis_client.set(redis_key, compressed)
+
+
+def autocomplete_factory(
+        content_files, redis_client=None, module=AutoComplete, logger=None
+):
+    """
+    Factory function to initialize the proper Autocomplete object
+
+    :param: content_files: The file paths and options where data is stored.
+
+    Example
+
+    content_files = {
+        'synonyms': {
+            'filepath': 'path/to/synonyms.json',
+            'compress': False
+        },
+        'words': {
+            'filepath': 'path/to/words.json',
+            'compress': True
+        },
+        'full_stop_words': {
+            'filepath': 'path/to/full_stop_words.json',
+            'compress': False
+        }
+    }
+
+    :param: redis_client: (optional) If passed, the factory function tries to load the data from Redis
+        and if that fails, it will load the local data.
+    :param: module: (optional) The AutoComplete module to initialize
+    """
+    kwargs = get_all_content(content_files, redis_client=redis_client, logger=logger)
+    return module(**kwargs)
diff --git a/tests/fixtures/sample_words.json b/tests/fixtures/sample_words.json
new file mode 100644
index 0000000..3e93ce8
--- /dev/null
+++ b/tests/fixtures/sample_words.json
@@ -0,0 +1,97 @@
+{
+    "acura rlx": [
+        {
+            "model": "rlx",
+            "make": "acura"
+        },
+        "Acura RLX",
+        3132
+    ],
+    "rlx": [
+        {
+            "model": "rlx",
+            "make": "acura"
+        },
+        "Acura RLX",
+        3132
+    ],
+    "acura": [
+        {
+            "make": "acura"
+        },
+        "Acura",
+        130123
+    ],
+    "acura rlx sport hybrid": [
+        {
+            "model": "rlx sport hybrid",
+            "make": "acura"
+        },
+        "Acura RLX Sport Hybrid",
+        4
+    ],
+    "rlx sport hybrid": [
+        {
+            "model": "rlx sport hybrid",
+            "make": "acura"
+        },
+        "Acura RLX Sport Hybrid",
+        4
+    ],
+    "acura ilx": [
+        {
+            "model": "ilx--ilx hybrid",
+            "make": "acura"
+        },
+        "Acura ILX",
+        19936
+    ],
+    "ilx": [
+        {
+            "model": "ilx--ilx hybrid",
+            "make": "acura"
+        },
+        "Acura ILX",
+        19936
+    ],
+    "acura mdx": [
+        {
+            "model": "mdx",
+            "make": "acura"
+        },
+        "Acura MDX",
+        35290
+    ],
+    "mdx": [
+        {
+            "model": "mdx",
+            "make": "acura"
+        },
+        "Acura MDX",
+        35290
+    ],
+    "acura nsx": [
+        {
+            "model": "nsx",
+            "make": "acura"
+        },
+        "Acura NSX",
+        271
+    ],
+    "nsx": [
+        {
+            "model": "nsx",
+            "make": "acura"
+        },
+        "Acura NSX",
+        271
+    ],
+    "acura rdx": [
+        {
+            "model": "rdx",
+            "make": "acura"
+        },
+        "Acura RDX",
+        33905
+    ]
+}
diff --git a/tests/test_loader.py b/tests/test_loader.py
new file mode 100644
index 0000000..1ecaf18
--- /dev/null
+++ b/tests/test_loader.py
@@ -0,0 +1,37 @@
+import os
+import pytest
+from fast_autocomplete import autocomplete_factory, AutoComplete
+
+current_dir = os.path.dirname(os.path.abspath(__file__))
+fixture_dir = os.path.join(current_dir, 'fixtures')
+
+content_files = {
+    'words': {
+        'filepath': os.path.join(fixture_dir, 'sample_words.json'),
+        'compress': True  # means compress the graph data in memory
+    }
+}
+
+autocomplete = autocomplete_factory(content_files=content_files)
+
+
+class AutoCompleteIgnoreCount(AutoComplete):
+    SHOULD_INCLUDE_COUNT = False
+
+
+autocomplete_ignore_count = autocomplete_factory(content_files=content_files, module=AutoCompleteIgnoreCount)
+
+
+class TestLoader:
+
+    @pytest.mark.parametrize('word, expected_result, expected_unsorted_result', [
+        ('acu',
+         [['acura'], ['acura mdx'], ['acura rdx']],
+         [['acura'], ['acura rlx'], ['acura rdx']]),
+    ])
+    def test_loader(self, word, expected_result, expected_unsorted_result):
+        result = autocomplete.search(word=word, size=3)
+        assert expected_result == result
+
+        result = autocomplete_ignore_count.search(word=word, size=3)
+        assert expected_unsorted_result == result