Skip to content

Commit

Permalink
adding sorting example
Browse files Browse the repository at this point in the history
  • Loading branch information
seperman committed Oct 24, 2019
1 parent 69e0ad5 commit 767b9e0
Show file tree
Hide file tree
Showing 5 changed files with 357 additions and 0 deletions.
83 changes: 83 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ DWG stands for Directed Word Graph. Here is an example DWG based on the "makes_m
# Usage

First of all, let's start with your data. The library leaves it up to you how to prepare your data.
If you want to go straight to the factory function that lets you use the library in its easiest and most common case, skip all of this and jump to the [sort](#sort) example.

## Example 1

Expand Down Expand Up @@ -247,6 +248,88 @@ converted to contexts:
[[{'year': '2007'}, {'make': 'alfa romeo'}], [{'year': '2007'}, {'make': 'alfa romeo'}, {'location': 'los angeles'}]]
```

## Sorting

Most people who use Fast Autocomplete want to control how results are sorted. If you don't control that, the results will be sorted in the order in which Autocomplete found the nodes in the graph that matched the criteria.

The easiest way to sort is to give each item a count.

For example:

1. Make a json file that is a dictionary of words to their context.

The format of the file needs to be:

```json

{
word: [
context,
display value,
count
]
}
```

An example is included in [tests/fixtures/sample_words.json](tests/fixtures/sample_words.json):

```json
{
"acura rlx": [
{
"model": "rlx",
"make": "acura"
},
"Acura RLX",
3132
],
"rlx": [
{
"model": "rlx",
"make": "acura"
},
"Acura RLX",
3132
],
"acura": [
{
"make": "acura"
},
"Acura",
130123
],
...
}
```

You might be wondering why things are in this format. It is to save space, since this JSON can easily become very big and the keys would become repetitive. That's why we are using a list with a predefined order of keys. For your use case, for now, you can leave the context and display values as `null` (Python `None`).

2. Launch Autocomplete via the factory function:

```py
from fast_autocomplete import autocomplete_factory

content_files = {
'words': {
'filepath': 'path/to/sample_words.json',
'compress': True # means compress the graph data in memory
}
}

autocomplete = autocomplete_factory(content_files=content_files)
```

3. You can use Autocomplete and the results are ordered by count!


```py
>>> autocomplete.search(word='acu')
[['acura'], ['acura mdx'], ['acura rdx']]
```

4. How do we use the context and display value now?

Great question. You need to extend the AutoComplete class to use these items. I will write a blog post about it.

## Draw

Expand Down
1 change: 1 addition & 0 deletions fast_autocomplete/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@
from fast_autocomplete.dwg import AutoComplete
from fast_autocomplete.draw import DrawGraphMixin
from fast_autocomplete.demo import demo
from fast_autocomplete.loader import autocomplete_factory
from fast_autocomplete.normalize import normalize_node_name, remove_any_special_character
139 changes: 139 additions & 0 deletions fast_autocomplete/loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
import os
import gzip
import json
import logging
try:
from redis import StrictRedis
except ImportError:
StrictRedis = None

from typing import Any, Dict, List, NamedTuple, Optional, Set, Tuple, Union
from fast_autocomplete import AutoComplete


def read_local_dump(filepath: str) -> str:
    """
    Read a local content file and return its raw text.

    The file is always decoded as UTF-8 instead of the platform's default
    encoding, so the text matches what populate_redis encodes and what
    get_data decodes from Redis (both use UTF-8 explicitly).

    :param: filepath: path of the file to read
    :return: the whole file content as a string
    """
    with open(filepath, 'r', encoding='utf-8') as the_file:
        return the_file.read()


def _simple_compress(item: str, hash_to_val: Dict[int, str]) -> str:
item_hash = hash(item)
if item_hash in hash_to_val:
item = hash_to_val[item_hash]
else:
hash_to_val[item_hash] = item
return item


class WordValue(NamedTuple):
    """
    The value stored for one word: its context, display value and count.

    It offers a dict-like ``get`` so a field can be looked up by name with a
    fallback value.
    """
    # First element of a word's entry in the content file.
    context: Any
    # Second element of a word's entry in the content file.
    display: Any
    # Third element of a word's entry in the content file.
    count: int = 0
    # NOTE(review): not set anywhere in this module; meaning to be confirmed against AutoComplete.
    original_key: Optional['WordValue'] = None

    def get(self, key: str, default: Any = None) -> Any:
        """
        Return the field called ``key``, or ``default``.

        ``default`` is returned when the field is None and also when ``key``
        is not one of the tuple's fields. The latter matches ``dict.get`` and
        keeps tuple internals such as the ``index`` method from leaking out
        as if they were data.

        :param: key: the field name to look up
        :param: default: the value to return when there is nothing to return for ``key``
        :return: the field value or ``default``
        """
        if key not in self._fields:
            return default
        result = getattr(self, key)
        return default if result is None else result


def get_all_content(content_files, redis_client=None, redis_key_prefix=None, logger=None):
    """
    Load every content file and gather the results as keyword arguments for Autocomplete.

    :param: content_files: dictionary of argument name to a dictionary that has
        the 'filepath' and 'compress' options of that file
    :param: redis_client: (optional) If passed, it tries to load from Redis if there is already cached data
    :param: redis_key_prefix: (optional) passed as is to get_data
    :param: logger: (optional) passed to get_data and, when truthy, also added
        to the result under the 'logger' key
    :return: dictionary of argument name to the loaded data
    """
    loaded = {
        name: get_data(
            filepath=options['filepath'],
            compress=options['compress'],
            redis_client=redis_client,
            redis_key_prefix=redis_key_prefix,
            logger=logger,
        )
        for name, options in content_files.items()
    }
    if logger:
        loaded['logger'] = logger
    return loaded


def get_data(filepath: str, compress: bool = False,
             redis_client: Optional[StrictRedis] = None,
             redis_key_prefix: Optional[str] = None,
             logger: Optional[logging.Logger] = None) -> Dict[str, Any]:
    """
    Load one content file, trying a gzip compressed copy in Redis first.

    Redis is only consulted when both ``redis_client`` and ``redis_key_prefix``
    are given. Any failure while fetching or decompressing the cached copy is
    reported and the local file is used instead.

    :param: filepath: path of the local JSON file. Its base name is also used to build the Redis key.
    :param: compress: when True, every value is expected to be
        [context, display, count]. It is converted to a WordValue and equal
        display and context values are made to share one object in memory.
        When False the parsed JSON is returned as is.
    :param: redis_client: (optional) client that is used to read the cached copy
    :param: redis_key_prefix: (optional) format string. The base name of the file
        is passed to its ``format`` to build the Redis key.
    :param: logger: (optional) used to report Redis failures. Without it the failure is printed.
    :return: dictionary of word to its value
    """
    data_json = None
    filename = os.path.basename(filepath)
    if redis_client and redis_key_prefix:
        redis_key = redis_key_prefix.format(filename)
        try:
            cached = redis_client.get(redis_key)
            if cached:
                # Decompress inside the try so that a corrupt cache entry
                # falls back to the local file instead of crashing the load.
                data_json = gzip.decompress(cached).decode('utf-8')
        except Exception:
            # Best effort: Redis is only a cache, the local file is the source of truth.
            if logger:
                logger.exception('Unable to get the search graph words from Redis.')
            else:
                print('Unable to get the search graph words from Redis.')
    if not data_json:
        data_json = read_local_dump(filepath)
    data = json.loads(data_json)

    if compress:
        hash_to_val = {}

        # Only values of existing keys are replaced, so the dictionary does
        # not change size while it is being iterated.
        for word, (context, display, count) in data.items():
            display = _simple_compress(item=display, hash_to_val=hash_to_val)
            # The context can be null in the JSON when the user only cares about the count.
            if context:
                for field in context:
                    context[field] = _simple_compress(
                        item=context[field], hash_to_val=hash_to_val
                    )
            data[word] = WordValue(context=context, display=display, count=count)

    return data


def populate_redis(content_files, redis_client, redis_cache_prefix):
    """
    Store a gzip compressed copy of every local content file in Redis.

    :param: content_files: dictionary of argument name to a dictionary that has the 'filepath' of that file
    :param: redis_client: the client used to set the keys
    :param: redis_cache_prefix: format string. The base name of each file is
        passed to its ``format`` to build the Redis key.
    """
    for options in content_files.values():
        source_path = options['filepath']
        redis_key = redis_cache_prefix.format(os.path.basename(source_path))
        raw_text = read_local_dump(source_path)
        redis_client.set(redis_key, gzip.compress(raw_text.encode('utf-8')))


def autocomplete_factory(
    content_files, redis_client=None, module=AutoComplete, logger=None,
    redis_key_prefix=None
):
    """
    Factory function to initialize the proper Autocomplete object
    :param: content_files: The file paths and options where data is stored.
        Every key is passed to the module as a keyword argument with the loaded data as its value.
        Example
        content_files = {
            'synonyms': {
                'filepath': 'path/to/synonyms.json',
                'compress': False
            },
            'words': {
                'filepath': 'path/to/words.json',
                'compress': True
            },
            'full_stop_words': {
                'filepath': 'path/to/full_stop_words.json',
                'compress': False
            }
        }
    :param: redis_client: (optional) If passed along with redis_key_prefix, the factory function
        tries to load the data from Redis and if that fails, it will load the local data.
    :param: module: (optional) The AutoComplete module to initialize
    :param: logger: (optional) Used while loading the data and, when truthy, also passed to the module.
    :param: redis_key_prefix: (optional) Format string that gets the base name of each file
        to build its Redis key, for example 'autocomplete:{}'. It should be the same value
        that was given to populate_redis as redis_cache_prefix. Redis is not used without it.
    """
    # The prefix has to be forwarded: get_data ignores redis_client when there is no key prefix.
    kwargs = get_all_content(
        content_files,
        redis_client=redis_client,
        redis_key_prefix=redis_key_prefix,
        logger=logger,
    )
    return module(**kwargs)
97 changes: 97 additions & 0 deletions tests/fixtures/sample_words.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
{
"acura rlx": [
{
"model": "rlx",
"make": "acura"
},
"Acura RLX",
3132
],
"rlx": [
{
"model": "rlx",
"make": "acura"
},
"Acura RLX",
3132
],
"acura": [
{
"make": "acura"
},
"Acura",
130123
],
"acura rlx sport hybrid": [
{
"model": "rlx sport hybrid",
"make": "acura"
},
"Acura RLX Sport Hybrid",
4
],
"rlx sport hybrid": [
{
"model": "rlx sport hybrid",
"make": "acura"
},
"Acura RLX Sport Hybrid",
4
],
"acura ilx": [
{
"model": "ilx--ilx hybrid",
"make": "acura"
},
"Acura ILX",
19936
],
"ilx": [
{
"model": "ilx--ilx hybrid",
"make": "acura"
},
"Acura ILX",
19936
],
"acura mdx": [
{
"model": "mdx",
"make": "acura"
},
"Acura MDX",
35290
],
"mdx": [
{
"model": "mdx",
"make": "acura"
},
"Acura MDX",
35290
],
"acura nsx": [
{
"model": "nsx",
"make": "acura"
},
"Acura NSX",
271
],
"nsx": [
{
"model": "nsx",
"make": "acura"
},
"Acura NSX",
271
],
"acura rdx": [
{
"model": "rdx",
"make": "acura"
},
"Acura RDX",
33905
]
}
37 changes: 37 additions & 0 deletions tests/test_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import os
import pytest
from fast_autocomplete import autocomplete_factory, AutoComplete

# The fixtures live in the 'fixtures' directory next to this test module.
current_dir = os.path.dirname(os.path.abspath(__file__))
fixture_dir = os.path.join(current_dir, 'fixtures')

# Same shape that autocomplete_factory expects: argument name -> file options.
content_files = {
    'words': {
        'filepath': os.path.join(fixture_dir, 'sample_words.json'),
        'compress': True  # means compress the graph data in memory
    }
}

# Built once when the module is imported and reused by the tests below.
autocomplete = autocomplete_factory(content_files=content_files)


class AutoCompleteIgnoreCount(AutoComplete):
    # NOTE(review): presumably this stops AutoComplete from ordering the results
    # by count; the flag is read by AutoComplete, which is not visible here.
    SHOULD_INCLUDE_COUNT = False


# Same fixture data as `autocomplete`, loaded into the subclass via the `module` argument.
autocomplete_ignore_count = autocomplete_factory(content_files=content_files, module=AutoCompleteIgnoreCount)


class TestLoader:
    """Tests for the objects that autocomplete_factory built from the sample fixture."""

    @pytest.mark.parametrize(
        'word, expected_result, expected_unsorted_result',
        [
            (
                'acu',
                [['acura'], ['acura mdx'], ['acura rdx']],
                [['acura'], ['acura rlx'], ['acura rdx']],
            ),
        ],
    )
    def test_loader(self, word, expected_result, expected_unsorted_result):
        """Search the same word with the default class and with the subclass that has SHOULD_INCLUDE_COUNT off."""
        sorted_result = autocomplete.search(word=word, size=3)
        assert expected_result == sorted_result

        unsorted_result = autocomplete_ignore_count.search(word=word, size=3)
        assert expected_unsorted_result == unsorted_result

0 comments on commit 767b9e0

Please sign in to comment.