helper.py
""" Contains helper classes/functions for searching results in index """
import lsh_search_engine
import pickle
import os
from pathlib import Path
from collections import OrderedDict


class LRUCache:
    """ Class to implement an LRU cache """

    def __init__(self, capacity: int):
        """ Initializes an ordered dictionary and the cache capacity """
        self.cache = OrderedDict()
        self.capacity = capacity

    def get(self, key: int) -> int:
        """
        Returns the value of the queried key in O(1), or -1 if the key is not
        found in the dict / cache. Moves the key to the end to mark that it
        was recently used.
        """
        if key not in self.cache:
            return -1
        self.cache.move_to_end(key)
        return self.cache[key]

    def put(self, key: int, value: int) -> None:
        """
        Adds/updates the key and moves it to the end to mark that it was
        recently used. If the length of the ordered dictionary exceeds the
        cache capacity, removes the first key (the least recently used one).
        """
        self.cache[key] = value
        self.cache.move_to_end(key)
        if len(self.cache) > self.capacity:
            self.cache.popitem(last=False)
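

# Uncomment to try LRUCache (a minimal sketch with made-up keys/values,
# unrelated to the dataset):
# cache = LRUCache(2)
# cache.put(1, 100)
# cache.put(2, 200)
# print(cache.get(1))  # 100; key 1 is now the most recently used
# cache.put(3, 300)    # evicts key 2, the least recently used entry
# print(cache.get(2))  # -1, since key 2 was evicted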


def get_data_for_docId(docId):
    """ Given a docId, returns the document (species) name and its DNA sequence """
    files = os.listdir(lsh_search_engine.DATASET_PATH)
    files.sort()
    doc_name = files[docId]
    with open(lsh_search_engine.DATASET_PATH + "/" + doc_name, "r") as document:
        dna_seq = document.read()
    return doc_name, dna_seq
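

# Uncomment to try get_data_for_docId (assumes the dataset directory at
# lsh_search_engine.DATASET_PATH is populated; docId 0 maps to the first
# file in sorted order):
# name, seq = get_data_for_docId(0)
# print(name, seq[:50])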


def process_query(query):
    """
    Takes a `query` string and returns the buckets formed by performing LSH on it.
    """
    # Create shingle matrix for the query
    # (assumes the shingles in the query are a subset of the shingles in the dataset)
    with open("shingles.pkl", "rb") as file_obj:
        shingles = pickle.load(file_obj)
    with open("shingle_id.pkl", "rb") as file_obj:
        shingles_dict = pickle.load(file_obj)
    shingle_matrix = lsh_search_engine.create_matrix_query(
        shingles, shingles_dict, query
    )
    # Create Signature Matrix by Minhashing
    signature_matrix = lsh_search_engine.get_signature_matrix(shingle_matrix)
    # Perform Locality Sensitive Hashing
    query_buckets = lsh_search_engine.lsh(signature_matrix)
    return query_buckets
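

# Uncomment to try process_query (assumes shingles.pkl and shingle_id.pkl
# already exist in the working directory; the query below is a made-up
# example sequence, not taken from the dataset):
# query_buckets = process_query("ATGCGTACGTTAGC")
# print(query_buckets)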


def perform_lsh():
    """
    Uses LSH Search Engine package functions to perform LSH on the given dataset:
    Shingles -> Minhashing -> LSH
    """
    # Create Shingle Matrix
    shingles = lsh_search_engine.create_shingles_dataset()
    shingle_matrix = lsh_search_engine.create_shingle_matrix(shingles)
    # Create Signature Matrix by Minhashing
    signature_matrix = lsh_search_engine.get_signature_matrix(shingle_matrix)
    # Perform Locality Sensitive Hashing
    docs_buckets = lsh_search_engine.lsh(signature_matrix)
    return docs_buckets
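

# Uncomment to build the buckets for the whole dataset (assumes the dataset
# directory at lsh_search_engine.DATASET_PATH is populated):
# docs_buckets = perform_lsh()
# print(docs_buckets)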


def find_similar_docs(query_buckets, docs_buckets):
    """
    Given the dataset buckets `docs_buckets` and the buckets `query_buckets`
    formed by the query, finds all documents in `docs_buckets` that share a
    bucket with the query.
    """
    similar_docs = set()
    for q_band_key in query_buckets.keys():
        for q_bucket_idx, q_bucket_docs in query_buckets[q_band_key].items():
            if q_bucket_docs:
                if (
                    q_band_key in docs_buckets
                    and q_bucket_idx in docs_buckets[q_band_key]
                ):
                    similar_docs.update(docs_buckets[q_band_key][q_bucket_idx])
    return similar_docs
# Uncomment to test find_similar_docs
# qb = {
# 1: {1: [0], 3: [0]},
# 2: {4: [0]},
# }
# db = {
# 1: {1: [0, 1, 2], 2: [3, 4]},
# 2: {4: [6]}
# }
# print(find_similar_docs(qb, db))
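
# Uncomment for an end-to-end run (a sketch assuming the dataset at
# lsh_search_engine.DATASET_PATH and the pickled shingle files are available;
# the query string is an arbitrary example):
# docs_buckets = perform_lsh()
# query_buckets = process_query("ATGCGTACGT")
# for doc_id in find_similar_docs(query_buckets, docs_buckets):
#     doc_name, dna_seq = get_data_for_docId(doc_id)
#     print(doc_name, dna_seq[:50])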