-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
179 lines (154 loc) · 7.04 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
from __future__ import annotations
from os.path import dirname
from pandas import DataFrame
import bPlusTree
import lsh
import pandas
import matplotlib.pyplot as plt
# The maximum number of keys each node can hold
order = 8
# number of hash functions
hash_num = 100
# number of buckets (in a row of buckets array)
k = 40
# k for k-shingles
s_k = 5
MYDIR = dirname(__file__) # gives back your directory path
def main():
query_type = ""
dictionary = []
# to keep track of which text we are searching into
point = 0
print("\nPreprocessing...")
# get directory
path_of_docs = MYDIR + '/Datasets/corpus20090418/'
documents = bPlusTree.docs_to_search(path_of_docs)
# for every document in directory
for doc in documents[1]:
# open file
file = open(doc, "r", encoding="UTF-8", errors='ignore')
# read file
words = file.read()
# text to list
raw_dictionary = list(words.split())
# words preprocessing
doc_dictionary = [bPlusTree.Preprocessing(raw_dictionary), documents[0][point]]
point += 1
dictionary.append(doc_dictionary)
# create b+ tree
tree = bPlusTree.bplustree(dictionary, order)
# queries
while query_type != "r" and query_type != "e" and query_type != "q":
print("\nType 'r' for range query")
print("Type 'e' for exact query")
print("Type 'q' for exit")
query_type = input("Please select the type of the query:")
# sim_docs list contains:
# - in the preprocessing phase: only the documents names in which the query found
# - at the beginning of LSH (on each row): · the name of one documents in which the query is found
# · all the words of the document as one string variable
sim_docs = []
# if user selects "range query"
if query_type == "r":
l_bound = input("Please type the lower bound: ").lower()
u_bound = input("Please type the upper bound: ").lower()
# if not given in the correct order, then change the order
if l_bound > u_bound:
temp = u_bound
u_bound = l_bound
l_bound = temp
# search_results: list with all the documents for words in the given range
# contains duplicates
search_results = tree.retrieve(l_bound, u_bound, query_type)
# if non empty
if search_results:
# for each row of search_results
for word in range(len(search_results)):
# for each document in document list of this row
for doc_list in range(len(search_results[word][1])):
# check for duplicates
if search_results[word][1][doc_list] not in sim_docs:
# append only unique documents in list
sim_docs.append(search_results[word][1][doc_list])
print("sim_docs", sim_docs)
query_type = ""
# if user selects "exact query"
elif query_type == 'e':
l_bound = input("Please type the word you are looking for: ").lower()
u_bound = l_bound
search_results = tree.retrieve(l_bound, u_bound, query_type)
# if non empty
if search_results:
# for each row (document name) of search_results
for i, word in enumerate(search_results):
# check for duplicates
if word not in sim_docs:
# append only unique documents in list
sim_docs.append(search_results[i])
query_type = ""
elif query_type == 'q':
return
# if keywords found only in one document
if len(sim_docs) == 1:
print("Only 1 document found. We cannot perform LSH algorithm for similarities.")
# skip this while loop and print menu
continue
if sim_docs:
for i, text in enumerate(sim_docs):
for d_row in dictionary:
if text == d_row[1]:
joined_row = ' '.join(d_row[0])
sim_docs[i] = [sim_docs[i], joined_row]
# read documents, preprocessing, hashed shingles creation
hashed_shingles, docs = lsh.create_shingles(sim_docs, s_k)
# create input matrix with 0/1 for hashed shingles and documents
inp_mtrx = lsh.create_input_matrix(hashed_shingles, docs)
# create signature matrix given the input matrix
sign_mtrx = lsh.minHash(inp_mtrx, docs, hash_num)
# for r in range(len(sign_mtrx)):
# print("sign_mtrx:", r, "-->", sign_mtrx[r])
# create bands given the signature matrix
bands = lsh.create_bands(sign_mtrx)
# print("bands: ")
# for r in range(len(bands)):
# print(r, "-->", end='')
# for s_r in range(len(bands[r][1])):
# if s_r == 0:
# print(bands[r][1][s_r])
# else:
# print("\t", bands[r][1][s_r])
# create buckets given the bands
buckets = lsh.create_hash_table(bands, k)
# for r in range(len(buckets)):
# print("buckets:", r, "-->", buckets[r])
# find candidate pairs
cand_pairs = lsh.candidate_column_pairs(buckets)
# for r in range(len(cand_pairs)):
# print("cand_pairs:", r, "-->", cand_pairs[r])
# calculate pairs similarities (Jaccard similarity)
similarities = lsh.document_similarities(cand_pairs, sign_mtrx, docs)
# DataFrame for similarities
similarities_df = DataFrame(similarities, columns=['First_document', 'Second_document', 'Similarity_with_signatures'])
# option to print all rows and columns of DataFrame
pandas.set_option('display.max_rows', None)
pandas.set_option('display.max_columns', None)
pandas.set_option('display.width', None)
word_js = []
for r in range(len(similarities)):
for d_r, d_i in enumerate(dictionary):
if dictionary[d_r][1] == similarities[r][0]:
l1_list = dictionary[d_r][0]
if dictionary[d_r][1] == similarities[r][1]:
l2_list = dictionary[d_r][0]
word_js.append(lsh.jaccard_similarity(l1_list, l2_list))
similarities_df.insert(3, 'Similarity_with_words', word_js, True)
print(similarities_df)
# plot chart
ax = plt.gca()
plt.xlabel('Number of document pair')
plt.ylabel('similarity')
similarities_df.plot(kind='line', y='Similarity_with_signatures', ax=ax)
similarities_df.plot(kind='line', y='Similarity_with_words', color='red', ax=ax)
plt.show()
if __name__ == "__main__":
main()