-
Notifications
You must be signed in to change notification settings - Fork 0
/
search.py
143 lines (122 loc) · 4.73 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
from __future__ import division
import sys
import getopt
import os
import math
import pickle
import operator
from operator import itemgetter
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.porter import *
# Number of top-ranked documents returned for each query.
K = 10
# Fixed character width of one encoded (doc_id, term_freq) posting in the
# postings file; used to compute how many characters to read per term.
# NOTE(review): name says "bits" but it is used as a character count in
# postings_lists.read() — confirm against the index-construction script.
bits_per_posting = 30
# For each line in query file, performs query and outputs result
def search_index(dictionary_file, postings_file, queries_file, output_file):
    """Run every query in queries_file against the index and write results.

    Parameters:
        dictionary_file: path to the pickled term dictionary.
        postings_file: path to the on-disk postings lists.
        queries_file: path to a file with one free-text query per line.
        output_file: path to write results to (one line of doc ids per query).

    A blank query line produces a blank output line. No trailing newline is
    written after the final query's results.
    """
    # Context managers guarantee every handle is closed; the original code
    # never closed the dictionary and queries handles at all.
    with open(dictionary_file, 'rb') as dict_fh:
        dictionary = pickle.load(dict_fh)
    with open(queries_file, 'r') as queries_fh:
        queries_list = queries_fh.read().split("\n")
    with open(postings_file, 'r') as postings_lists, \
         open(output_file, 'w') as search_results:
        for index, query in enumerate(queries_list):
            # in case blank line is caught as a query, write an empty line
            if query == "":
                search_results.write("\n")
            else:
                scores_dict = get_scores_dict(query, dictionary, postings_lists)
                top_results = get_top_results(scores_dict)
                search_results.write(stringify(top_results))
                # Separate queries with newlines, but do not end the file
                # with a trailing newline.
                if index != len(queries_list) - 1:
                    search_results.write("\n")
def get_top_results(scores_dict):
    """Return the K highest-scoring (doc_id, score) pairs.

    Results are ordered by descending score; because the second sort is
    stable, documents with equal scores keep ascending doc_id order.
    """
    ranked = sorted(scores_dict.items(), key=operator.itemgetter(0))
    ranked.sort(key=operator.itemgetter(1), reverse=True)
    return ranked[:K]
def get_scores_dict(query_str, dictionary, postings_lists):
    """Compute tf-idf similarity scores for every document matching a query.

    Parameters:
        query_str: the raw query string.
        dictionary: term -> (doc freq, postings pointer) map, plus the special
            keys "DOCUMENT_COUNT" and "DOCUMENT_NORM_FACTORS".
        postings_lists: open file handle to the postings file.

    Returns a dict mapping doc_id -> normalised cosine score.
    """
    total_docs = dictionary["DOCUMENT_COUNT"]
    doc_norm_factors = dictionary["DOCUMENT_NORM_FACTORS"]
    scores = {}
    # Sum of squared query-term weights; square-rooted below to give the
    # query-side normalisation factor.
    query_norm_sq = 0
    for term, raw_tf in process_query(query_str).items():
        # Terms absent from the dictionary contribute nothing.
        if term not in dictionary:
            continue
        # Query-side weight: idf * (1 + log tf).
        df = dictionary[term][0]
        idf = math.log10(total_docs / df)
        query_wt = idf * (1 + math.log10(raw_tf))
        query_norm_sq += query_wt ** 2
        # Document-side weight: 1 + log tf, accumulated per document.
        term_pointer = dictionary[term][1]
        for doc_id, doc_tf in get_postings_list(df, term_pointer, postings_lists):
            doc_wt = 1 + math.log10(doc_tf)
            scores[doc_id] = scores.get(doc_id, 0) + query_wt * doc_wt
    return normalise(scores, math.pow(query_norm_sq, 0.5), doc_norm_factors)
# Normalises scores dictionary
def normalise(scores_dict, query_norm_factor, doc_norm_factors):
    """Divide each raw score by its document and query normalisation factors.

    Parameters:
        scores_dict: doc_id -> accumulated raw score (mutated in place).
        query_norm_factor: Euclidean norm of the query weight vector.
        doc_norm_factors: str(doc_id) -> document norm factor.

    Returns the same dict with normalised scores.
    """
    # dict.iteritems() is Python-2-only and raises AttributeError on Python 3;
    # list(...) also makes mutation during iteration explicitly safe.
    for doc_id, score in list(scores_dict.items()):
        norm_factor = doc_norm_factors[str(doc_id)] * query_norm_factor
        scores_dict[doc_id] = score / norm_factor
    return scores_dict
# Converts query string into dictionary form
def process_query(query_str):
    """Tokenise, stem and count the query terms.

    Returns a dict mapping stemmed, lowercased term -> raw count in the query.

    NOTE(review): the term is lowercased AFTER stemming, matching the original
    behaviour — presumably the indexer does the same; verify both sides agree.
    """
    stemmer = PorterStemmer()
    counts = {}
    for token in query_str.split(" "):
        term = stemmer.stem(token).lower()
        counts[term] = counts.get(term, 0) + 1
    return counts
# Seeks, loads and returns a postings list
def get_postings_list(df, term_pointer, postings_lists):
    """Read and decode the postings list for one term.

    Parameters:
        df: document frequency of the term (number of postings to read).
        term_pointer: byte offset of the term's postings in the file.
        postings_lists: open file handle to the postings file.

    Returns a list of (doc_id, term_freq) tuples.
    """
    postings_lists.seek(term_pointer)
    # Each posting occupies bits_per_posting characters; the trailing
    # separator of the last posting is not read (hence the -1).
    raw_tokens = postings_lists.read(df * bits_per_posting - 1).strip().split()
    # On Python 3, map() returns a lazy iterator, so the original
    # len(results_list) call below would raise TypeError; a list
    # comprehension works on both Python 2 and 3. int(x, 2) decodes the
    # binary-string encoding used by the indexer.
    values = [int(token, 2) for token in raw_tokens]
    # Values alternate doc_id, term_freq — pair them up.
    tuples_list = []
    for i in range(0, len(values), 2):
        tuples_list.append((values[i], values[i + 1]))
    return tuples_list
# Converts list of tuples to string for writing to output file
def stringify(results):
    """Return the doc ids of (doc_id, score) tuples as a space-separated string.

    Returns the empty string for an empty result list.
    """
    # The original parameter shadowed the builtin `list` and built the
    # string with quadratic += concatenation; str.join is linear.
    return " ".join(str(element[0]) for element in results)
def usage():
    """Print the command-line usage string to stdout."""
    # print(...) with a single argument is valid in both Python 2 (parenthesised
    # expression) and Python 3 (function call); the original bare `print`
    # statement is a SyntaxError on Python 3.
    print("usage: " + sys.argv[0] + " -d dictionary-file -p postings-file -q file-of-queries -o output-file-of-results")
# --- Command-line entry point: parse -d/-p/-q/-o flags and run the search ---
input_file_d = input_file_p = input_file_q = output_file = None
try:
    opts, args = getopt.getopt(sys.argv[1:], 'd:p:q:o:')
except getopt.GetoptError:
    # "except E, err" is Python-2-only syntax (SyntaxError on Python 3);
    # the bound exception was never used, so it is simply dropped.
    usage()
    sys.exit(2)
for o, a in opts:
    if o == '-d':
        input_file_d = a
    elif o == '-p':
        input_file_p = a
    elif o == '-q':
        input_file_q = a
    elif o == '-o':
        output_file = a
    else:
        # Unreachable while the option string above stays in sync; note that
        # assert is stripped under `python -O`.
        assert False, "unhandled option"
# All four files are mandatory.
if input_file_d is None or input_file_p is None or input_file_q is None or output_file is None:
    usage()
    sys.exit(2)
search_index(input_file_d, input_file_p, input_file_q, output_file)