index.py
#!/usr/bin/python3
import nltk
import sys
import getopt
import os
import pickle
import shutil
import collections
import math

# improvements:
# use temp directory
# standardise index and posting map


def usage():
    print("usage: " +
          sys.argv[0] + " -i directory-of-documents -d dictionary-file -p postings-file")
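
# Example invocation (the directory and file names below are placeholders; build_index
# expects each corpus file to be named with its numeric document ID):
#   python3 index.py -i ./corpus -d dictionary.txt -p postings.txt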


def build_index(in_dir, out_dict, out_postings):
    """
    Build the index from the documents stored in the input directory,
    then write out the dictionary file and the postings file.
    """
    print('indexing...')
    if not os.path.exists(in_dir):
        print("ERROR: input directory does not exist")
        sys.exit(2)
    inFiles = sorted(os.listdir(in_dir), key=lambda x: int(x))

    # Pre-process the documents: tokenise each document into sentences, then words,
    # apply Porter stemming, and write the processed text of each document into a
    # temporary directory holding all the processed documents.
    TMP_DIR = "processed"
    stemmer = nltk.stem.PorterStemmer()
    if not os.path.exists(TMP_DIR):
        os.makedirs(TMP_DIR)
    for file_name in inFiles:
        with open(os.path.join(in_dir, file_name), 'r') as f:
            contents = f.read()
        contents = contents.lower()  # case folding
        sentences = nltk.tokenize.sent_tokenize(contents)
        words = []
        for sentence in sentences:
            words.extend(nltk.tokenize.word_tokenize(sentence))
        stemmed_words = [stemmer.stem(word) for word in words]
        processed_words = ' '.join(stemmed_words)
        # Write the processed contents to the temporary directory
        with open(os.path.join(TMP_DIR, file_name), 'w') as f:
            f.write(processed_words)

    in_dir = TMP_DIR  # switch the input directory to the processed documents folder
    inFiles = sorted(os.listdir(in_dir), key=lambda x: int(x))
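    # For illustration: after this step a sentence such as "The cats are running."
    # is stored as "the cat are run ." (case-folded, tokenised, Porter-stemmed,
    # and re-joined with single spaces).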

    documentLength = collections.defaultdict(lambda: 0)
    index = collections.defaultdict(lambda: [])
    document_term_weight_dict = collections.defaultdict(dict)
    for inFile in inFiles:
        with open(os.path.join(in_dir, inFile), "r", encoding="utf-8") as f:
            termFreq = collections.defaultdict(lambda: 0)
            for line in f:
                words = line.split()
                for word in words:
                    termFreq[word] += 1
                    documentLength[int(inFile)] += 1
            square_val_list = []
            for word in termFreq:
                index[word].append([int(inFile), termFreq[word]])
                document_term_weight = 1 + math.log(termFreq[word], 10)
                document_term_weight_dict[int(inFile)][word] = document_term_weight
                square_val_list.append(document_term_weight ** 2)
            # Calculate the length-normalisation factor for the document
            square_val_list.sort()
            square_sum = sum(square_val_list)
            document_normalization_factor = math.sqrt(square_sum)
            # Divide each term weight in the document by the normalisation factor
            for key in document_term_weight_dict[int(inFile)]:
                document_term_weight_dict[int(inFile)][key] /= document_normalization_factor
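    # Worked example of the weighting above: a term that occurs 3 times in a document
    # gets raw weight 1 + log10(3) ≈ 1.48; dividing every weight in the document by
    # sqrt(sum of squared weights) then gives the document vector unit length.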

    startIdx = 0
    with open(out_postings, "wb") as postingsFile:
        for word in index:
            # pointer is in the form [start index, size of the pickle-serialised postings list in bytes]
            pointer = [startIdx, postingsFile.write(pickle.dumps(index[word]))]
            # the dictionary value is in the form [document frequency, pointer]
            documentFrequency = len(index[word])
            index[word] = [documentFrequency, pointer]
            # print(word, index[word][1][0], index[word][1][1])
            startIdx += index[word][1][1]

    with open(out_dict, "wb") as dictFile:
        pickle.dump(dict(index), dictFile)
with open("docData.txt", "wb") as docData:
pickle.dump(dict(document_term_weight_dict), docData)
# cleanup
shutil.rmtree(TMP_DIR)
print("done")


# # pickle recursion exceeds limit otherwise
# # code taken from https://stackoverflow.com/questions/2134706/hitting-maximum-recursion-depth-using-pickle-cpickle
# def increaseRecursionLimit():
#     max_rec = 0x100000
#     # May segfault without this line. 0x100 is a guess at the size of each stack frame.
#     resource.setrlimit(resource.RLIMIT_STACK, [0x100 * max_rec, resource.RLIM_INFINITY])
#     sys.setrecursionlimit(max_rec)

input_directory = output_file_dictionary = output_file_postings = None

try:
    opts, args = getopt.getopt(sys.argv[1:], 'i:d:p:')
except getopt.GetoptError:
    usage()
    sys.exit(2)

for o, a in opts:
    if o == '-i':  # input directory
        input_directory = a
    elif o == '-d':  # dictionary file
        output_file_dictionary = a
    elif o == '-p':  # postings file
        output_file_postings = a
    else:
        assert False, "unhandled option"

if input_directory is None or output_file_postings is None or output_file_dictionary is None:
    usage()
    sys.exit(2)

# increaseRecursionLimit()
build_index(input_directory, output_file_dictionary, output_file_postings)