# index.py
"""
Index module for IR system
Use the function run to load (or build) index file and print metadata.
"""
from future.utils import iteritems
from retrieve_data import *
import os
import pickle
import nltk
from nltk.corpus import stopwords
import pprint
import sys
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
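
# A minimal usage sketch (assuming retrieve_data supplies loadData(), as the
# test() function below does):
#
#     data = loadData()
#     dataIndex = run(data)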

def _addToIndex(dataIndex, word, field, docId, pos):
    """ Add a word to the index.

    Arguments:
        (inout) dataIndex: the index (dict)
        word: word to add
        field: field where the word was found
        docId: id of the document
        pos: position in the field
    """
    # Fetch (or create) the nested word -> field -> docId entries.
    try:
        wordInfo = dataIndex[word]
    except KeyError:
        wordInfo = {}
    try:
        fieldInfo = wordInfo[field]
    except KeyError:
        fieldInfo = {}
    try:
        docInfo = fieldInfo[docId]
    except KeyError:
        docInfo = []
    # Record the position and write the nested dicts back.
    docInfo.append(pos)
    fieldInfo[docId] = docInfo
    wordInfo[field] = fieldInfo
    dataIndex[word] = wordInfo
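
# Note: the nested try/except above is equivalent to a single chained
# dict.setdefault call (a sketch, same behavior):
#
#     dataIndex.setdefault(word, {}).setdefault(field, {}) \
#              .setdefault(docId, []).append(pos)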

def addToIndex(dataIndex, word, field, docId, pos):
    """ Add a word and its lemma to the index.

    Arguments:
        (inout) dataIndex: the index (dict)
        word: word to add
        field: field where the word was found
        docId: id of the document
        pos: position in the field
    """
    w = word.lower()
    _addToIndex(dataIndex, w, field, docId, pos)
    # Lemmatization: also index the lemma, so inflected forms can match.
    lemma = wordnet_lemmatizer.lemmatize(w)
    if lemma != w:
        _addToIndex(dataIndex, lemma, field, docId, pos)
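
# Illustrative example (hypothetical input): indexing the token "Dogs"
# records the position under both "dogs" and its WordNet lemma "dog", so a
# query for either form can reach the document.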

def index(data):
    """ Build the index from the input data.

    Argument:
        data: list of documents (each a dict of field -> text)
    Returns:
        dict of word -> dict of field -> dict of docId -> list of positions.
    """
    special = list(u',.;:()[]$%^@!*{}+=&<>/"\'')
    dataIndex = {}
    docId = 0
    for line in data:
        for field in line.keys():
            tokens = tokenizeField(line[field])
            pos = 0
            for word in tokens:
                if word not in special:
                    # addToIndex lowercases (and lemmatizes) the word itself.
                    addToIndex(dataIndex, word, field, docId, pos)
                pos += 1
        docId += 1
    return dataIndex
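
# The returned structure nests word -> field -> docId -> positions. For a
# hypothetical document set,
#
#     dataIndex["cat"]["title"][3] == [0, 7]
#
# would mean "cat" occurs at token positions 0 and 7 in the "title" field of
# document 3.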

def printIndexMeta(dataIndex):
    """ Print a summary of the index. """
    print("Number of words indexed: %d." % len(dataIndex))

def countFun(dataIndex, word):
    """ Count the number of occurrences of a word.

    Returns:
        The total count over all fields and documents.
    """
    if word not in dataIndex:
        return 0
    wordInfo = dataIndex[word]
    cnt = 0
    for field, fieldInfo in wordInfo.items():
        for docId, docInfo in fieldInfo.items():
            cnt += len(docInfo)
    return cnt
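
# For the hypothetical index sketched above, countFun(dataIndex, "cat")
# would return 2: one count per recorded position, summed over all fields
# and documents.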

def printIndexDebug(dataIndex):
    """ Print index debug information. """
    # Consider only words longer than two characters.
    words = [key for key in dataIndex.keys() if len(key) > 2]
    histogram = sorted(words, key=lambda w: -countFun(dataIndex, w))
    print("==Histogram==")
    for w in histogram:
        print("%s: %d occurrence(s)." % (w, countFun(dataIndex, w)))

def createOrLoadIndex(data):
    """ Load the index from disk, or build and save it if missing. """
    try:
        print("Load index...")
        dataIndex = loadIndex()
    except IOError:
        print("Build index...")
        dataIndex = index(data)
        with open(indexFile, "wb") as f:
            pickle.dump(dataIndex, f)
    return dataIndex

def loadIndex():
    """ Load the index from the local file. """
    if not os.path.isfile(indexFile):
        raise IOError("Index file does not exist!")
    with open(indexFile, "rb") as f:
        dataIndex = pickle.load(f)
    return dataIndex

def test():
    """ Basic testing of the module. """
    print("Testing Index...")
    deleteFiles([indexFile])
    data = loadData()
    dataIndex = run(data)
    printIndexDebug(dataIndex)

def run(data):
    """ Load (or build) the index, print its metadata, and return it. """
    dataIndex = createOrLoadIndex(data)
    printIndexMeta(dataIndex)
    return dataIndex

def tokenizeField(s):
    """ Tokenize a field, dropping stopwords and punctuation tokens. """
    special = list('@=+-#%&*[]{}()?/"\';.,')
    stop = stopwords.words("english")
    # Pad hyphens so hyphenated compounds split into separate tokens.
    s = s.replace('-', ' - ')
    lst = nltk.word_tokenize(s)
    return [w for w in lst if w not in stop and w not in special]
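
# Example (exact output may vary with the NLTK version and stopword list):
#
#     tokenizeField("a well-known example") -> ["well", "known", "example"]
#
# Note that the stopword check is case-sensitive, so a capitalized "A" would
# survive filtering.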
if __name__ == "__main__":
test()