indexer.cpp
#include"indexer.h"
#include"word_tokenizer.h"
#include"stopword.h"
#include "math.h"
#include<map>
using namespace std;
typedef map<string, map<index_item*, tuple<int, double>>> wordIndexType;
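/*
Illustrative shape of wordIndex (values are hypothetical): after indexing,
wordIndex["cat"][doc] == make_tuple(3, 1.45) would mean "cat" occurs 3 times
in *doc and carries a tf-idf weight of 1.45, as computed by normalize() below.
*/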
const string SW_FILE = "stopwords.txt";
/**
Default Constructor
*/
indexer::indexer()
{}
/**
Destructor that deletes the index_item pointers from the index list
*/
indexer::~indexer()
{
    for (vector<index_item*>::iterator it = indexList.begin();
        it != indexList.end();
        ++it)
    {
        delete *it;
    }
}
/**
Function that returns the number of documents inside the indexer
*/
const int indexer::size() const
{
    return static_cast<int>(indexList.size());
}
/**
Function that computes the tf-idf weight of every (term, document) pair in the index
*/
void indexer::normalize()
{
    double N = static_cast<double>(size()); //Total # of documents
    for (wordIndexType::iterator it = wordIndex.begin();
        it != wordIndex.end();
        ++it)
    {
        double df = static_cast<double>(it->second.size()); //Document frequency
        for (map<index_item*, tuple<int, double>>::iterator itt = it->second.begin();
            itt != it->second.end();
            ++itt)
        {
            double tf = static_cast<double>(get<0>(itt->second)); //Term frequency
            get<1>(itt->second) = (1 + log(tf)) * log(N / df); //Set tf-idf weight in the map
        }
    }
}
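/*
A quick worked example of the weighting above (illustrative numbers; log() is
the natural logarithm): with N = 4 documents, tf = 3 occurrences of a term in
one document, and df = 2,
    weight = (1 + ln 3) * ln(4 / 2) ≈ (1 + 1.099) * 0.693 ≈ 1.45
A term that appears in every document (df == N) gets weight 0.
*/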
/**
Function that takes ownership of an index item, tokenizes its content into the wordIndex map, and recomputes the tf-idf weights
*/
void indexer::operator>>(index_item * item)
{
    indexList.push_back(item);
    word_tokenizer tk(item->content());
    string word;
    while (tk.hasNextToken())
    {
        word = tk.nextToken();
        for (string::size_type i = 0; i != word.length(); ++i)
            word[i] = static_cast<char>(tolower(static_cast<unsigned char>(word[i]))); //Convert characters to lower case
        ++get<0>(wordIndex[word][item]); //Increment tf (a fresh entry is value-initialized to 0)
    }
    normalize(); //Recompute every weight; O(index size) per insertion
}
/**
Function that provides access to the indexed items by position
*/
index_item * indexer::operator[](int i) const
{
    return indexList[i];
}
/**
Function that returns a tf-idf weighted query vector built from a given string @param query
*/
map<string, double> indexer::getNormalizedQuery(string query) const
{
    double N = size(); //# of documents/sentences
    double df;
    word_tokenizer tk(query);
    Stopword sw(SW_FILE);
    map<string, double> termWeight;
    string word;
    while (tk.hasNextToken())
    {
        word = tk.nextToken();
        for (string::size_type i = 0; i != word.length(); ++i)
            word[i] = static_cast<char>(tolower(static_cast<unsigned char>(word[i]))); //Convert characters to lower case
        if (!sw(word) && wordIndex.count(word) == 1) /*Ignore stopwords and words not contained in the index*/
            ++termWeight[word]; //Temporarily store the raw term frequency in the double slot
    }
    for (map<string, double>::iterator it = termWeight.begin();
        it != termWeight.end();
        ++it)
    {
        df = wordIndex.at(it->first).size(); //# of documents/sentences the word appears in
        it->second = (1 + log(it->second)) * log(N / df); //Replace the stored tf with the tf-idf weight
    }
    return termWeight;
}
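/*
Illustrative trace (assuming a whitespace tokenizer and that neither word is a
stopword): for the query "Cat cat dog" with N = 4, the first loop collects raw
counts {"cat": 2, "dog": 1}; if "cat" appears in 2 documents, its final weight
is (1 + ln 2) * ln(4 / 2) ≈ 1.17.
*/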
/**
Function that computes the cosine similarity between the query vector and the indexed items
*/
std::vector<query_result> indexer::cosineSimilarity(std::map<string, double> termWeight) const
{
    vector<query_result> results;
    double num, qwMagnitude, iwMagnitude, score, weight_q, weight_d;
    string word;
    index_item* doc;
    //Score = num / (sqrt(qwMagnitude) * sqrt(iwMagnitude)); note both magnitudes
    //are accumulated over the query terms only, not the full document vector
    for (int i = 0; i != size(); ++i)
    { /*For each document/sentence...*/
        num = 0;
        qwMagnitude = 0;
        iwMagnitude = 0;
        doc = (*this)[i];
        for (map<string, double>::const_iterator it = termWeight.begin();
            it != termWeight.end();
            ++it) /*For each term in the query...*/
        {
            word = it->first;
            weight_q = it->second;
            qwMagnitude += pow(weight_q, 2);
            if (wordIndex.at(word).count(doc) == 0)
                continue; //Term absent from this document
            weight_d = get<1>(wordIndex.at(word).at(doc));
            iwMagnitude += pow(weight_d, 2);
            num += weight_q * weight_d;
        }
        if (qwMagnitude == 0 || iwMagnitude == 0) /*Don't divide by 0*/
            score = 0;
        else
            score = num / sqrt(qwMagnitude) / sqrt(iwMagnitude);
        results.push_back(query_result((*this)[i], score));
    }
    return results;
}
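/*
A minimal usage sketch tying the pieces together. The index_item constructor
shown here is hypothetical (its real interface lives in the headers); only
operator>>, getNormalizedQuery, and cosineSimilarity are taken from this file.

    indexer idx;
    idx >> new index_item("the cat sat"); //hypothetical constructor
    idx >> new index_item("a dog barked");
    map<string, double> q = idx.getNormalizedQuery("cat");
    vector<query_result> hits = idx.cosineSimilarity(q);
    //hits[i] pairs the i-th indexed item with its cosine score

The indexer owns the raw pointers passed in and deletes them in its destructor.
*/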