-
Notifications
You must be signed in to change notification settings - Fork 0
/
XMLParser.cpp
276 lines (212 loc) · 8.29 KB
/
XMLParser.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
#include "XMLParser.h"
#include "porter2_stemmer.h"
#include "rapidxml/rapidxml.hpp"
#include <sstream>
#include <algorithm>
/*
Constructor for the XML parser: opens the xml file and loads the stop words
from stopwords.txt (looked up relative to the working directory).
The stop word list is read token by token until the "<-1>" sentinel or the
end of the file, whichever comes first. Every stop word is stored in lower case.
isStopWord binary-searches this list, so the words must end up in sorted order.
@param fileName - the name of the xml file to be parsed
*/
XMLParser::XMLParser(string fileName){
    is.open(fileName);
    ifstream getStopWords("stopwords.txt");
    string token;
    //Stop on a failed read as well as on the sentinel: if the file is missing or
    //has no sentinel, the extraction fails and the loop ends instead of pushing
    //the same token forever. The sentinel itself is not stored, because it is not
    //a stop word and would break the sorted order of the list.
    while(getStopWords >> token && token != "<-1>"){
        //tolower is undefined for negative char values, so go through unsigned char
        for(char& c : token)
            c = static_cast<char>(tolower(static_cast<unsigned char>(c)));
        stopWords.push_back(token);
    }
}
/*
Decides whether a token is XML markup or punctuation junk rather than a word.
A token counts as junk as soon as it contains at least one of these characters:
< > = - . ; : ( ) / ] [ , | ' } {
@param token - the string to be checked for garbage
@return - true if the token contains any of the characters above
*/
bool XMLParser::isXMLTag(string token){
    //One scan over the token against the whole set of junk characters
    const char* const junkChars = "<>=-.;:()/][,|'}{";
    const bool containsJunk = token.find_first_of(junkChars) != string::npos;
    return containsJunk;
}
/*
Checks if a given token is a stop word, ignoring capitalization
@param token - the word to compare to all the stop words (taken by value, so the
caller's string is left untouched)
@return - true if the token is a stop word
*/
bool XMLParser::isStopWord(string token){
    //Send the local copy to lower case. tolower is undefined for negative char
    //values (e.g. the bytes of non-ASCII text), so convert through unsigned char.
    for(char& c : token)
        c = static_cast<char>(tolower(static_cast<unsigned char>(c)));
    //Look for the word in the stopwords list using a binary search;
    //this is only valid while stopWords is kept in sorted order
    return binary_search(stopWords.begin(), stopWords.end(), token);
}
/*
Reads a .xml file based on the fileName associated with this XMLParser into an IndexInterface
@param index - the IndexInterface to load the xml data into
@return - true if the method succeeded
bool XMLParser::readFile(IndexInterface* index){
//The first page of the dump is trash, this skips over it
skipIntroPage();
//count the number of files and print it to the screen
int num = 0;
//Write a doc to the index while we're not at the end of a file
while(!is.eof()){
num++;
cout << num << endl;
writeDocToIndex(index);
}
return true;
}*/
/*
Skips one page of the current xmlfile
@return - true if the method succeeded
bool XMLParser::skipIntroPage(){
string token = "";
while(token != "<page>") is >> token;
return true;
}*/
/*
Reads a file called fileName into an IndexInterface using rapidxml
@param fileName - the name of the file to index
@param index - the index to populate with information from the file
*/
bool XMLParser::readFileToIndex(string fileName, IndexInterface* index){
/*
This is a block of setup code for rapidxml so that I can parse through the xml file efficiently
*/
rapidxml::xml_document<> doc;
rapidxml::xml_node<> * root_node;
ifstream theFile (fileName);
vector<char> buffer((istreambuf_iterator<char>(theFile)), istreambuf_iterator<char>());
buffer.push_back('\0');
// Parse the buffer using the xml file parsing library into doc
doc.parse<0>(&buffer[0]);
// Find our root node
root_node = doc.first_node("mediawiki");
//Save the number of documents that have been indexed
int num = 0;
for (rapidxml::xml_node<> * document_node = root_node->first_node("page"); document_node; document_node = document_node->next_sibling()){
//The text element of the current document node
rapidxml::xml_node<> * text;
//Grab the text and document name of the current document
text = document_node->first_node("revision")->first_node("text");
string docName = document_node->first_node("title")->value();
//Ignore documents called user or file. They're garbage
if(docName.substr(0, 4) == "User" || docName.substr(0,4) == "File") continue;
//Find and save the author of the last revision
string docAuthor;
if(document_node->first_node("revision")->first_node("contributor") != nullptr
&& document_node->first_node("revision")->first_node("contributor")->first_node("username") != nullptr)
docAuthor = document_node->first_node("revision")->first_node("contributor")->first_node("username")->value();
else
docAuthor = "No author information given";
//Find and save the timestamp for the last revision
string timestamp;
if(document_node->first_node("revision")->first_node("timestamp") != nullptr)
timestamp = document_node->first_node("revision")->first_node("timestamp")->value();
else
timestamp = "No timestamp given";
string fileName = "SearchDocs/";
string docNameCopy = docName;
replace(docNameCopy.begin(), docNameCopy.end(), '/', '.');
replace(docNameCopy.begin(), docNameCopy.end(), ' ', '_');
fileName = fileName + docNameCopy;
ofstream writeDocFile(fileName);
//Add this document to the index and save it's index in the index number-name registry
int indexOfDoc = index->addDocument(docName, docAuthor, timestamp);
//If we've successfully found a text node for this document
if(text){
//Calculate file name for document file
docNameCopy = docName;
replace(docNameCopy.begin(), docNameCopy.end(), '/', ' ');
replace(docNameCopy.begin(), docNameCopy.end(), ':', '.');
stringstream titleStream(docNameCopy);
while(!titleStream.eof()){
//save the current word and add it to the index for the current document number
string word;
titleStream >> word;
writeDocFile << word << " ";
//Check if the word is a stop word or xml garbage
if(!isStopWord(word) && !isXMLTag(word)){
//If not, remove capitalization, stem it, and index it
word[0] = tolower(word[0]);
Porter2Stemmer::stem(word);
index->addWordForDocument(indexOfDoc, word);
}
}
//Make a stringstream of the text element of the document node
stringstream ss(text->value());
//Parse through the whole stringstream
while(!ss.eof()){
//save the current word and add it to the index for the current document number
string word;
ss >> word;
writeDocFile << word << " ";
//Check if the word is a stop word or xml garbage
if(!isStopWord(word) && !isXMLTag(word)){
//If not, remove capitalization, stem it, and index it
word[0] = tolower(word[0]);
Porter2Stemmer::stem(word);
index->addWordForDocument(indexOfDoc, word);
}
}
}
//close ofstream for this doc
writeDocFile.close();
cout << num++ << docName << endl;
}
return true;
}
/*
Write a single document from the xml file into the given index using hand-rolled, token-based xml parsing
@param index - the IndexInterface object to write to
@return - true if the method ran to completion and filled a document into the index
bool XMLParser::writeDocToIndex(IndexInterface* index){
//Token for tokenizing the entire file
string token = "";
//Move forward past the initial junk
while(token != "<page>" && !is.eof()) is >> token;
//If there's no pages, return false to indicate we haven't loaded anything
if(is.eof()) return false;
//skip a token
is >> token;
//Save the name of the doc
string docName;
getline(is, docName);
//Remove xml crap from the doc name
docName = token + docName;
docName = docName.substr(7, docName.size()-15);
//Skip user and file documents -- they're junk
if(docName.substr(0, 4) == "User" || docName.substr(0,4) == "File") return false;
//Save all the words in the file into this
vector<string> words;
//For each word in the section, stem the word, put it to lower case and copy it into the words arr
while(token != "</page>"){
for(int i = 0; i < token.size(); i++)
token[i] = tolower(token[i]);
//if(isNotXMLTag(token) && !isStopWord(token)){
Porter2Stemmer::stem(token);
words.push_back(token);
//}
is >> token;
}
//Add the document to the index
index->addDocument(docName, words);
return true;
}*/