-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathstr2node_string_symbol.cpp
91 lines (86 loc) · 2.12 KB
/
str2node_string_symbol.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#include <cctype>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>
#include "common_string_symbol.h"
/**
 * Split a NUL-terminated C string into tokens and append them to `doc`.
 *
 * Whitespace (as classified by isspace) is never part of a token; it only
 * separates tokens.
 *
 * @param str         Input text. A null pointer is treated as empty input.
 * @param doc         Output sequence; tokens are appended in input order.
 *                    Existing elements are left untouched.
 * @param token_type  0 selects word-level tokens (maximal runs of
 *                    non-whitespace bytes); any other value selects
 *                    char (i.e. byte) level tokens, one per non-whitespace byte.
 */
void str2node (const char *str, std::vector <stx::string_symbol>& doc, int token_type)
{
    // Nothing to tokenize; also keeps strlen away from a null pointer.
    if (str == nullptr) {
        return;
    }

    const std::size_t len = std::strlen (str);
    const bool word_level = (token_type == 0);
    std::string unigram;

    for (std::size_t pos = 0; pos < len; ++pos) {
        const char ch = str[pos];

        // The <cctype> classifiers require a value representable as
        // unsigned char; a plain char is negative for bytes >= 0x80 on
        // platforms where char is signed, which would be undefined behaviour.
        if (std::isspace (static_cast<unsigned char> (ch))) {
            // White space ends the word being collected (if any) and is
            // itself never emitted as a unigram.
            if (word_level && !unigram.empty()) {
                doc.push_back (unigram);
                unigram.clear();
            }
            continue;
        }

        if (word_level) {
            // Word level token: keep accumulating until the next white space.
            unigram += ch;
        } else {
            // Char (i.e. byte) level token: every non-space byte stands alone.
            unigram.assign (1, ch);
            doc.push_back (unigram);
            unigram.clear();
        }
    }

    // Flush the trailing word when the input does not end in white space.
    if (word_level && !unigram.empty()) {
        doc.push_back (unigram);
    }
}
//void str2node (const char *str, std::vector <stx::string_symbol>& doc, int token_type)
//{
// try {
//
// //unsigned int size = 0;
// unsigned int len = strlen (str);
// std::string buf = "";
// //char prev_char;
//
// for (unsigned int i = 0; i < len; i++) {
//// if (i > 0) {
//// prev_char = str[i - 1];
//// } else {
//// prev_char = str[i];
//// }
// if (str[i] == '(' || str[i] == ')') {
// if (! buf.empty()) {
// doc.push_back (buf);
// //std::cout << doc[doc.size() - 1] << " ";
// buf = "";
// //++size;
// }
// }
// else {
// if (str[i] == '\t' || str[i] == ' ') { // do nothing
// // if (str[i] == '\t') { // do nothing
// } else {
// //if (prev_char == ' ') { //do nothing
// //} else {
// buf += str[i];
// }
// }
// }
// //std::cout << "\n";
// //std::cout << "bf size: " << buf.size();
// if (!buf.empty()) {// && !isspace(buf[buf.size() - 1])) {
// throw 2;
// }
//
// return;
// } catch (const int) {
// std::cerr << "Fatal: parse error << [" << str << "]\n";
// std::exit (-1);
// }
//}