forked from lnthach/SAX-SEQL
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSNode.cpp
165 lines (145 loc) · 4.19 KB
/
SNode.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
/**
* Author: Severin Gsponer ([email protected])
*
* SNode: represents a node in a searchtree for SEQL
*
* License:
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation.
*
*/
#include "SNode.h"
// Shrink the list of total occurrences to contain just the support doc_ids,
// instead of doc_ids and occurrences.
//
// Negative entries of `loc` are the doc_id markers written by add(); every
// non-negative entry (an occurrence position) is dropped. For each marker that
// is kept, the `dist` entry stored at the same index is kept as well.
//
// `dist` is only filled by the three-argument add(). When it is shorter than
// `loc` (e.g. the node was filled through the two-argument add()), the missing
// distances are skipped instead of being read out of bounds.
//
// Afterwards both vectors are trimmed to their new size and the bookkeeping
// used by add() (last_docid, last_doc_pos) is reset to -1.
void SNode::shrink() {
    std::vector<int> keptLoc;
    std::vector<double> keptDist;

    const std::vector<int>::size_type locCount = loc.size();
    const std::vector<int>::size_type distCount = dist.size();

    for (std::vector<int>::size_type i = 0; i < locCount; ++i) {
        const int entry = loc[i];
        if (entry >= 0) {
            continue;  // plain occurrence position, not a doc_id marker
        }
        keptLoc.push_back(entry);
        if (i < distCount) {
            keptDist.push_back(dist[i]);
        }
    }

    // Build fresh vectors and move them in rather than using erase/remove_if
    // in place: erase/remove would leave the old capacity allocated.
    loc = std::move(keptLoc);
    dist = std::move(keptDist);
    loc.shrink_to_fit();
    dist.shrink_to_fit();

    last_docid = -1;
    last_doc_pos = -1;
}
// Return the support of the current ngram.
// Each negative entry in loc is a doc_id marker, so the support is simply the
// number of negative entries.
unsigned int SNode::support() const {
    unsigned int docCount = 0;
    for (const int entry : loc) {
        if (entry < 0) {
            ++docCount;
        }
    }
    return docCount;
}
// Return a copy of the location list (doc_id markers and positions).
std::vector<int> SNode::getLoc() {
    std::vector<int> snapshot = loc;
    return snapshot;
}
std::string SNode::getNgram() {
std::string ngram = "";
if (!tokenType) { // If word-level token: a bb cd a bb
for (SNode *t = this; t!=nullptr; t = t->prev) {
ngram = " " + t->ne + ngram;
}
// skip the space in front of the ngram
ngram.assign(ngram.substr(1));
} else { // char-level tokens: abbcdabb
for (SNode *t = this; t!=nullptr; t = t->prev) {
ngram = t->ne + ngram;
}
}
return ngram;
}
bool SNode::violateWildcardConstraint() {
int numberOfWildcards = 0;
int numberOfConsecWildcards = 0 ;
for (SNode *t = this; t != nullptr; t = t->prev) {
if (t->ne.compare("*") == 0) {
numberOfWildcards++;
numberOfConsecWildcards++;
if (numberOfWildcards > totalWildcardLimit) {
return true;
}
}else{
if (numberOfConsecWildcards > consecWildcardLimit){
return true;
}
numberOfConsecWildcards = 0;
}
}
return false;
}
// Configure the static wildcard limits. A value of 0 means "not specified".
//
//   total == 0, consec == 0 : constraints are switched off; the stored limits
//                             are left as they are.
//   total == 0, consec != 0 : only the consecutive limit applies; the total
//                             limit is set to the largest int.
//   total != 0              : the total limit is taken as given; the
//                             consecutive limit is taken as given unless it is
//                             0 or larger than the total limit, in which case
//                             it is set equal to the total limit.
void SNode::setupWildcardConstraint(int _totalWildcardLimit,
                                    int _consecWildcardLimit) {
    const bool totalGiven = (_totalWildcardLimit != 0);
    const bool consecGiven = (_consecWildcardLimit != 0);

    if (!totalGiven && !consecGiven) {
        hasWildcardConstraints = false;
        return;
    }

    hasWildcardConstraints = true;

    if (!totalGiven) {
        consecWildcardLimit = _consecWildcardLimit;
        totalWildcardLimit = std::numeric_limits<int>::max();
        return;
    }

    totalWildcardLimit = _totalWildcardLimit;
    const bool useTotalForConsec =
        !consecGiven || _consecWildcardLimit > _totalWildcardLimit;
    consecWildcardLimit =
        useTotalForConsec ? _totalWildcardLimit : _consecWildcardLimit;
}
// Record one occurrence of this ngram: document `docid`, position `pos`.
// The doc_id is encoded inside the location vector itself: whenever `docid`
// differs from the previously added one, the marker -(docid + 1) is pushed
// before the position, so a negative entry always announces a new doc_id.
void SNode::add(unsigned int docid, int pos) {
    const int currentDoc = static_cast<int>(docid);
    const bool startsNewDoc = (last_docid != currentDoc);
    if (startsNewDoc) {
        loc.push_back(-static_cast<int>(docid + 1));
    }
    loc.push_back(pos);
    last_docid = currentDoc;
}
// Record one occurrence of this ngram together with its distance.
//
// `loc` and `dist` are extended in lockstep. When `docid` differs from the
// previously added one, a doc_id marker -(docid + 1) is pushed to `loc`, a
// matching entry is pushed to `dist`, and the marker's index is remembered in
// last_doc_pos. The `dist` entry at that index is then kept at the smallest
// distance added for the document so far.
void SNode::add(unsigned int docid, int pos, double distance) {
    const int currentDoc = static_cast<int>(docid);
    if (last_docid != currentDoc) {
        loc.push_back(-static_cast<int>(docid + 1));
        dist.push_back(distance);
        last_doc_pos = loc.size() - 1;
    }

    loc.push_back(pos);
    dist.push_back(distance);
    last_docid = currentDoc;

    // Take the reference only after the push_backs above, which may reallocate.
    double &docMinimum = dist[last_doc_pos];
    if (distance < docMinimum) {
        docMinimum = distance;
    }
}
// Destructor: the body is intentionally empty.
SNode::~SNode() {}
// Definitions of the static members shared by all SNode objects.

// Token granularity used by getNgram(): false joins tokens with spaces
// (word-level), true concatenates them directly (char-level).
bool SNode::tokenType{true};

// Wildcard settings; setupWildcardConstraint() assigns these.
bool SNode::hasWildcardConstraints{true};
int SNode::totalWildcardLimit{0};
int SNode::consecWildcardLimit{0};