forked from lnthach/SAX-SEQL
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathseql_learn.cpp
1462 lines (1255 loc) · 51.4 KB
/
seql_learn.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* Author: Georgiana Ifrim ([email protected])
* SEQL: Sequence Learner
* This library trains ElasticNet-regularized Logistic Regression and L2-loss (squared-hinge-loss) SVM for Classifying Sequences in the feature space of all possible
* subsequences in the given training set.
* Elastic Net regularizer: alpha * L1 + (1 - alpha) * L2, which combines L1 and L2 penalty effects. L1 influences the sparsity of the model, L2 corrects potentially high
* coeficients resulting due to feature correlation (see Regularization Paths for Generalized Linear Models via Coordinate Descent, by Friedman et al, 2010).
*
* The user can influence the outcome classification model by specifying the following parameters:
* [-o objective] (objective function; choice between logistic regression, squared-hinge-svm and squared error. By default: logistic regression.)
* [-T maxitr] (number of optimization iterations; by default this is set using a convergence threshold on the aggregated change in score predictions.)
* [-l minpat] (constraint on the min length of any feature)
* [-L maxpat] (constraint on the max length of any feature)
* [-m minsup] (constraint on the min support of any feature, i.e. number of sequences containing the feature)
* [-g maxgap] (number of total wildcards allowed in a feature, e.g. a**b, is a feature of size 4 with any 2 characters in the middle)
* [-G maxcongap] (number of consecutive wildcards allowed in a feature, e.g. a**b, is a feature of size 4 with any 2 characters in the middle)
* [-n token_type] (word or character-level token to allow sequences such as 'ab cd ab' and 'abcdab')
* [-C regularizer_value] value of the regularization parameter, the higher value means more regularization
* [-a alpha] (weight on L1 vs L2 regularizer, alpha=0.5 means equal weight for l1 and l2)
* [-r traversal_strategy] (BFS or DFS traversal of the search space), BFS by default
* [-c convergence_threshold] (stopping threshold for optimisation iterations based on change in aggregated score predictions)
* [-v verbosity] (amount of printed detail about the model)
*
* License:
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation.
*
*/
/* The obj fct is: loss(x, y, beta) + C * ElasticNetReg(alpha, beta).
*/
#include <cfloat>
#include <iostream>
#include <cmath>
#include <fstream>
#include <vector>
#include <string>
#include <sstream>
#include <algorithm>
#include <numeric>
#include <functional>
#include <map>
#include <set>
#include <iterator>
#include <cstdlib>
#include <cstring>
#include <unistd.h>
#include "common.h"
#include "sys/time.h"
#include <list>
#include "SNode.h"
#include "seql_learn.h"
using namespace std;
SeqLearner::SeqLearner(){
max_char_difference = 1;
data_read = false;
}
bool SeqLearner::external_read (std::vector<std::string>& data) {
// Set the max line/document size to (10Mb).
// const int kMaxLineSize = 10000000;
// char *line = new char[kMaxLineSize];
char *column[5];
unsigned int num_pos = 0;
unsigned int num_neg = 0;
int line_count = 0;
string doc;
cout << "\n\nread() input data....";
for (int i = 0; i < data.size(); i++){
line_count++;
if (skip_items.find(line_count) != skip_items.end()) continue;
char * line = new char[data[i].size() + 1];
std::copy(data[i].begin(), data[i].end(), line);
line[data[i].size()] = '\0'; // don't forget the terminating 0
if (2 != tokenize (line, "\t ", column, 2)) {
std::cerr << "FATAL: Format Error: " << line << std::endl;
return false;
}
// Prepare class. _y is +1/-1.
double _y = atof (column[0]);
y.push_back (_y);
if (_y > 0) num_pos++;
else num_neg++;
// Prepare doc. Assumes column[1] is the original text, e.g. no bracketing of original doc.
doc.assign(column[1]);
transaction.push_back(doc);
//cout << "\nscanning docid: " << transaction.size() - 1 << " class y:" << _y << " doc:" << doc <<"*";
cout.flush();
}
cout << "\n# positive samples: " << num_pos;
cout << "\n# negative samples: " << num_neg;
cout << "\nend read() input data....";
data_read = true;
return true;
}
bool SeqLearner::external_read (std::vector < std::string >& _transaction, std::vector < double >& _y){
// to reuse seqlearner for another training
transaction.clear();
y.clear();
for (int i = 0; i < _transaction.size();i++){
if (skip_items.find(i+1) != skip_items.end()) continue;
transaction.push_back(_transaction[i]);
y.push_back(_y[i]);
}
data_read = true;
return true;
}
void SeqLearner::add_skips_items(int item){
skip_items.insert(item);
}
// Read the input training documents, "true_class document" per line.
// A line in the training file can be: "+1 a b c"
bool SeqLearner::read (const char *filename) {
//
if (data_read){
return true;
}
// Set the max line/document size to (10Mb).
const int kMaxLineSize = 10000000;
char *line = new char[kMaxLineSize];
char *column[5];
unsigned int num_pos = 0;
unsigned int num_neg = 0;
int line_count = 0;
string doc;
std::istream *ifs = 0;
if (!strcmp(filename, "-")) {
ifs = &std::cin;
} else {
ifs = new std::ifstream(filename);
if (!*ifs) {
std::cerr << "seql_learn" << " " << filename << " No such file or directory" << std::endl;
return false;
}
}
if (! ifs) return false;
cout << "\n\nread() input data....";
while (ifs->getline (line, kMaxLineSize)) {
line_count++;
if (skip_items.find(line_count) != skip_items.end()) continue;
if (line[0] == '\0' || line[0] == ';') continue;
if (line[strlen(line) - 1 ] == '\r')
line[strlen(line) - 1 ] = '\0';
//cout << "\nline size: " << strlen(line);
//cout.flush();
if (2 != tokenize (line, "\t ", column, 2)) {
std::cerr << "FATAL: Format Error: " << line << std::endl;
return false;
}
// Prepare class. _y is +1/-1.
double _y = atof (column[0]);
y.push_back (_y);
if (_y > 0) num_pos++;
else num_neg++;
// Prepare doc. Assumes column[1] is the original text, e.g. no bracketing of original doc.
doc.assign(column[1]);
transaction.push_back(doc);
//cout << "\nscanning docid: " << transaction.size() - 1 << " class y:" << _y << " doc:" << doc <<"*";
cout.flush();
}
cout << "\n# positive samples: " << num_pos;
cout << "\n# negative samples: " << num_neg;
cout << "\nend read() input data....";
delete [] line;
data_read = true;
return true;
}
// For current ngram, compute the gradient value and check prunning conditions.
// Update the current optimal ngram.
bool SeqLearner::can_prune_and_update_rule (rule_t& rule, SNode *space, unsigned int size) {
++total;
// Upper bound for the positive class.
double upos = 0;
// Upper bound for the negative class.
double uneg = 0;
// Gradient value at current ngram.
double gradient = 0;
// Support of current ngram.
unsigned int support = 0;
//string reversed_ngram;
string ngram;
std::vector <int>& loc = space->loc;
std::vector <double>& dist = space->dist;
//variable for y-beta^t*x (Squared loss)
double y_btx=0;
//variable for 1-y*beta^t*x (l2-svm)
double ymbtx=0;
// Compute the gradient and the upper bound on gradient of extensions.
for (unsigned int i = 0; i < loc.size(); ++i) {
if (loc[i] >= 0) continue;
++support;
unsigned int j = (unsigned int)(-loc[i]) - 1;
// calculate xj
double distance = dist[i];
double xj = compute_xi(distance);
// Choose objective function. 0: SLR, 2: l2-SVM 3: Squared loss.
if (objective == 0) { //SLR
// From differentiation we get a - in front of the sum_i_to_N
gradient -= y[j] * exp_fraction[j] * xj; // TODO : xi
if (y[j] > 0) {
upos -= y[j] * exp_fraction[j] * xj; // TODO : xi
} else {
uneg -= y[j] * exp_fraction[j] * xj; // TODO : xi
}
} //end SLR (logistic regression)
else if (objective == 2) { //L2-SVM
ymbtx = 1-y[j]*sum_best_beta[j];
if (ymbtx > 0) {
gradient -= 2 * y[j] * (ymbtx);
if (y[j] > 0) {
upos -= 2 * y[j] * (ymbtx);
} else {
uneg -= 2 * y[j] * (ymbtx);
}
}
} //end l2-SVM
else if (objective == 3) { //Squared loss
y_btx=y[j]-sum_best_beta[j];
gradient -= 2 * (y_btx);
if (y_btx > 0) {
upos -= 2 * (y_btx);
} else {
uneg -= 2 * (y_btx);
}
} //end Squared loss
}
// Assume we can prune until bound sais otherwise.
bool flag_can_prune = 1;
// In case C != 0 we need to update the gradient and check the exact bound
// for non-zero features.
if ( C != 0 ) {
ngram = space->getNgram();
if (verbosity > 3) {
cout << "\ncurrent ngram rule: " << ngram;
cout << "\nlocation size: " << space->loc.size() << "\n";
cout << "\ngradient (before regularizer): " << gradient;
cout << "\nupos (before regularizer): " << upos;
cout << "\nuneg (before regularizer): " << uneg;
cout << "\ntau: " << tau;
}
double current_upos = 0;
double current_uneg = 0;
// Retrieve the beta_ij coeficient of this feature. If beta_ij is non-zero,
// update the gradient: gradient += C * [alpha*sign(beta_j) + (1-alpha)*beta_j];
// Fct lower_bound return an iterator to the key >= given key.
features_it = features_cache.lower_bound(ngram);
// If there are keys starting with this prefix (this ngram starts at pos 0 in existing feature).
if (features_it != features_cache.end() && features_it->first.find(ngram) == 0) {
// If found an exact match for the key.
// add regularizer to gradient.
if (features_it->first.compare(ngram) == 0) {
int sign = abs(features_it->second)/features_it->second;
gradient += C * (alpha * sign + (1-alpha) * features_it->second);
}
if (verbosity > 3) {
cout << "\ngradient after regularizer: " << gradient;
}
// Check if current feature s_j is a prefix of any non-zero features s_j'.
// Check exact bound for every such non-zero feature.
while (features_it != features_cache.end() && features_it->first.find(ngram) == 0) {
int sign = abs(features_it->second)/features_it->second;
current_upos = upos + C * (alpha * sign + (1-alpha) * features_it->second);
current_uneg = uneg + C * (alpha * sign + (1-alpha) * features_it->second);
if (verbosity > 3) {
cout << "\nexisting feature starting with current ngram rule prefix: "
<< features_it->first << ", " << features_it->second << ", sign: " << sign;
cout << "\ncurrent_upos: " << current_upos;
cout << "\ncurrent_uneg: " << current_uneg;
cout << "\ntau: " << tau;
}
// Check bound. If any non-zero feature starting with current ngram as a prefix
// can still qualify for selection in the model,
// we cannot prune the search space.
if (std::max (abs(current_upos), abs(current_uneg)) > tau ) {
flag_can_prune = 0;
break;
}
++features_it;
}
} else {
// If there are no features in the model starting with this prefix, check regular bound.
if (std::max (abs(upos), abs(uneg)) > tau ) {
flag_can_prune = 0;
}
}
} // If C = 0, check regular bound.
else {
if (std::max (abs(upos), abs(uneg)) > tau ) {
flag_can_prune = 0;
}
}
if (support < minsup || flag_can_prune) {
++pruned;
if (verbosity > 3) {
cout << "\n\nminsup || upper bound: pruned!\n";
}
return true;
}
double g = std::abs (gradient);
// If current ngram better than previous best ngram, update optimal ngram.
// Check min length requirement.
if (g > tau && size >= minpat) {
++rewritten;
tau = g;
rule.gradient = gradient;
rule.size = size;
if (C == 0) { // Retrieve the best ngram. If C != 0 this is already done.
ngram = space->getNgram();
} //end C==0.
rule.ngram = ngram;
if (verbosity >= 3) {
cout << "\n\nnew current best ngram rule: " << rule.ngram;
cout << "\ngradient: " << gradient << "\n";
}
rule.loc.clear ();
rule.dist.clear();
for (unsigned int i = 0; i < space->loc.size(); ++i) {
// Keep the doc ids where the best ngram appears.
if (space->loc[i] < 0) {
rule.loc.push_back ((unsigned int)(-space->loc[i]) - 1);
rule.dist.push_back (space->dist[i]);
}
}
}
return false;
}
// Try to grow the ngram to next level, and prune the appropriate extensions.
// The growth is done breadth-first, e.g. grow all unigrams to bi-grams, than all bi-grams to tri-grams, etc.
void SeqLearner::span_bfs (rule_t& rule, SNode *space, std::vector<SNode *>& new_space, unsigned int size) {
std::vector <SNode *>& next = space->next;
// If working with gaps.
// Check if number of consecutive gaps exceeds the max allowed.
if(SNode::hasWildcardConstraints){
if(space->violateWildcardConstraint()) return;
}
if (!(next.size() == 1 && next[0] == 0)) {
// If there are candidates for extension, iterate through them, and try to prune some.
if (! next.empty()) {
if (verbosity > 4)
cout << "\n !next.empty()";
for (auto const &next_space : next) {
// If the last token is a gap, skip checking gradient and pruning bound, since this is the same as for the prev ngram without the gap token.
// E.g., if we checked the gradient and bounds for "a" and didnt prune it, then the gradient and bounds for "a*" will be the same,
// so we can safely skip recomputing the gradient and bounds.
if (next_space->ne.compare("*") == 0) {
if (verbosity > 4)
cout << "\nFeature ends in *, skipping gradient and bound computation.";
new_space.push_back (next_space);
} else if (! can_prune_and_update_rule (rule, next_space, size)) {
new_space.push_back (next_space);
}
}
} else {
// Candidates obtained by extension.
std::map<string, SNode> candidates;
createCandidatesExpansions(space, candidates);
// Keep only doc_ids for occurrences of current ngram.
space->shrink ();
if (candidates.size() == 0){
next.push_back (0);
}else{
next.reserve(candidates.size());
next.clear();
// Prepare the candidate extensions.
for (auto const &currCandidate : candidates) {
auto c = new SNode;
c->loc = currCandidate.second.loc;
c->dist = currCandidate.second.dist;
c->ne = currCandidate.first;
c->prev = space;
c->next.clear ();
// Keep all the extensions of the current feature for future iterations.
// If we need to save memory we could sacrifice this storage.
next.push_back(c);
// If the last token is a gap, skip checking gradient and pruning bound, since this is the same as for ngram without last gap token.
// E.g., if we checked the gradient and bounds for "a" and didnt prune it, then the gradient and bounds for "a*" will be the same,
// so we can safely skip recomputing the gradient and bounds.
if (c->ne.compare("*") == 0) {
if (verbosity > 3)
cout << "\nFeature ends in *, skipping gradient and bound computation. Extending to next bfs level.";
new_space.push_back (c);
} else if (! can_prune_and_update_rule(rule, c, size)) {
new_space.push_back (c);
}
}
}
// Adjust capacity of next vector
std::vector<SNode *>(next).swap (next);
} //end generation of candidates when they weren't stored already.
}
}
void SeqLearner::createCandidatesExpansions(SNode* space, std::map<string, SNode>& candidates){
// Prepare possible extensions.
unsigned int docid = 0;
std::vector <int>& loc = space->loc;
std::vector <double>& dist = space->dist;
// Iterate through the inverted index of the current feature.
for (int i = 0; i < loc.size(); i++)
{
auto const& currLoc = loc[i];
//for (auto const& currLoc : loc) {
// If current Location is negative it indicates a document rather than a possition in a document.
if (currLoc < 0) {
docid = (unsigned int)(-currLoc) - 1;
continue;
}
// current unigram where extension is done
string unigram = space->ne;
if (verbosity > 4) {
cout << "\ncurrent pos and start char: " << currLoc << " " << transaction[docid][currLoc];
cout << "\ncurrent unigram to be extended (space->ne):" << unigram;
}
string next_unigram;
// If not re-initialized, it should fail.
unsigned int pos_start_next_unigram = transaction[docid].size();
if (currLoc + unigram.size() < transaction[docid].size()) { //unigram is not in the end of doc, thus it can be extended.
if (verbosity > 4) {
cout << "\npotential extension...";
}
if (!SNode::tokenType) { // Word level token.
// Find the first space after current unigram position.
unsigned int pos_space = currLoc + unigram.size();
// Skip over consecutive spaces.
while ( (pos_space < transaction[docid].size()) && isspace(transaction[docid][pos_space]) ) {
pos_space++;
}
// Check if doc ends in spaces, rather than a unigram.
if (pos_space == transaction[docid].size()) {
//cout <<"\ndocument with docid" << docid << " ends in (consecutive) space(s)...move to next doc";
//std::exit(-1);
continue;
} else {
pos_start_next_unigram = pos_space; //stoped at first non-space after consec spaces
size_t pos_next_space = transaction[docid].find(' ', pos_start_next_unigram + 1);
// Case1: the next unigram is in the end of the doc, no second space found.
if (pos_next_space == string::npos) {
next_unigram.assign(transaction[docid].substr(pos_start_next_unigram));
} else { //Case2: the next unigram is inside the doc.
next_unigram.assign(transaction[docid].substr(pos_start_next_unigram, pos_next_space - pos_start_next_unigram));
}
}
} else { // Char level token. Skip spaces.
if (!isspace(transaction[docid][currLoc + 1])) {
//cout << "\nnext char is not space";
next_unigram = transaction[docid][currLoc + 1]; //next unigram is next non-space char
pos_start_next_unigram = currLoc + 1;
} else { // If next char is space.
unsigned int pos_space = currLoc + 1;
// Skip consecutive spaces.
while ((pos_space < transaction[docid].size()) && isspace(transaction[docid][pos_space])) {
pos_space++;
}
// Check if doc ends in space, rather than a unigram.
if (pos_space == transaction[docid].size()) {
//cout <<"\ndocument with docid" << docid << " ends in (consecutive) space(s)...move to next doc";
//std::exit(-1);
continue;
}
/* //disallow using char-tokenization for space separated tokens.
else {
pos_start_next_unigram = pos_space;
//cout << "\nnext next char is not space";
next_unigram = transaction[docid][pos_start_next_unigram];
} */
}
} //end char level token
if (next_unigram.empty()) {
//@THACH: just carry on if next unigram is empty
if (verbosity > 4){
cout <<"\nIn expansion for next_unigram: expansion of current unigram " << unigram << " is empty...continue";
}
continue;
}
if (verbosity > 4) {
cout << "\nnext_unigram for extension:" << next_unigram;
//cout << transaction[docid][pos_start_next_unigram];
cout << "\npos: " << pos_start_next_unigram << " " << transaction[docid][pos_start_next_unigram];
}
if (minsup == 1 || single_node_minsup_cache.find (next_unigram) != single_node_minsup_cache.end()) {
// candidates[next_unigram].add (docid, pos_start_next_unigram);
// looks for unigram within the range of allowed distance
for (const string similar_candidate:alphabet){
double new_dist = get_distance(similar_candidate,next_unigram);
double candidate_dist = dist[i] + new_dist;
if(new_dist <= 1.0 && candidate_dist <= max_distance){
candidates[similar_candidate].add (docid, pos_start_next_unigram, candidate_dist);
}
}
}
if (SNode::hasWildcardConstraints) {
// If we allow gaps, we treat a gap as an additional unigram "*".
// Its inverted index will simply be a copy pf the inverted index of the original features.
// Example, for original feature "a", we extend to "a *", where inverted index of "a *" is simply
// a copy of the inverted index of "a", except for positions where "a" is the last char in the doc.
candidates["*"].add (docid, pos_start_next_unigram);
}
} //end generating candidates for the current pos
} //end iteration through inverted index (docids iand pos) to find candidates
}
// Try to grow the ngram to next level, and prune the appropriate extensions.
// The growth is done deapth-first rather than breadth-first, e.g. grow each candidate to its longest unpruned sequence
void SeqLearner::span_dfs (rule_t& rule, SNode *space, unsigned int size) {
std::vector <SNode *>& next = space->next;
// Check if ngram larger than maxsize allowed.
if (size > maxpat) return;
if(SNode::hasWildcardConstraints){
if(space->violateWildcardConstraint()) return;
}
if (!(next.size() == 1 && next[0] == 0)){
// If the extensions are already computed, iterate through them and check pruning condition.
if (! next.empty()) {
if (verbosity >= 3)
cout << "\n !next.empty()";
for (std::vector<SNode*>::iterator it = next.begin(); it != next.end(); ++it) {
if ((*it)->ne.compare("*") == 0) {
if (verbosity > 3)
cout << "\nFeature ends in *, skipping gradient and bound computation. Extending to next dfs level.";
// Expand each candidate DFS wise.
span_dfs(rule, *it, size + 1);
} else if (! can_prune_and_update_rule (rule, *it, size)) {
// Expand each candidate DFS wise.
span_dfs(rule, *it, size + 1);
}
}
} else {
// Candidates obtained by extension.
std::map<string, SNode> candidates;
createCandidatesExpansions(space, candidates);
// Keep only doc_ids for occurrences of current ngram.
space->shrink ();
next.reserve(candidates.size());
next.clear();
// Prepare the candidate extensions.
for (auto const &currCandidate : candidates) {
SNode* c = new SNode;
c->loc = currCandidate.second.loc;
std::vector<int>(c->loc).swap(c->loc);
c->ne = currCandidate.first;
c->prev = space;
c->next.clear ();
// Keep all the extensions of the current feature for future iterations.
// If we need to save memory we could sacrifice this storage.
next.push_back (c);
// If the last token is a gap, skip checking gradient and pruning bound, since this is the same as for ngram without last gap token.
// E.g., if we checked the gradient and bounds for "a" and didnt prune it, then the gradient and bounds for "a*" will be the same,
// so we can safely skip recomputing the gradient and bounds.
if (c->ne.compare("*") == 0) {
if (verbosity >= 3)
cout << "\nFeature ends in *, skipping gradient and bound computation. Extending to next dfs level.";
span_dfs(rule, c, size + 1);
} else // If this doesn't end in *, then check gradient and pruning condition.
if (! can_prune_and_update_rule (rule, c, size)) {
span_dfs(rule, c, size + 1);
}
}
if (next.empty()) {
next.push_back (0);
}
std::vector<SNode *>(next).swap (next);
}
}
}
// Line search method. Search for step size that minimizes loss.
// Compute loss in middle point of range, beta_n1, and
// for mid of both ranges beta_n0, beta_n1 and bet_n1, beta_n2
// Compare the loss for the 3 points, and choose range of 3 points
// which contains the minimum. Repeat until the range spanned by the 3 points is small enough,
// e.g. the range approximates well the vector where the loss function is minimized.
// Return the middle point of the best range.
void SeqLearner::find_best_range(vector<double>& sum_best_beta_n0, vector<double>& sum_best_beta_n1, vector<double>& sum_best_beta_n2,
vector<double>& sum_best_beta_mid_n0_n1, vector<double>& sum_best_beta_mid_n1_n2,
rule_t& rule, vector<double>* sum_best_beta_opt) {
double min_range_size = 1e-3;
double current_range_size = 0;
int current_interpolation_iter = 0;
long double loss_mid_n0_n1 = 0;
long double loss_mid_n1_n2 = 0;
long double loss_n1 = 0;
for (unsigned int i = 0; i < transaction.size(); ++i) {
if (verbosity > 4) {
cout << "\nsum_best_beta_n0[i]: " << sum_best_beta_n0[i];
cout << "\nsum_best_beta_n1[i]: " << sum_best_beta_n1[i];
cout << "\nsum_best_beta_n2[i]: " << sum_best_beta_n2[i];
}
current_range_size += abs(sum_best_beta_n2[i] - sum_best_beta_n0[i]);
}
if (verbosity > 3)
cout << "\ncurrent range size: " << current_range_size;
double beta_coef_n1 = 0;
double beta_coef_mid_n0_n1 = 0;
double beta_coef_mid_n1_n2 = 0;
if (C != 0 && sum_squared_betas != 0) {
features_it = features_cache.find(rule.ngram);
}
// Start interpolation loop.
while (current_range_size > min_range_size) {
if (verbosity > 3)
cout << "\ncurrent interpolation iteration: " << current_interpolation_iter;
for (unsigned int i = 0; i < transaction.size(); ++i) { //loop through training samples
sum_best_beta_mid_n0_n1[i] = (sum_best_beta_n0[i] + sum_best_beta_n1[i]) / 2;
sum_best_beta_mid_n1_n2[i] = (sum_best_beta_n1[i] + sum_best_beta_n2[i]) / 2;
if ( C != 0) {
// THACH: xi
beta_coef_n1 = (sum_best_beta_n1[rule.loc[0]] - sum_best_beta[rule.loc[0]]) / compute_xi(rule.dist[0]);
beta_coef_mid_n0_n1 = (sum_best_beta_mid_n0_n1[rule.loc[0]] - sum_best_beta[rule.loc[0]]) / compute_xi(rule.dist[0]);
beta_coef_mid_n1_n2 = (sum_best_beta_mid_n1_n2[rule.loc[0]] - sum_best_beta[rule.loc[0]]) / compute_xi(rule.dist[0]);
}
if (verbosity > 4) {
cout << "\nsum_best_beta_mid_n0_n1[i]: " << sum_best_beta_mid_n0_n1[i];
cout << "\nsum_best_beta_mid_n1_n2[i]: " << sum_best_beta_mid_n1_n2[i];
}
loss_n1 += computeLossTerm(sum_best_beta_n1[i], y[i]);
loss_mid_n0_n1 += computeLossTerm(sum_best_beta_mid_n0_n1[i], y[i]);
loss_mid_n1_n2 += computeLossTerm(sum_best_beta_mid_n1_n2[i], y[i]);
} //end loop through training samples.
if ( C != 0 ) {
// Add the Elastic Net regularization term.
// If this is the first ngram selected.
if (sum_squared_betas == 0) {
loss_n1 = loss_n1 + C * (alpha * abs(beta_coef_n1) + (1-alpha) * 0.5 * pow(beta_coef_n1, 2));
loss_mid_n0_n1 = loss_mid_n0_n1 + C * (alpha * abs(beta_coef_mid_n0_n1) + (1-alpha) * 0.5 * pow(beta_coef_mid_n0_n1, 2));
loss_mid_n1_n2 = loss_mid_n1_n2 + C * (alpha * abs(beta_coef_mid_n1_n2) + (1-alpha) * 0.5 * pow(beta_coef_mid_n1_n2, 2));
} else {
// If this feature was not selected before.
if (features_it == features_cache.end()) {
loss_n1 = loss_n1 + C * (alpha * (sum_abs_betas + abs(beta_coef_n1)) + (1 - alpha) * 0.5 * (sum_squared_betas + pow(beta_coef_n1, 2)));
loss_mid_n0_n1 = loss_mid_n0_n1 + C * (alpha * (sum_abs_betas + abs(beta_coef_mid_n0_n1)) + (1 - alpha) * 0.5 * (sum_squared_betas + pow(beta_coef_mid_n0_n1, 2)));
loss_mid_n1_n2 = loss_mid_n1_n2 + C * (alpha * (sum_abs_betas + abs(beta_coef_mid_n1_n2)) + (1 - alpha) * 0.5 * (sum_squared_betas + pow(beta_coef_mid_n1_n2, 2)));
} else {
double new_beta_coef_n1 = features_it->second + beta_coef_n1;
double new_beta_coef_mid_n0_n1 = features_it->second + beta_coef_mid_n0_n1;
double new_beta_coef_mid_n1_n2 = features_it->second + beta_coef_mid_n1_n2;
loss_n1 = loss_n1 + C * (alpha * (sum_abs_betas - abs(features_it->second) + abs(new_beta_coef_n1)) + (1 - alpha) * 0.5 * (sum_squared_betas - pow(features_it->second, 2) + pow(new_beta_coef_n1, 2)));
loss_mid_n0_n1 = loss_mid_n0_n1 + C * (alpha * (sum_abs_betas - abs(features_it->second) + abs(new_beta_coef_mid_n0_n1)) + (1 - alpha) * 0.5 * (sum_squared_betas - pow(features_it->second, 2) + pow(new_beta_coef_mid_n0_n1, 2)));
loss_mid_n1_n2 = loss_mid_n1_n2 + C * (alpha * (sum_abs_betas - abs(features_it->second) + abs(new_beta_coef_mid_n1_n2)) + (1 - alpha) * 0.5 * (sum_squared_betas - pow(features_it->second, 2) + pow(new_beta_coef_mid_n1_n2, 2)));
}
}
}// end check C != 0.
// Focus on the range that contains the minimum of the loss function.
// Compare the 3 points beta_n, and mid_beta_n-1_n and mid_beta_n_n+1.
if (loss_n1 <= loss_mid_n0_n1 && loss_n1 <= loss_mid_n1_n2) {
// Min is in beta_n1.
if (verbosity > 4) {
cout << "\nmin is sum_best_beta_n1";
cout << "\nloss_mid_n0_n1: " << loss_mid_n0_n1;
cout << "\nloss_n1: " << loss_n1;
cout << "\nloss_mid_n1_n2: " << loss_mid_n1_n2;
}
// Make the beta_n0 be the beta_mid_n0_n1.
sum_best_beta_n0.assign(sum_best_beta_mid_n0_n1.begin(), sum_best_beta_mid_n0_n1.end());
// Make the beta_n2 be the beta_mid_n1_n2.
sum_best_beta_n2.assign(sum_best_beta_mid_n1_n2.begin(), sum_best_beta_mid_n1_n2.end());
}
else {
if (loss_mid_n0_n1 <= loss_n1 && loss_mid_n0_n1 <= loss_mid_n1_n2) {
// Min is beta_mid_n0_n1.
if (verbosity > 4) {
cout << "\nmin is sum_best_beta_mid_n0_n1";
cout << "\nloss_mid_n0_n1: " << loss_mid_n0_n1;
cout << "\nloss_n1: " << loss_n1;
cout << "\nloss_mid_n1_n2: " << loss_mid_n1_n2;
}
// Make the beta_n2 be the beta_n1.
sum_best_beta_n2.assign(sum_best_beta_n1.begin(), sum_best_beta_n1.end());
// Make the beta_n1 be the beta_mid_n0_n1.
sum_best_beta_n1.assign(sum_best_beta_mid_n0_n1.begin(), sum_best_beta_mid_n0_n1.end());
} else {
// Min is beta_mid_n1_n2.
if (verbosity > 4) {
cout << "\nmin is sum_best_beta_mid_n1_n2";
cout << "\nloss_mid_n0_n1: " << loss_mid_n0_n1;
cout << "\nloss_n1: " << loss_n1;
cout << "\nloss_mid_n1_n2: " << loss_mid_n1_n2;
}
// Make the beta_n0 be the beta_n1.
sum_best_beta_n0.assign(sum_best_beta_n1.begin(), sum_best_beta_n1.end());
// Make the beta_n1 be the beta_mid_n1_n2
sum_best_beta_n1.assign(sum_best_beta_mid_n1_n2.begin(), sum_best_beta_mid_n1_n2.end());
}
}
++current_interpolation_iter;
loss_mid_n0_n1 = 0;
loss_mid_n1_n2 = 0;
loss_n1 = 0;
current_range_size = 0;
for (unsigned int i = 0; i < transaction.size(); ++i) {
if (verbosity > 4) {
cout << "\nsum_best_beta_n0[i]: " << sum_best_beta_n0[i];
cout << "\nsum_best_beta_n1[i]: " << sum_best_beta_n1[i];
cout << "\nsum_best_beta_n2[i]: " << sum_best_beta_n2[i];
}
current_range_size += abs(sum_best_beta_n2[i] - sum_best_beta_n0[i]);
}
if (verbosity > 4) {
cout << "\ncurrent range size: " << current_range_size;
}
} // end while loop.
// Keep the middle point of the best range.
for (unsigned int i = 0; i < transaction.size(); ++i) {
sum_best_beta_opt->push_back(sum_best_beta_n1[i]);
// Trust this step only by a fraction.
//sum_best_beta_opt->push_back(0.5 * sum_best_beta[i] + 0.5 * sum_best_beta_n1[i]);
}
} // end find_best_range().
// Line search method. Binary search for optimal step size. Calls find_best_range(...).
// sum_best_beta keeps track of the scalar product beta_best^t*xi for each doc xi.
// Instead of working with the new weight vector beta_n+1 obtained as beta_n - epsilon * gradient(beta_n)
// we work directly with the scalar product.
// We output the sum_best_beta_opt which contains the scalar poduct of the optimal beta found, by searching for the optimal
// epsilon, e.g. beta_n+1 = beta_n - epsilon_opt * gradient(beta_n)
// epsilon is the starting value
// rule contains info about the gradient at the current iteration
void SeqLearner::binary_line_search(rule_t& rule, vector<double>* sum_best_beta_opt) {
sum_best_beta_opt->clear();
// Starting value for parameter in step size search.
// Set the initial epsilon value small enough to guaranteee
// log-like increases in the first steps.
double exponent = ceil(log10(abs(rule.gradient)));
double epsilon = min(1e-3, pow(10, -exponent));
if (verbosity > 3) {
cout << "\nrule.ngram: " << rule.ngram;
cout << "\nrule.gradient: " << rule.gradient;
cout << "\nexponent of epsilon: " << -exponent;
cout << "\nepsilon: " << epsilon;
}
// Keep track of scalar product at points beta_n-1, beta_n and beta_n+1.
// They are denoted with beta_n0, beta_n1, beta_n2.
// vector<double> sum_best_beta_n0(sum_best_beta.size());
vector<double> sum_best_beta_n0(sum_best_beta);
vector<double> sum_best_beta_n1(sum_best_beta);
vector<double> sum_best_beta_n2(sum_best_beta);
// Keep track of loss at the three points, n0, n1, n2.
long double loss_n0 = 0;
long double loss_n1 = 0;
long double loss_n2 = 0;
// Binary search for epsilon. Similar to bracketing phase in which
// we search for some range with promising epsilon.
// The second stage finds the epsilon or corresponding weight vector with smallest l2-loss value.
// **************************************************************************/
// As long as the l2-loss decreases, double the epsilon.
// Keep track of the last three values of beta, or correspondingly
// the last 3 values for the scalar product of beta and xi.
int n = 0;
if ( C != 0 && sum_squared_betas != 0) {
features_it = features_cache.find(rule.ngram);
}
double beta_coeficient_update = 0;
do {
if (verbosity > 3)
cout << "\nn: " << n;
// For each location (e.g. docid), update the score of the documents containing best rule.
// E.g. update beta^t * xi.
beta_coeficient_update -= pow(2, n * 1.0) * epsilon * rule.gradient;
for (unsigned int i = 0; i < rule.loc.size(); ++i) {
sum_best_beta_n0[rule.loc[i]] = sum_best_beta_n1[rule.loc[i]];
sum_best_beta_n1[rule.loc[i]] = sum_best_beta_n2[rule.loc[i]];
sum_best_beta_n2[rule.loc[i]] = sum_best_beta_n1[rule.loc[i]] - pow(2, n * 1.0) * epsilon * rule.gradient * compute_xi(rule.dist[i]); //THACH: xi
if (verbosity > 4 && i == 0) {
cout << "\nsum_best_beta_n0[rule.loc[i]: " << sum_best_beta_n0[rule.loc[i]];
cout << "\nsum_best_beta_n1[rule.loc[i]: " << sum_best_beta_n1[rule.loc[i]];
cout << "\nsum_best_beta_n2[rule.loc[i]: " << sum_best_beta_n2[rule.loc[i]];
}
}
// Compute loss for all 3 values: n-1, n, n+1
// In the first iteration compute necessary loss.
if (n == 0) {
loss_n0 = loss;
loss_n1 = loss;
} else {
// Update just loss_n2.
// The loss_n0 and loss_n1 are already computed.
loss_n0 = loss_n1;
loss_n1 = loss_n2;
}
loss_n2 = 0;
computeLoss(loss_n2, sum_best_beta_n2);
if (verbosity > 4) {
cout << "\nloss_n2 before adding regularizer: " << loss_n2;
}
// If C != 0, add the L2 regularizer term to the l2-loss.
// If this is the first ngram selected.
if ( C != 0 ) {
if (sum_squared_betas == 0) {
loss_n2 = loss_n2 + C * (alpha * abs(beta_coeficient_update) + (1 - alpha) * 0.5 * pow(beta_coeficient_update, 2));
if (verbosity > 4) {
cout << "\nregularizer: " << C * (alpha * abs(beta_coeficient_update) + (1 - alpha) * 0.5 * pow(beta_coeficient_update, 2));
}
} else {
// If this feature was not selected before.
if (features_it == features_cache.end()) {
loss_n2 = loss_n2 + C * (alpha * (sum_abs_betas + abs(beta_coeficient_update)) + (1 - alpha) * 0.5 * (sum_squared_betas +
pow(beta_coeficient_update, 2)));
} else {
double new_beta_coeficient = features_it->second + beta_coeficient_update;
loss_n2 = loss_n2 + C * (alpha * (sum_abs_betas - abs(features_it->second) + abs(new_beta_coeficient)) + (1 - alpha) * 0.5 * (sum_squared_betas - pow(features_it->second, 2) + pow(new_beta_coeficient, 2)));
}
}
} // end C != 0.
if (verbosity > 4) {
cout << "\nloss_n0: " << loss_n0;
cout << "\nloss_n1: " << loss_n1;
cout << "\nloss_n2: " << loss_n2;
}
++n;
} while (loss_n2 < loss_n1);
// **************************************************************************/
if (verbosity > 3)
cout << "\nFinished doubling epsilon! The monotonicity loss_n+1 < loss_n is broken!";
// Search for the beta in the range beta_n-1, beta_mid_n-1_n, beta_n, beta_mid_n_n+1, beta_n+1
// that minimizes the objective function. It suffices to compare the 3 points beta_mid_n-1_n, beta_n, beta_mid_n_n+1,
// as the min cannot be achieved at the extrem points of the range.
// Take the 3 point range containing the point that achieves minimum loss.
// Repeat until the 3 point range is too small, or a fixed number of iterations is achieved.
// **************************************************************************/
vector<double> sum_best_beta_mid_n0_n1(sum_best_beta.size());
vector<double> sum_best_beta_mid_n1_n2(sum_best_beta.size());
find_best_range(sum_best_beta_n0, sum_best_beta_n1, sum_best_beta_n2,
sum_best_beta_mid_n0_n1, sum_best_beta_mid_n1_n2,
rule, sum_best_beta_opt);
// **************************************************************************/
} // end binary_line)search().
// Searches the space of all subsequences for the ngram with the ngram with the maximal abolute gradient and saves it in rule
SeqLearner::rule_t SeqLearner::findBestNgram(rule_t& rule ,std::vector <SNode*>& old_space, std::vector<SNode*>& new_space, std::map<string, SNode>& seed){
// Reset
tau = 0;
rule.ngram = "";
rule.gradient = 0;
rule.loc.clear ();
rule.dist.clear ();
rule.size = 0;
old_space.clear ();
new_space.clear ();
pruned = total = rewritten = 0;
//cout << "\nIterate through unigrams." << endl;
for (auto &unigram : seed) {
if (!can_prune_and_update_rule (rule, &unigram.second, 1)) {
// Check BFS vs DFS traversal.
if (!traversal_strategy) {
old_space.push_back (&unigram.second);
} else {
// Traversal is DFS.
span_dfs (rule, &unigram.second, 2);
}
}
}
// If BFS traversal.
if (!traversal_strategy) {
//cout << "BFS traversing..." << endl;
// Search for best n-gram. Try to extend in a bfs fashion,
// level per level, e.g., first extend unigrams to bigrams, then bigrams to trigrams, etc.
//*****************************************************/
for (unsigned int size = 2; size <= maxpat; ++size) {
for (unsigned int i = 0; i < old_space.size(); ++i) {
span_bfs (rule, old_space[i], new_space, size);
}
if (new_space.empty()) {
break;
}
old_space = new_space;
new_space.clear ();