forked from UnitexGramLab/unitex-core
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBuildTextAutomaton.cpp
1398 lines (1352 loc) · 62.5 KB
/
BuildTextAutomaton.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* Unitex
*
* Copyright (C) 2001-2018 Université Paris-Est Marne-la-Vallée <[email protected]>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*
*/
#include "BuildTextAutomaton.h"
#include "StringParsing.h"
#include "List_ustring.h"
#include "Error.h"
#include "SingleGraph.h"
#include "DELA.h"
#include "NormalizationFst2.h"
#include "BitMasks.h"
#include "Transitions.h"
#include "HashTable.h"
#include "Vector.h"
#include "Tfst.h"
#include "Korean.h"
#include "File.h"
#include "AbstractFst2Load.h"
#include "UnusedParameter.h"
#include "TfstStats.h"
#include "Ustring.h"
#ifndef HAS_UNITEX_NAMESPACE
#define HAS_UNITEX_NAMESPACE 1
#endif
namespace unitex {
/**
* This function returns the number of space tokens that are in the
* given buffer.
*/
int count_non_space_tokens(const int* buffer, int length, int SPACE) {
int n = 0;
for (int i = 0; i < length; i++) {
if (buffer[i] != SPACE) {
n++;
}
}
return n;
}
/**
* This function explores in parallel a text token 'token' and the dictionary tree
* 'tree'. 'n' is the current node in this tree. 'pos' is the current position in
* 'token'. 'inflected' is the inflected form as in the text, i.e. with the same case.
* It's not the same than 'token' because it can be a composed inflected form.
* 'pos_inflected' is the current position in this string. 'state' is the current
* state in the sentence automaton we are building, so, if we find transitions to
* add, we will add them from it. 'shift' represents the number of non space tokens
* in 'inflected' and 'start_node_index' represents the index of the current
* state. We use it to guess the number of the state that must be pointed out
* by transitions. For instance, if the first state is 6 and if we have to add
* a transition tagged with the inflected form "black-eyed", we know that the state
* to reach will be 6+3=9. We also use 'shift' to know if we are dealing with the first
* token of the inflected form. This is useful to determine if the first token is
* an unknown token or not. 'current_token_index' is the position of the current token
* in the token buffer. 'tmp_tags' represents the tags of the current graph.
*/
void explore_dictionary_tree(int pos, const unichar* token, unichar* inflected,
int pos_inflected, const struct string_hash_tree_node* n,
const struct DELA_tree* tree, struct info* INFO,
SingleGraphState state, int shift, int start_state_index,
int *is_not_unknown_token, int first_token_index,
int current_token_index, struct string_hash* tmp_tags, Ustring* foo,
language_t* language, unichar* tag_buffer) {
if (token[pos] == '\0') {
if (shift == 1 && n->value_index != -1) {
/* If we are on the first token and if there are some DELA entries for it,
* then it is not an unknown one */
(*is_not_unknown_token) = 1;
}
struct dela_entry_list* list = NULL;
if (n->value_index != -1)
list = tree->dela_entries[n->value_index];
inflected[pos_inflected] = '\0';
while (list != NULL) {
/* If there are DELA entries for the current inflected form,
* we add the corresponding transitions to the automaton */
struct dela_entry* entry = list->entry;
if (language != NULL) {
entry = filter_dela_entry(list->entry, NULL, language, 0);
}
if (entry != NULL) {
//unichar tag[4096];
unichar* tag = tag_buffer;
build_tag(entry, inflected, tag);
u_sprintf(foo, "@STD\n@%S\n@%d.0.0-%d.%d.0\n.\n", tag,
first_token_index, current_token_index, pos - 1);
add_outgoing_transition(state, get_value_index(foo->str,
tmp_tags), start_state_index + shift);
if (entry != list->entry) {
free_dela_entry(entry);
}
}
list = list->next;
}
/* We try to go on with the next token in the sentence */
if (current_token_index < INFO->length_max - 1) {
/* but only if we are not at the end of the sentence */
if (INFO->buffer[current_token_index] != INFO->SPACE) {
/* The states do not take spaces into acccount, so, if we have read
* a space, we do nothing; otherwise, we can move one state right */
shift++;
}
current_token_index++;
explore_dictionary_tree(0,
INFO->tok->token[INFO->buffer[current_token_index]],
inflected, pos_inflected, n, tree, INFO, state, shift,
start_state_index, is_not_unknown_token, first_token_index,
current_token_index, tmp_tags, foo, language, tag_buffer);
}
return;
}
struct string_hash_tree_transition* trans = n->trans;
inflected[pos_inflected] = token[pos];
while (trans != NULL) {
if (is_equal_or_uppercase(trans->letter, token[pos], INFO->alph)) {
/* For each transition, we follow it if its letter matches the
* token one */
explore_dictionary_tree(pos + 1, token, inflected, pos_inflected
+ 1, trans->node, tree, INFO, state, shift,
start_state_index, is_not_unknown_token, first_token_index,
current_token_index, tmp_tags, foo, language, tag_buffer);
}
trans = trans->next;
}
}
/**
* Returns 1 if the given tag description seems to describe a {xx,xx.xx} tag; 0 otherwise.
*/
int is_high_weight_tag(unichar* s) {
if (u_starts_with(s, "@<E>\n")) {
/* If we have an EPSILON tag, we consider it as a normal tag, in order
* not to penalize the best path heuristic */
return 1;
}
if (u_starts_with(s, "@STD\n")) {
if (s[6] == '{' && s[7] != '\n') {
return 1;
}
return 0;
}
fatal_error("Unsupported tag type in is_high_weight_tag\n");
return 0;
}
/**
* This function explores the given graph, trying to find for each state the path with
* the lowest number of unknown tokens made of letters that leads to it. These
* numbers of tokens are stored in 'weight'.
*/
void compute_best_paths(int state, SingleGraph graph, int* weight,
struct string_hash* tmp_tags) {
int w;
w = weight[state];
Transition* trans = graph->states[state]->outgoing_transitions;
while (trans != NULL) {
unichar* tmp = tmp_tags->value[trans->tag_number];
int w_tmp = w;
if (!is_high_weight_tag(tmp)) {
/* If the transition is tagged by an unknown token made of letters */
w_tmp = w_tmp + 1;
}
if (w_tmp < weight[trans->state_number]) {
/* If we have a better path than the existing one, we keep it */
weight[trans->state_number] = w_tmp;
/* and we mark the destination state as modified */
set_bit_mask(&(graph->states[trans->state_number]->control),
MARK1_BIT_MASK);
}
trans = trans->next;
}
trans = graph->states[state]->outgoing_transitions;
while (trans != NULL) {
if (is_bit_mask_set(graph->states[trans->state_number]->control,
MARK1_BIT_MASK)) {
/* If the state was modified, we reset its control value */
unset_bit_mask(&(graph->states[trans->state_number]->control),
MARK1_BIT_MASK);
/* and we explore it recursively */
compute_best_paths(trans->state_number, graph, weight, tmp_tags);
}
trans = trans->next;
}
}
/**
* Before this function is called, for each state i, weight[i] must have been
* set with the minimal weight of the state #i, that is to say the minimal number
* of transitions tagged with unknown tokens made of letters to follow to reach
* this state. Then, this function takes a list of transitions and removes all
* those that reach a state #x with a weight > weight[x].
*/
Transition* remove_bad_path_transitions(int min_weight, Transition* trans,
SingleGraph graph, int* weight, struct string_hash* tmp_tags) {
if (trans == NULL)
return NULL;
unichar* s = tmp_tags->value[trans->tag_number];
if ((!is_high_weight_tag(s) && weight[trans->state_number] < (min_weight
+ 1)) || (weight[trans->state_number] < min_weight)) {
/* If we have to remove the transition */
Transition* tmp = trans->next;
free(trans);
return remove_bad_path_transitions(min_weight, tmp, graph, weight,
tmp_tags);
}
trans->next = remove_bad_path_transitions(min_weight, trans->next, graph,
weight, tmp_tags);
return trans;
}
/**
* This function applies the following heuristic to remove paths:
* if there are several paths to go from A to B, then we will keep those
* that are tagged with the smallest number of unknown token made of letters.
* For instance, if we have the 2 concurrent paths:
*
* Aujourd ' {hui,huir.V:Kms}
* {Aujourd'hui,aujourd'hui.ADV+z1}
*
* we will remove the first one because it contains 1 unkown token made of letters
* ("Aujourd") while the second contains none.
*/
void keep_best_paths(SingleGraph graph, struct string_hash* tmp_tags) {
int i;
int* weight = (int*) malloc(sizeof(int) * graph->number_of_states);
if (weight == NULL) {
fatal_alloc_error("keep_best_paths");
}
/* We initialize the initial state with 0 untagged transition */
weight[0] = 0;
/* and all other states with a value that is larger that the
* longest possible path. As the automaton is acyclic, the longest
* path cannot be greater than the number of states */
for (i = 1; i < graph->number_of_states; i++) {
weight[i] = graph->number_of_states;
}
compute_best_paths(0, graph, weight, tmp_tags);
for (i = 0; i < graph->number_of_states; i++) {
graph->states[i]->outgoing_transitions = remove_bad_path_transitions(
weight[i], graph->states[i]->outgoing_transitions, graph,
weight, tmp_tags);
}
free(weight);
}
/**
* Allocates, initializes and returns a struct output_info*
*/
struct output_info* new_output_info(unichar* tag) {
if (tag == NULL) {
fatal_error("Invalid NULL tag in new_output_info\n");
}
struct output_info* x = (struct output_info*) malloc(
sizeof(struct output_info));
if (x == NULL) {
fatal_alloc_error("new_output_info");
}
x->output = u_strdup(tag);
if (tag[0] == '{' && tag[1] != '\0') {
struct dela_entry* entry = tokenize_tag_token(tag,1);
if (entry == NULL) {
free_dela_entry(entry);
free(x->output);
free(x);
error("new_output_info: Invalid tag token %S\n", tag);
return NULL;
}
x->content = u_strdup(entry->inflected);
free_dela_entry(entry);
} else {
x->content = u_strdup(tag);
}
x->start_pos = -1;
x->end_pos = -1;
x->start_pos_char = -1;
x->end_pos_char = -1;
/* We can initialize xxx_letter fields to 0, because in non Korean mode, there
* is exactly one letter per character. Korean case will be especially handled */
x->start_pos_letter = 0;
x->end_pos_letter = 0;
return x;
}
/**
* Frees all the memory associated to the given struct output_info*
*/
void free_output_info(struct output_info* x) {
if (x == NULL)
return;
free(x->output);
free(x->content);
free(x);
}
/**
* This function takes an output s like " {de,.PREP} {le,.DET} "
* and returns a vector containing description of the tags that must be produced:
*
* "{de,.PREP}" and "{le,.DET}"
*
* The vector contains struct output_info*
*/
vector_ptr* tokenize_normalization_output(unichar* s, const Alphabet* alph) {
if (s == NULL)
return NULL;
vector_ptr* result = new_vector_ptr(4);
unichar tmp[2048];
int i;
int j;
i = 0;
while (s[i] != '\0') {
while (s[i] == ' ') {
/* We ignore spaces */
i++;
}
if (s[i] != '\0') {
/* If we are not at the end of the string */
if (s[i] == '{') {
/* Case of a tag like "{de,.PREP}" */
j = 0;
while (s[i] != '\0' && s[i] != '}') {
tmp[j++] = s[i++];
}
if (s[i] != '\0') {
/* The end of string is an error (no closing '}'), so we save the
* tag only if it is a valid one */
tmp[j] = '}';
tmp[j + 1] = '\0';
/* We go on the next char */
i++;
struct output_info* foo=new_output_info(tmp);
if (foo!=NULL) vector_ptr_add(result, foo);
}
} else if (is_letter(s[i], alph)) {
/* Case of a letter sequence like "Rivoli" */
j = 0;
while (is_letter(s[i], alph)) {
tmp[j++] = s[i++];
}
tmp[j] = '\0';
/* We don't have to go on the next char, we are already on it */
struct output_info* foo=new_output_info(tmp);
if (foo!=NULL) vector_ptr_add(result, foo);
} else {
/* Case of a single non-space char like "-" */
tmp[0] = s[i];
tmp[1] = '\0';
/* We go on the next char of the string */
i++;
struct output_info* foo=new_output_info(tmp);
if (foo!=NULL) vector_ptr_add(result, foo);
}
}
}
if (result->nbelems == 0) {
free_vector_ptr(result, (void(*)(void*)) free_output_info);
return NULL;
}
return result;
}
/* This value must be negative */
#define UNDEF_KR_TAG_START -6
/**
* This function does its best to compute the start/end values for each tag
* to be produced. It is an adaptation for Korean of 'solve_alignment_puzzle'.
*
* OK, I (S.P.) know that it's bad to duplicate code. But it's even worse to
* fight with inner mysteries of Korean Hangul/Jamo tricks. Trust me.
*/
void solve_alignment_puzzle_for_Korean(vector_ptr* vector, int start, int end,
struct info* INFO, Korean* korean, unichar* output) {
if (vector == NULL) {
fatal_error("NULL vector in solve_alignment_puzzle\n");
}
if (vector->nbelems == 0) {
fatal_error("Empty vector in solve_alignment_puzzle\n");
}
struct output_info** tab = (struct output_info**) vector->tab;
if (vector->nbelems == 1) {
/* If there is only one tag, there is no puzzle to solve */
tab[0]->start_pos = start;
tab[0]->start_pos_char = 0;
tab[0]->end_pos = end;
tab[0]->end_pos_char = u_strlen(INFO->tok->token[INFO->buffer[end]])
- 1;
tab[0]->end_pos_letter = get_length_in_jamo(
INFO->tok->token[INFO->buffer[end]][tab[0]->end_pos_char],
korean) - 1;
return;
}
/* We try to find an exact match (modulo case variations) between the text and the tags' content */
int current_token = start;
int current_tag = 0;
int current_pos_in_char_in_token = 0;
/* Warning: the next 2 variable are not to be confused.
* - current_index_in_jamo_token is the index used to browse the jamo string
* - current_pos_in_letter_in_token is not the same, since it does not take syllable bounds
* into account. Moreover, this one is reinitialized to 0 everytime we cross a syllable
* bound
*/
int current_index_in_jamo_token = 0;
int current_pos_in_letter_in_token = 0;
/* The same for the next 2 variables */
int current_index_in_jamo_tag = 0;
tab[current_tag]->start_pos = current_token;
tab[current_tag]->start_pos_char = 0;
unichar* token = INFO->tok->token[INFO->buffer[current_token]];
unichar jamo_token[256];
unichar jamo_tag[256];
Hanguls_to_Jamos(token, jamo_token, korean, 0);
if (!u_strcmp(tab[current_tag]->content, "<E>")) {
fatal_error(
"solve_alignment_puzzle_for_Korean: {<E>,xxx.yyy} tag is not supposed to start a tag sequence at line:\n%S\n",
output);
}
Hanguls_to_Jamos(tab[current_tag]->content, jamo_tag, korean, 0);
int everything_still_OK = 1;
while (everything_still_OK) {
if (jamo_tag[current_index_in_jamo_tag] == '\0') {
/* If we are at the end of a tag */
tab[current_tag]->end_pos = current_token;
tab[current_tag]->end_pos_char = current_pos_in_char_in_token;
/* letter-1 because we moved on */
tab[current_tag]->end_pos_letter = current_pos_in_letter_in_token
- 1;
if (current_tag == vector->nbelems - 1) {
/* If we are at the end of the last tag, we must also be
* at the end of the last token if we want to have a perfect alignment */
if (current_token == end
&& jamo_token[current_index_in_jamo_token] == '\0') {
return;
}
break;
}
/* We are not at the last tag, so we have to move to the next tag.
* However, we have to deal with the special case of tags like
* {<E>,xx.yy}. For those tags, we don't move in the text token,
* and we have to set end_pos_letter=-1. As we don't know if several
* <E> tags can be combined, we use a loop for safety */
for (;;) {
if (current_tag == vector->nbelems - 1) {
/* We have taken the loop more than once, and we have reached the last tag.
* Then, we must also be at the end of the last token if we want to have a
* perfect alignment */
if (current_token == end
&& jamo_token[current_index_in_jamo_token] == '\0') {
return;
}
/* If not, we fail */
everything_still_OK = 0;
break;
}
current_tag++;
tab[current_tag]->start_pos = current_token;
tab[current_tag]->start_pos_char = UNDEF_KR_TAG_START;//current_pos_in_char_in_token;
tab[current_tag]->start_pos_letter = UNDEF_KR_TAG_START;//current_pos_in_letter_in_token;
current_index_in_jamo_tag = 0;
if (!u_strcmp(tab[current_tag]->content, "<E>")) {
/* If we have a {<E>,xx.yy} tag */
tab[current_tag]->start_pos_char
= current_pos_in_char_in_token;
tab[current_tag]->start_pos_letter
= current_pos_in_letter_in_token - 1;
tab[current_tag]->end_pos = tab[current_tag]->start_pos;
tab[current_tag]->end_pos_char
= tab[current_tag]->start_pos_char;
tab[current_tag]->end_pos_letter = -1;
} else {
/* We have a non <E> tag, so we can set 'jamo_tag' and get out of the for(;;) loop */
Hanguls_to_Jamos(tab[current_tag]->content, jamo_tag,
korean, 0);
break;
}
} /* End of for(;;) loop to move on next tag */
continue;
}
/* We are not at the end of a tag, but we can be at the end of the current token */
if (jamo_token[current_index_in_jamo_token] == '\0') {
if (current_token == end) {
/* If this is the last token, then we cannot have a perfect alignment */
break;
}
current_token++;
if (tab[current_tag]->start_pos_char == UNDEF_KR_TAG_START) {
/* If we change of token whereas we have not started to read the current tag,
* we can say that the current tag starts on the current token */
(tab[current_tag]->start_pos)++;
tab[current_tag]->start_pos_char = 0;
tab[current_tag]->start_pos_letter = 0;
}
if (INFO->buffer[current_token] == INFO->SPACE
&& current_index_in_jamo_tag == 0) {
/* If 1) the new token is a space and 2) we are at the beginning of a tag
* (that, by construction, cannot start with a space), then we must skip this
* space token */
if (current_token == end) {
/* If this space token was the last token, then we cannot have a perfect alignement */
break;
}
/* By construction, we cannot have 2 contiguous spaces, but we raise an error in
* order not to forget that the day this rule will change */
current_token++;
/* See comment above */
(tab[current_tag]->start_pos)++;
tab[current_tag]->start_pos_char = 0;
tab[current_tag]->start_pos_letter = 0;
if (INFO->buffer[current_token] == INFO->SPACE) {
fatal_error(
"Contiguous spaces not handled in solve_alignment_puzzle_for_Korean at line:\n%S\n",
output);
}
}
token = INFO->tok->token[INFO->buffer[current_token]];
current_pos_in_char_in_token = 0;
current_index_in_jamo_token = 0;
current_pos_in_letter_in_token = 0;
Hanguls_to_Jamos(token, jamo_token, korean, 0);
continue;
}
if (jamo_token[current_index_in_jamo_token] == KR_SYLLABLE_BOUND) {
/* If we find a syllable bound in the jamo token, we have to change the current
* character and to update start_pos_letter to 0 */
current_index_in_jamo_token++;
if (jamo_token[current_index_in_jamo_token] == KR_SYLLABLE_BOUND) {
fatal_error(
"Contiguous syllable bounds in jamo token in solve_alignment_puzzle_for_Korean at line:\n%S\n",
output);
}
if (jamo_token[current_index_in_jamo_token] == '\0') {
fatal_error(
"Unexpected end of jamo token in solve_alignment_puzzle_for_Korean at line:\n%S\n",
output);
}
if (current_index_in_jamo_token != 1) {
/* We update the char position only if we find a syllable bound that is not the
* first one */
current_pos_in_char_in_token++;
}
current_pos_in_letter_in_token = 0;
/* No need to 'continue' here; we can deal with the jamo tag first */
/* continue; */
}
if (tab[current_tag]->start_pos_char == UNDEF_KR_TAG_START) {
/* If the start_pos_... of the new jamo tag have not been set yet, we do it now */
tab[current_tag]->start_pos_char = current_pos_in_char_in_token;
tab[current_tag]->start_pos_letter = current_pos_in_letter_in_token;
}
if (jamo_tag[current_index_in_jamo_tag] == KR_SYLLABLE_BOUND) {
current_index_in_jamo_tag++;
if (jamo_tag[current_index_in_jamo_tag] == KR_SYLLABLE_BOUND) {
fatal_error(
"Contiguous syllable bounds in jamo tag in solve_alignment_puzzle_for_Korean at line:\n%S\n",
output);
}
if (jamo_tag[current_index_in_jamo_tag] == '\0') {
fatal_error(
"Unexpected end of jamo tag in solve_alignment_puzzle_for_Korean at line:\n%S\n",
output);
}
}
/* If we arrive here, we have 2 characters to be compared in the token and the tag */
if (jamo_tag[current_index_in_jamo_tag]
!= jamo_token[current_index_in_jamo_token]) {
break;
}
current_index_in_jamo_tag++;
current_index_in_jamo_token++;
current_pos_in_letter_in_token++;
}
/* Default case: all tags will have the same start/end bounds */
for (int i = 0; i < vector->nbelems; i++) {
tab[i]->start_pos = start;
tab[i]->start_pos_char = 0;
tab[i]->end_pos = end;
tab[i]->end_pos_char = u_strlen(INFO->tok->token[INFO->buffer[end]])
- 1;
tab[i]->end_pos_letter = get_length_in_jamo(
INFO->tok->token[INFO->buffer[end]][tab[i]->end_pos_char],
korean) - 1;
}
}
/**
* This function does its best to compute the start/end values for each tag
* to be produced.
*/
void solve_alignment_puzzle(vector_ptr* vector, int start, int end,
struct info* INFO, const Alphabet* alph, Korean* korean,
unichar* output) {
if (korean != NULL) {
/* The Korean case is so special that it seems risky to merge it with the
* normal case that works fine */
solve_alignment_puzzle_for_Korean(vector, start, end, INFO, korean,
output);
return;
}
if (vector == NULL) {
fatal_error("NULL vector in solve_alignment_puzzle\n");
}
if (vector->nbelems == 0) {
fatal_error("Empty vector in solve_alignment_puzzle\n");
}
struct output_info** tab = (struct output_info**) vector->tab;
if (vector->nbelems == 1) {
/* If there is only one tag, there is no puzzle to solve */
tab[0]->start_pos = start;
tab[0]->start_pos_char = 0;
tab[0]->end_pos = end;
tab[0]->end_pos_char = u_strlen(INFO->tok->token[INFO->buffer[end]])
- 1;
/* No need to deal with xxx_letter fields in non Korean mode, since
* default initialization to 0 is OK */
return;
}
/* We try to find an exact match (modulo case variations) between the text and the tags' content */
int current_token = start;
int current_tag = 0;
int current_pos_in_char_in_token = 0;
int current_pos_in_char_in_tag = 0;
tab[current_tag]->start_pos = current_token;
tab[current_tag]->start_pos_char = 0;
unichar* token = INFO->tok->token[INFO->buffer[current_token]];
for (;;) {
if (!u_strcmp(tab[current_tag]->content, "<E>")) {
/* If we have a {<E>,xx.yy} tag */
tab[current_tag]->start_pos_char = current_pos_in_char_in_token;
tab[current_tag]->start_pos_letter = 0;
tab[current_tag]->end_pos = tab[current_tag]->start_pos;
tab[current_tag]->end_pos_char = tab[current_tag]->start_pos_char;
tab[current_tag]->end_pos_letter = -1;
if (current_tag == vector->nbelems - 1) {
/* If we are at the end of the last tag, we must also be
* at the end of the last token if we want to have a perfect alignment */
if (current_token==end && token[current_pos_in_char_in_token]=='\0') {
return;
}
break;
}
current_tag++;
tab[current_tag]->start_pos = current_token;
tab[current_tag]->start_pos_char = current_pos_in_char_in_token;
current_pos_in_char_in_tag = 0;
continue;
}
if (tab[current_tag]->content[current_pos_in_char_in_tag] == '\0') {
/* If we are at the end of a tag */
tab[current_tag]->end_pos = current_token;
tab[current_tag]->end_pos_char = current_pos_in_char_in_token - 1;
if (current_tag == vector->nbelems - 1) {
/* If we are at the end of the last tag, we must also be
* at the end of the last token if we want to have a perfect alignment */
if (current_token == end && token[current_pos_in_char_in_token]
== '\0') {
return;
}
break;
}
/* We are not at the last tag */
current_tag++;
tab[current_tag]->start_pos = current_token;
tab[current_tag]->start_pos_char = current_pos_in_char_in_token;
current_pos_in_char_in_tag = 0;
continue;
}
/* We are not at the end of a tag, but we can be at the end of the current token */
if (token[current_pos_in_char_in_token] == '\0') {
if (current_token == end) {
/* If this is the last token, then we cannot have a perfect alignement */
break;
}
current_token++;
if (current_pos_in_char_in_tag == 0) {
/* If we change of token whereas we have not started to read the current tag,
* we can say that the current tag starts on the current token */
(tab[current_tag]->start_pos)++;
tab[current_tag]->start_pos_char = 0;
}
if (INFO->buffer[current_token] == INFO->SPACE
&& current_pos_in_char_in_tag == 0) {
/* If 1) the new token is a space and 2) we are at the beginning of a tag
* (that, by construction, cannot start with a space), then we must skip this
* space token */
if (current_token == end) {
/* If this space token was the last token, then we cannot have a perfect alignement */
/*debug("current tag=%S/%d current token=%S/%d\n",tab[current_tag]->content,
current_pos_in_char_in_tag,token,current_pos_in_char_in_token);*/
break;
}
/* By construction, we cannot have 2 contiguous spaces, but we raise an error in
* order not to forget that the day this rule will change */
current_token++;
/* See comment above */
(tab[current_tag]->start_pos)++;
tab[current_tag]->start_pos_char = 0;
if (INFO->buffer[current_token] == INFO->SPACE) {
fatal_error(
"Contiguous spaces not handled in solve_alignment_puzzle\n");
}
}
token = INFO->tok->token[INFO->buffer[current_token]];
current_pos_in_char_in_token = 0;
continue;
}
/* We are neither at the end of the tag nor at the end of the token */
if (!is_equal_ignore_case(
tab[current_tag]->content[current_pos_in_char_in_tag],
token[current_pos_in_char_in_token], alph)) {
break;
}
current_pos_in_char_in_tag++;
current_pos_in_char_in_token++;
}
/* Default case: all tags will have the same start/end bounds */
for (int i = 0; i < vector->nbelems; i++) {
tab[i]->start_pos = start;
tab[i]->start_pos_char = 0;
tab[i]->end_pos = end;
tab[i]->end_pos_char = u_strlen(INFO->tok->token[INFO->buffer[end]])
- 1;
}
}
/**
* This function takes an output sequence 's' and adds all the corresponding
* transitions in the given graph. For instance, if we have to add a path
* from the state 4 to 7 corresponding to the sequence "{de,.PREP} {le,.PRO:ms}",
* we will create a new intermediate state, for instance 19, and add two transitions:
*
* 4 -- {de,.PREP} ----> 19
* 19 -- {le,.PRO:ms} --> 7
*/
void add_path_to_sentence_automaton(int start_pos, int end_pos,
int start_state_index, const Alphabet* alph, SingleGraph graph,
struct string_hash* tmp_tags, unichar* s, int destination_state_index,
Ustring* foo, struct info* INFO, Korean* korean) {
//error("start=%d end=%d output=%S\n",start_state_index,destination_state_index,s);
if (!u_strcmp(s, "<E>")) {
/* Special case of the <E> insertion */
add_outgoing_transition(graph->states[start_state_index], 0,
destination_state_index);
}
vector_ptr* vector = tokenize_normalization_output(s, alph);
if (vector == NULL) {
/* If the output to be generated has no interest, we do nothing */
error("Skipping incorrect output <%S>\n", s);
return;
}
solve_alignment_puzzle(vector, start_pos, end_pos, INFO, alph, korean, s);
int current_state = start_state_index;
for (int i = 0; i < vector->nbelems; i++) {
struct output_info* info = (struct output_info*) (vector->tab[i]);
u_sprintf(foo, "@STD\n@%S\n@%d.%d.%d-%d.%d.%d\n.\n", info->output,
info->start_pos, info->start_pos_char, info->start_pos_letter,
info->end_pos, info->end_pos_char, info->end_pos_letter);
if (i == vector->nbelems - 1) {
/* If this is the last transition to create, we make it point to the
* destination state */
add_outgoing_transition(graph->states[current_state],
get_value_index(foo->str, tmp_tags),
destination_state_index);
//error("last transition by %S from %d to %d\n",info->output,current_state,destination_state_index);
} else {
/* If this transition is not the last one, we must create a new state */
add_outgoing_transition(graph->states[current_state],
get_value_index(foo->str, tmp_tags),
graph->number_of_states);
//error("transition by %S from %d to %d\n",info->output,current_state,graph->number_of_states);
current_state = graph->number_of_states;
add_state(graph);
}
}
free_vector_ptr(vector, (void(*)(void*)) free_output_info);
}
/**
* This function explores in parallel the text and the normalization tree.
* If a sequence matches, then we add the transitions corresponding to the
* normalized sequences. For instance, if we have a rule of the form:
*
* j' => {je,.PRO:1s}
*
* we will add a transition by "{je,.PRO:1s}" if we find "j'" in the text. This
* is not the same than the exploration of the dictionary since here we ignore
* the text sequence ("j'" is not taken into account for building the tag).
*
* WARNING: this function MUST NOT BE CALLED when the current token is a space.
*/
void explore_normalization_tree(int first_pos_in_buffer,
int current_pos_in_buffer, int token, struct info* INFO,
SingleGraph graph, struct string_hash* tmp_tags,
struct normalization_tree* norm_tree_node, int first_state_index,
int shift, Ustring* foo, int increment, language_t* language,
Korean* korean) {
struct list_ustring* outputs = norm_tree_node->outputs;
while (outputs != NULL) {
/* If there are outputs, we add paths in the text automaton */
add_path_to_sentence_automaton(first_pos_in_buffer,
current_pos_in_buffer - increment, first_state_index,
INFO->alph, graph, tmp_tags, outputs->string, first_state_index
+ shift - 1, foo, INFO, korean);
outputs = outputs->next;
}
/* Then, we explore the transitions from this node. Note that transitions
* are tagged with token numbers that are unique. So, at most one transition
* can match. */
struct normalization_tree_transition* trans = norm_tree_node->trans;
while (trans != NULL) {
if (trans->token == token) {
/* If we have a transition for the current token */
int increment_loc = 1;
if (INFO->buffer[current_pos_in_buffer + 1] == INFO->SPACE) {
increment_loc++;
}
explore_normalization_tree(first_pos_in_buffer,
current_pos_in_buffer + increment_loc,
INFO->buffer[current_pos_in_buffer + increment_loc], INFO,
graph, tmp_tags, trans->node, first_state_index, shift + 1,
foo, increment_loc, language, korean);
/* As there can be only one matching transition, we exit the while */
trans = NULL;
} else {
trans = trans->next;
}
}
}
/**
* This function builds the sentence automaton that correspond to the
* given token buffer. It saves it into the given file.
*/
void build_sentence_automaton(const int* buffer, int length,
const struct text_tokens* tokens, const struct DELA_tree* DELA_tree,
const Alphabet* alph, U_FILE* out_tfst, U_FILE* out_tind,
int sentence_number, int we_must_clean,
struct normalization_tree* norm_tree, struct match_list* *tag_list,
int current_global_position_in_tokens,
int current_global_position_in_chars, language_t* language,
Korean* korean, struct hash_table* form_frequencies) {
/* We declare the graph that will represent the sentence as well as
* a temporary string_hash 'tmp_tags' that will be used to store the tags of this
* graph. We don't put tags directly in the main 'tags', because a tag can
* be introduced and then removed by cleaning */
Tfst* tfst = new_Tfst(NULL, NULL, 0);
tfst->current_sentence = sentence_number;
tfst->automaton = new_SingleGraph();
tfst->offset_in_tokens = current_global_position_in_tokens;
tfst->offset_in_chars = current_global_position_in_chars;
struct string_hash* tags = new_string_hash(32);
struct string_hash* tmp_tags = new_string_hash(32);
unichar EPSILON_TAG[] = { '@', '<', 'E', '>', '\n', '.', '\n', '\0' };
/* The epsilon tag is always the first one */
get_value_index(EPSILON_TAG, tmp_tags);
get_value_index(EPSILON_TAG, tags);
int i;
/* We add +1 for the final node */
int n_nodes = 1 + count_non_space_tokens(buffer, length, tokens->SPACE);
for (i = 0; i < n_nodes; i++) {
add_state(tfst->automaton);
}
set_initial_state(tfst->automaton->states[0]);
int final_node_index=n_nodes-1;
set_final_state(tfst->automaton->states[n_nodes - 1]);
struct info INFO;
INFO.tok = tokens;
INFO.buffer = buffer;
INFO.alph = alph;
INFO.SPACE = tokens->SPACE;
INFO.length_max = length;
int current_state = 0;
int is_not_unknown_token;
unichar inflected[4096];
/* Temp string used to build tags with bound control */
Ustring* foo = new_Ustring(1);
/* We compute the text and tokens of the sentence */
tfst->tokens = new_vector_int(length);
tfst->token_sizes = new_vector_int(length);
for (int il = 0; il < length; il++) {
vector_int_add(tfst->tokens, buffer[il]);
int l = u_strlen(tokens->token[buffer[il]]);
vector_int_add(tfst->token_sizes, l);
u_strcat(foo, tokens->token[buffer[il]], l);
}
tfst->text = u_strdup(foo->str);
for (i = 0; i < length; i++) {
if (buffer[i] == tokens->SENTENCE_MARKER) {
fatal_error("build_sentence_automaton: unexpected {S} token\n");
}
if (buffer[i] != tokens->SPACE) {
/* We try to produce every transition from the current token */
is_not_unknown_token = 0;
unichar tag_buffer[4096];
explore_dictionary_tree(0, tokens->token[buffer[i]], inflected, 0,
DELA_tree->inflected_forms->root, DELA_tree, &INFO,
tfst->automaton->states[current_state], 1, current_state,
&is_not_unknown_token, i, i, tmp_tags, foo, language,
tag_buffer);
if (norm_tree != NULL) {
/* If there is a normalization tree, we explore it */
explore_normalization_tree(i, i, buffer[i], &INFO,
tfst->automaton, tmp_tags, norm_tree, current_state, 1,
foo, 0, language, korean);
}
if (!is_not_unknown_token) {
/* If the token was not matched in the dictionary, we put it as an unknown one */
u_sprintf(
foo,
"@STD\n@%S\n@%d.0.0-%d.%d.%d\n.\n",
tokens->token[buffer[i]],
i,
i,
tfst->token_sizes->tab[i] - 1,
get_length_in_jamo(
tokens->token[buffer[i]][tfst->token_sizes->tab[i]
- 1], korean) - 1);
int tag_number = get_value_index(foo->str, tmp_tags);
add_outgoing_transition(tfst->automaton->states[current_state],
tag_number, current_state + 1);
}
current_state++;
}
}
/* Now, we insert the tag sequences found in the 'tags.ind' file, if any */
struct match_list* tmp;
while ((*tag_list) != NULL && (*tag_list)->m.start_pos_in_token
>= current_global_position_in_tokens
&& (*tag_list)->m.start_pos_in_token
<= current_global_position_in_tokens + length) {
if ((*tag_list)->m.end_pos_in_token > current_global_position_in_tokens
+ length) {
/* If we have a tag sequence that overlap two sentences, we must ignore it */
tmp = (*tag_list)->next;
free_match_list_element((*tag_list));
(*tag_list) = tmp;
continue;
}
/* We compute the local bounds of the tag sequence */
int start_index = (*tag_list)->m.start_pos_in_token
- current_global_position_in_tokens;
int end_index = (*tag_list)->m.end_pos_in_token
- current_global_position_in_tokens;
int start_pos_in_token = start_index;
int end_pos_in_token = end_index;
/* And we adjust them to our state indexes, because spaces
* must be ignored */
for (int il = start_index; il >= 0; il--) {
if (buffer[il] == tokens->SPACE) {
start_index--;
}
}
for (int il = end_index; il >= 0; il--) {
if (buffer[il] == tokens->SPACE) {
end_index--;
}
}
if (end_index!=final_node_index) {
end_index+=1;
}
add_path_to_sentence_automaton(start_pos_in_token, end_pos_in_token,
start_index, INFO.alph, tfst->automaton, tmp_tags,
(*tag_list)->output, end_index, foo, &INFO, korean);
tmp = (*tag_list)->next;