forked from UnitexGramLab/unitex-core
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAlphabet.h
115 lines (95 loc) · 4.27 KB
/
Alphabet.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
/*
* Unitex
*
* Copyright (C) 2001-2018 Université Paris-Est Marne-la-Vallée <[email protected]>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*
*/
#ifndef AlphabetH
#define AlphabetH
#include "Unicode.h"
#ifndef HAS_UNITEX_NAMESPACE
#define HAS_UNITEX_NAMESPACE 1
#endif
namespace unitex {
/**
 * Description of an alphabet: for each of the 0x10000 possible character
 * values it records a list of related characters (see t_array_collection)
 * and two case flags (see array_case_flags).
 */
struct alphabet_ {
// Historical note: earlier versions used two direct tables, now obsolete:
//   unichar* t[0x10000];  with t['e'] = "E{E+'}", where {E+'} stands for the
//                         unicode character representing the E with accent
//   char t2[0x10000];
// They were replaced by t_array_collection / pos_in_represent_list and
// array_case_flags below.

// NOTE(review): presumably the index of the last used entry and the number of
// allocated entries of t_array_collection -- confirm against the loader code.
int i_last_array_pos_used;
int i_nb_array_pos_allocated;

// Collection of strings, reached indirectly through pos_in_represent_list:
// t_array_collection[pos_in_represent_list['e']] = "E{E+'}" where {E+'} stands
// for the unicode character representing the E with accent.
unichar** t_array_collection;

// For each character value, the position of its entry in t_array_collection.
uint16_t pos_in_represent_list[0x10000];

// Case flags: 2 bits per character, so 4 characters per byte
// (0x10000 characters * 2 bits / 8 bits per byte).
//
// For a char c, the flags live in byte array_case_flags[c >> 2]:
//
// IS_UPPER_MACRO(c,alphabet) != 0
// array_case_flags[(c >> 2)] & (1 << ((c & 3)*2)) != 0 -> c is an uppercase letter
// (for example, c=='E')
//
// IS_LOWER_MACRO(c,alphabet) != 0
// array_case_flags[(c >> 2)] & (1 << (((c & 3)*2)+1)) != 0 -> c is a lowercase letter
// (for example, c=='e')
//
// CASE_FLAG_MACRO(c,alphabet) == 3
// both bits are set -> c is a letter with no case variation (Thai, Chinese, ...)
//
// CASE_FLAG_MACRO(c,alphabet) == 0
// both bits are clear -> c is not a letter
//
// remember : (c >> 2) == (c / 4)
// (c & 3) == (c % 4)
// (x << 1) == (x * 2)
unsigned char array_case_flags[(0x10000*2)/8];

/* This array is only used for Korean alphabets, because it is useful to
 * know for a given Chinese character its Hangul syllable equivalent */
unichar* korean_equivalent_syllable;
};

// Convenience name used by the functions declared in this header.
typedef struct alphabet_ Alphabet;
/*
 * Accessors for the packed case flags stored in array_case_flags.
 * Each character owns a pair of bits, four pairs per byte: the low bit of the
 * pair is the "uppercase" flag, the high bit is the "lowercase" flag.
 *
 * Caution: the macros that take (c,alphabet) expand `c` more than once, so
 * never pass an expression with side effects (e.g. c++).
 */

/* Index of the byte holding the flag pair of c (same as c / 4). */
#define ARRAY_ITEM(c) ((c) >> 2)

/* Bit offset (0, 2, 4 or 6) of the flag pair of c inside its byte. */
#define SHIFT_BIT(c) (((c) & 3) * 2)

/* ORs `value` into the flag pair of c; bits already set are never cleared. */
#define SET_CASE_FLAG_MACRO(c,alphabet,value) \
((alphabet)->array_case_flags[ARRAY_ITEM(c)] |= ((value) << SHIFT_BIT(c)))

/* Both flags of c as a value in 0..3 (bit 0: uppercase, bit 1: lowercase). */
#define CASE_FLAG_MACRO(c,alphabet) \
(((alphabet)->array_case_flags[ARRAY_ITEM(c)] >> SHIFT_BIT(c)) & 3)

/* Non-zero (2) when the lowercase flag of c is set, 0 otherwise. */
#define IS_LOWER_MACRO(c,alphabet) \
(CASE_FLAG_MACRO(c,alphabet) & 2)

/* Non-zero (1) when the uppercase flag of c is set, 0 otherwise. */
#define IS_UPPER_MACRO(c,alphabet) \
(CASE_FLAG_MACRO(c,alphabet) & 1)
/*
 * Public alphabet API.
 * NOTE(review): only the declarations are visible in this header; the short
 * notes below are derived from the names and signatures and should be
 * confirmed against the definitions.
 */

// Loading and releasing. Two overloads of load_alphabet exist; the meaning of
// the extra int parameter is not visible from this header.
Alphabet* load_alphabet(const VersatileEncodingConfig*,const char*);
Alphabet* load_alphabet(const VersatileEncodingConfig*,const char*,int);
int is_abstract_or_persistent_alphabet_filename(const char* filename);
void free_alphabet(Alphabet*);

// Character and string comparisons relative to an alphabet. Which argument
// plays which role (e.g. upper vs. lower) is not shown here -- TODO confirm.
int is_upper_of(unichar,unichar,const Alphabet*);
int is_equal_ignore_case(unichar,unichar,const Alphabet*);
// Overloaded: one variant compares single characters, the other strings.
int is_equal_or_uppercase(unichar,unichar,const Alphabet*);
int is_equal_or_uppercase(const unichar*,const unichar*,const Alphabet*);
int is_equal_or_uppercase_qp(const unichar*,const unichar*,const Alphabet*);

// Single-character classification.
int is_lower(unichar,const Alphabet*);
int is_upper(unichar,const Alphabet*);
int is_letter(unichar,const Alphabet*);

// Whole-string classification.
int is_sequence_of_lowercase_letters(const unichar*,const Alphabet*);
int is_sequence_of_uppercase_letters(const unichar*,const Alphabet*);
int is_sequence_of_letters(const unichar*,const Alphabet*);

// String transformations. Both take a non-const unichar* argument, which is
// presumably modified or filled in place -- verify in the implementation.
void turn_portuguese_sequence_to_lowercase(unichar*);
void replace_letter_by_letter_set(const Alphabet*,unichar*,const unichar*);
// Note: "longuest" is a misspelling of "longest"; the name is part of the
// public interface and is therefore left unchanged.
int get_longuest_prefix_ignoring_case(const unichar*,const unichar*,const Alphabet*);

// Persistent alphabets, identified by name.
int load_persistent_alphabet(const char* name);
void free_persistent_alphabet(const char* name);
} // namespace unitex
#endif