31
31
32
32
# Module-wide logger; follows the project convention of an unnamed logger.
LOG = logging.getLogger()

# Pairs of (index name suffix, value of the word table's 'type' column).
# Used when creating the partial indexes over the word table and when
# renaming those indexes after a table swap, so both places stay in sync.
WORD_TYPES = (('country_names', 'C'),
              ('postcodes', 'P'),
              ('full_word', 'W'),
              ('housenumbers', 'H'))
34
39
def create (dsn : str , data_dir : Path ) -> 'ICUTokenizer' :
35
40
""" Create a new instance of the tokenizer provided by this module.
36
41
"""
@@ -62,7 +67,8 @@ def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
62
67
63
68
if init_db :
64
69
self .update_sql_functions (config )
65
- self ._init_db_tables (config )
70
+ self ._setup_db_tables (config , 'word' )
71
+ self ._create_base_indices (config , 'word' )
66
72
67
73
68
74
def init_from_project (self , config : Configuration ) -> None :
@@ -80,9 +86,7 @@ def finalize_import(self, config: Configuration) -> None:
80
86
""" Do any required postprocessing to make the tokenizer data ready
81
87
for use.
82
88
"""
83
- with connect (self .dsn ) as conn :
84
- sqlp = SQLPreprocessor (conn , config )
85
- sqlp .run_sql_file (conn , 'tokenizer/legacy_tokenizer_indices.sql' )
89
+ self ._create_lookup_indices (config , 'word' )
86
90
87
91
88
92
def update_sql_functions (self , config : Configuration ) -> None :
@@ -100,24 +104,35 @@ def check_database(self, config: Configuration) -> None:
100
104
self .init_from_project (config )
101
105
102
106
103
    def update_statistics(self, config: Configuration) -> None:
        """ Recompute frequencies for all name words.

            Counts how often each word id occurs in search_name and writes
            the result into the 'count' field of the word table's info
            column.  The updated data is built as a fresh table
            ('tmp_word') which is fully indexed before being swapped in
            for the live word table, so lookups never see a half-updated
            table.
        """
        with connect(self.dsn) as conn:
            # Nothing to count when the search index table is absent.
            if not conn.table_exists('search_name'):
                return

            with conn.cursor() as cur:
                LOG.info('Computing word frequencies')
                cur.drop_table('word_frequencies')
                cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                 SELECT unnest(name_vector) as id, count(*)
                                 FROM search_name GROUP BY id""")
                cur.execute('CREATE INDEX ON word_frequencies(id)')
                LOG.info('Update word table with recomputed frequencies')
                cur.drop_table('tmp_word')
                # Build the replacement table. Words that never occur in
                # search_name keep their info column unchanged (LEFT JOIN
                # with NULL count).
                cur.execute("""CREATE TABLE tmp_word AS
                                SELECT word_id, word_token, type, word,
                                       (CASE WHEN wf.count is null THEN info
                                         ELSE info || jsonb_build_object('count', wf.count)
                                         END) as info
                                FROM word LEFT JOIN word_frequencies wf
                                     ON word.word_id = wf.id""")
                cur.drop_table('word_frequencies')
            conn.commit()
        # Index the new table fully before atomically swapping it in for
        # the old 'word' table.
        self._create_base_indices(config, 'tmp_word')
        self._create_lookup_indices(config, 'tmp_word')
        self._move_temporary_word_table('tmp_word')
121
136
122
137
123
138
def _cleanup_housenumbers (self ) -> None :
@@ -219,16 +234,81 @@ def _save_config(self) -> None:
219
234
self .loader .save_config_to_db (conn )
220
235
221
236
222
- def _init_db_tables (self , config : Configuration ) -> None :
237
+ def _setup_db_tables (self , config : Configuration , table_name : str ) -> None :
238
+ """ Set up the word table and fill it with pre-computed word
239
+ frequencies.
240
+ """
241
+ with connect (self .dsn ) as conn :
242
+ with conn .cursor () as cur :
243
+ cur .drop_table (table_name )
244
+ sqlp = SQLPreprocessor (conn , config )
245
+ sqlp .run_string (conn , """
246
+ CREATE TABLE {{table_name}} (
247
+ word_id INTEGER,
248
+ word_token text NOT NULL,
249
+ type text NOT NULL,
250
+ word text,
251
+ info jsonb
252
+ ) {{db.tablespace.search_data}};
253
+ GRANT SELECT ON {{table_name}} TO "{{config.DATABASE_WEBUSER}}";
254
+
255
+ DROP SEQUENCE IF EXISTS seq_{{table_name}};
256
+ CREATE SEQUENCE seq_{{table_name}} start 1;
257
+ GRANT SELECT ON seq_{{table_name}} to "{{config.DATABASE_WEBUSER}}";
258
+ """ , table_name = table_name )
259
+
260
+
261
    def _create_base_indices(self, config: Configuration, table_name: str) -> None:
        """ Create the indexes needed while loading data into the given
            word table: a btree over word_token plus one partial index on
            word for every entry in WORD_TYPES.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            # Main lookup index over the normalised token.
            sqlp.run_string(conn,
                            """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
                               USING BTREE (word_token) {{db.tablespace.search_index}}""",
                            table_name=table_name)
            # One partial index per word type. The generated index names
            # must match the ones _move_temporary_word_table() renames.
            for name, ctype in WORD_TYPES:
                sqlp.run_string(conn,
                                """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
                                   USING BTREE (word) {{db.tablespace.address_index}}
                                   WHERE type = '{{column_type}}'
                                """,
                                table_name=table_name, idx_name=name,
                                column_type=ctype)
281
+ def _create_lookup_indices (self , config : Configuration , table_name : str ) -> None :
282
+ """ Create addtional indexes used when running the API.
283
+ """
284
+ with connect (self .dsn ) as conn :
285
+ sqlp = SQLPreprocessor (conn , config )
286
+ # Index required for details lookup.
287
+ sqlp .run_string (conn , """
288
+ CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
289
+ ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
290
+ """ ,
291
+ table_name = table_name )
292
+
293
+
294
+ def _move_temporary_word_table (self , old : str ) -> None :
295
+ """ Rename all tables and indexes used by the tokenizer.
296
+ """
297
+ with connect (self .dsn ) as conn :
298
+ with conn .cursor () as cur :
299
+ cur .drop_table ('word' )
300
+ cur .execute (f"ALTER TABLE { old } RENAME TO word" )
301
+ for idx in ('word_token' , 'word_id' ):
302
+ cur .execute (f"""ALTER INDEX idx_{ old } _{ idx }
303
+ RENAME TO idx_word_{ idx } """ )
304
+ for name , _ in WORD_TYPES :
305
+ cur .execute (f"""ALTER INDEX idx_{ old } _{ name }
306
+ RENAME TO idx_word_{ name } """ )
229
307
conn .commit ()
230
308
231
309
310
+
311
+
232
312
class ICUNameAnalyzer (AbstractAnalyzer ):
233
313
""" The ICU analyzer uses the ICU library for splitting names.
234
314
0 commit comments