
Commit f523c01

Merge pull request #3328 from lonvia/word-count-into-new-table

Recreate word table when refreshing counts

2 parents 33c0f24 + 81eed06

File tree: 10 files changed (+130, -82 lines)


lib-sql/tokenizer/icu_tokenizer_tables.sql (-40)

This file was deleted.

nominatim/clicmd/refresh.py (+1, -1)

@@ -110,7 +110,7 @@ def run(self, args: NominatimArgs) -> int: #pylint: disable=too-many-branches, t
 
         if args.word_counts:
             LOG.warning('Recompute word statistics')
-            self._get_tokenizer(args.config).update_statistics()
+            self._get_tokenizer(args.config).update_statistics(args.config)
 
         if args.address_levels:
             LOG.warning('Updating address levels')

nominatim/clicmd/setup.py (+1, -1)

@@ -169,7 +169,7 @@ def run(self, args: NominatimArgs) -> int: # pylint: disable=too-many-statements
         tokenizer.finalize_import(args.config)
 
         LOG.warning('Recompute word counts')
-        tokenizer.update_statistics()
+        tokenizer.update_statistics(args.config)
 
         webdir = args.project_dir / 'website'
         LOG.warning('Setup website at %s', webdir)

nominatim/db/sql_preprocessor.py (+12)

@@ -90,6 +90,18 @@ def __init__(self, conn: Connection, config: Configuration) -> None:
         self.env.globals['postgres'] = _setup_postgresql_features(conn)
 
 
+    def run_string(self, conn: Connection, template: str, **kwargs: Any) -> None:
+        """ Execute the given SQL template string on the connection.
+            The keyword arguments may supply additional parameters
+            for preprocessing.
+        """
+        sql = self.env.from_string(template).render(**kwargs)
+
+        with conn.cursor() as cur:
+            cur.execute(sql)
+        conn.commit()
+
+
     def run_sql_file(self, conn: Connection, name: str, **kwargs: Any) -> None:
         """ Execute the given SQL file on the connection. The keyword arguments
             may supply additional parameters for preprocessing.
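
The new run_string() mirrors run_sql_file() but takes the template inline, which is what lets the ICU tokenizer below parametrise table names at the call site. A minimal usage sketch, assuming `conn` and `config` are already available as in the callers shown elsewhere in this commit (the index statement is borrowed from _create_lookup_indices below):

    # Sketch only: render a Jinja2 template against a parametrised table
    # name, then execute it. run_string() commits the transaction itself.
    sqlp = SQLPreprocessor(conn, config)
    sqlp.run_string(conn,
                    """CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
                       ON {{table_name}} USING BTREE (word_id)""",
                    table_name='word')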

nominatim/tokenizer/base.py (+1, -1)

@@ -201,7 +201,7 @@ def check_database(self, config: Configuration) -> Optional[str]:
 
 
     @abstractmethod
-    def update_statistics(self) -> None:
+    def update_statistics(self, config: Configuration) -> None:
         """ Recompute any tokenizer statistics necessary for efficient lookup.
            This function is meant to be called from time to time by the user
            to improve performance. However, the tokenizer must not depend on
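
The abstract signature change ripples through every implementation: each tokenizer must now accept the active Configuration, whether or not it uses it. A hedged sketch of a conforming override (class name and body are invented for illustration, not part of this commit):

    from nominatim.config import Configuration
    from nominatim.tokenizer.base import AbstractTokenizer

    class ExampleTokenizer(AbstractTokenizer):   # hypothetical subclass
        # The other abstract methods are omitted; this only sketches
        # the new contract for update_statistics().
        def update_statistics(self, config: Configuration) -> None:
            # The ICU implementation needs the config for its templated
            # SQL (tablespaces, DATABASE_WEBUSER); implementations that
            # do not need it may simply ignore the argument.
            pass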

nominatim/tokenizer/icu_tokenizer.py (+100, -20)

@@ -31,6 +31,11 @@
 
 LOG = logging.getLogger()
 
+WORD_TYPES =(('country_names', 'C'),
+             ('postcodes', 'P'),
+             ('full_word', 'W'),
+             ('housenumbers', 'H'))
+
 def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
     """ Create a new instance of the tokenizer provided by this module.
     """
@@ -62,7 +67,8 @@ def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
 
         if init_db:
             self.update_sql_functions(config)
-            self._init_db_tables(config)
+            self._setup_db_tables(config, 'word')
+            self._create_base_indices(config, 'word')
 
 
     def init_from_project(self, config: Configuration) -> None:
@@ -80,9 +86,7 @@ def finalize_import(self, config: Configuration) -> None:
         """ Do any required postprocessing to make the tokenizer data ready
             for use.
         """
-        with connect(self.dsn) as conn:
-            sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
+        self._create_lookup_indices(config, 'word')
 
 
     def update_sql_functions(self, config: Configuration) -> None:
@@ -100,24 +104,35 @@ def check_database(self, config: Configuration) -> None:
         self.init_from_project(config)
 
 
-    def update_statistics(self) -> None:
+    def update_statistics(self, config: Configuration) -> None:
         """ Recompute frequencies for all name words.
         """
         with connect(self.dsn) as conn:
-            if conn.table_exists('search_name'):
-                with conn.cursor() as cur:
-                    cur.drop_table("word_frequencies")
-                    LOG.info("Computing word frequencies")
-                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
-                                     SELECT unnest(name_vector) as id, count(*)
-                                     FROM search_name GROUP BY id""")
-                    cur.execute("CREATE INDEX ON word_frequencies(id)")
-                    LOG.info("Update word table with recomputed frequencies")
-                    cur.execute("""UPDATE word
-                                   SET info = info || jsonb_build_object('count', count)
-                                   FROM word_frequencies WHERE word_id = id""")
-                    cur.drop_table("word_frequencies")
+            if not conn.table_exists('search_name'):
+                return
+
+            with conn.cursor() as cur:
+                LOG.info('Computing word frequencies')
+                cur.drop_table('word_frequencies')
+                cur.execute("""CREATE TEMP TABLE word_frequencies AS
+                                 SELECT unnest(name_vector) as id, count(*)
+                                 FROM search_name GROUP BY id""")
+                cur.execute('CREATE INDEX ON word_frequencies(id)')
+                LOG.info('Update word table with recomputed frequencies')
+                cur.drop_table('tmp_word')
+                cur.execute("""CREATE TABLE tmp_word AS
+                                SELECT word_id, word_token, type, word,
+                                       (CASE WHEN wf.count is null THEN info
+                                             ELSE info || jsonb_build_object('count', wf.count)
+                                        END) as info
+                                FROM word LEFT JOIN word_frequencies wf
+                                     ON word.word_id = wf.id""")
+                cur.drop_table('word_frequencies')
             conn.commit()
+        self._create_base_indices(config, 'tmp_word')
+        self._create_lookup_indices(config, 'tmp_word')
+        self._move_temporary_word_table('tmp_word')
+
 
 
     def _cleanup_housenumbers(self) -> None:
@@ -219,16 +234,81 @@ def _save_config(self) -> None:
             self.loader.save_config_to_db(conn)
 
 
-    def _init_db_tables(self, config: Configuration) -> None:
+    def _setup_db_tables(self, config: Configuration, table_name: str) -> None:
+        """ Set up the word table and fill it with pre-computed word
+            frequencies.
+        """
+        with connect(self.dsn) as conn:
+            with conn.cursor() as cur:
+                cur.drop_table(table_name)
+            sqlp = SQLPreprocessor(conn, config)
+            sqlp.run_string(conn, """
+                    CREATE TABLE {{table_name}} (
+                          word_id INTEGER,
+                          word_token text NOT NULL,
+                          type text NOT NULL,
+                          word text,
+                          info jsonb
+                        ) {{db.tablespace.search_data}};
+                    GRANT SELECT ON {{table_name}} TO "{{config.DATABASE_WEBUSER}}";
+
+                    DROP SEQUENCE IF EXISTS seq_{{table_name}};
+                    CREATE SEQUENCE seq_{{table_name}} start 1;
+                    GRANT SELECT ON seq_{{table_name}} to "{{config.DATABASE_WEBUSER}}";
+                """, table_name=table_name)
+
+
+    def _create_base_indices(self, config: Configuration, table_name: str) -> None:
         """ Set up the word table and fill it with pre-computed word
             frequencies.
         """
         with connect(self.dsn) as conn:
             sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
+            sqlp.run_string(conn,
+                            """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
+                               USING BTREE (word_token) {{db.tablespace.search_index}}""",
+                            table_name=table_name)
+            for name, ctype in WORD_TYPES:
+                sqlp.run_string(conn,
+                                """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
+                                   USING BTREE (word) {{db.tablespace.address_index}}
+                                   WHERE type = '{{column_type}}'
+                                """,
+                                table_name=table_name, idx_name=name,
+                                column_type=ctype)
+
+
+    def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
+        """ Create additional indexes used when running the API.
+        """
+        with connect(self.dsn) as conn:
+            sqlp = SQLPreprocessor(conn, config)
+            # Index required for details lookup.
+            sqlp.run_string(conn, """
+                CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
+                  ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
+            """,
+            table_name=table_name)
+
+
+    def _move_temporary_word_table(self, old: str) -> None:
+        """ Rename all tables and indexes used by the tokenizer.
+        """
+        with connect(self.dsn) as conn:
+            with conn.cursor() as cur:
+                cur.drop_table('word')
+                cur.execute(f"ALTER TABLE {old} RENAME TO word")
+                for idx in ('word_token', 'word_id'):
+                    cur.execute(f"""ALTER INDEX idx_{old}_{idx}
+                                      RENAME TO idx_word_{idx}""")
+                for name, _ in WORD_TYPES:
+                    cur.execute(f"""ALTER INDEX idx_{old}_{name}
+                                      RENAME TO idx_word_{name}""")
             conn.commit()
 
 
+
+
 class ICUNameAnalyzer(AbstractAnalyzer):
     """ The ICU analyzer uses the ICU library for splitting names.
 
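
The net effect of the rework: instead of patching counts into `word` with one large in-place UPDATE, update_statistics() now writes a complete `tmp_word` copy, builds all indexes on it while the old table keeps serving queries, and only then swaps it in. A condensed, hedged sketch of that build-aside-and-swap pattern outside the tokenizer (psycopg2 and the DSN are assumptions for illustration; `word_frequencies` must already exist on the same connection, as in the code above):

    import psycopg2

    conn = psycopg2.connect('dbname=nominatim')   # assumed DSN
    with conn.cursor() as cur:
        # 1. Materialise the new contents next to the live table; each row
        #    is written exactly once instead of being rewritten by UPDATE,
        #    so no dead tuples accumulate.
        cur.execute("""CREATE TABLE tmp_word AS
                         SELECT word_id, word_token, type, word,
                                (CASE WHEN wf.count is null THEN info
                                      ELSE info || jsonb_build_object('count', wf.count)
                                 END) as info
                         FROM word LEFT JOIN word_frequencies wf
                              ON word.word_id = wf.id""")
        # 2. Index the copy while lookups still hit the old 'word' table.
        cur.execute("""CREATE INDEX idx_tmp_word_word_token
                         ON tmp_word USING BTREE (word_token)""")
        # 3. Only the swap itself takes an exclusive lock, and it is brief.
        cur.execute('DROP TABLE word')
        cur.execute('ALTER TABLE tmp_word RENAME TO word')
        cur.execute('ALTER INDEX idx_tmp_word_word_token RENAME TO idx_word_word_token')
    conn.commit()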

nominatim/tokenizer/legacy_tokenizer.py (+1, -1)

@@ -210,7 +210,7 @@ def migrate_database(self, config: Configuration) -> None:
             self._save_config(conn, config)
 
 
-    def update_statistics(self) -> None:
+    def update_statistics(self, _: Configuration) -> None:
         """ Recompute the frequency of full words.
         """
         with connect(self.dsn) as conn:

test/python/cli/conftest.py (+2, -2)

@@ -38,10 +38,10 @@ def update_sql_functions(self, *args):
     def finalize_import(self, *args):
         self.finalize_import_called = True
 
-    def update_statistics(self):
+    def update_statistics(self, *args):
         self.update_statistics_called = True
 
-    def update_word_tokens(self):
+    def update_word_tokens(self, *args):
         self.update_word_tokens_called = True
 
 

test/python/tokenizer/test_icu.py (+8, -12)

@@ -7,7 +7,6 @@
 """
 Tests for ICU tokenizer.
 """
-import shutil
 import yaml
 import itertools
 
@@ -32,8 +31,6 @@ def test_config(project_env, tmp_path):
     sqldir.mkdir()
     (sqldir / 'tokenizer').mkdir()
     (sqldir / 'tokenizer' / 'icu_tokenizer.sql').write_text("SELECT 'a'")
-    shutil.copy(str(project_env.lib_dir.sql / 'tokenizer' / 'icu_tokenizer_tables.sql'),
-                str(sqldir / 'tokenizer' / 'icu_tokenizer_tables.sql'))
 
     project_env.lib_dir.sql = sqldir
 
@@ -204,16 +201,14 @@ def test_update_sql_functions(db_prop, temp_db_cursor,
 
 def test_finalize_import(tokenizer_factory, temp_db_conn,
                          temp_db_cursor, test_config, sql_preprocessor_cfg):
-    func_file = test_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer_indices.sql'
-    func_file.write_text("""CREATE FUNCTION test() RETURNS TEXT
-                            AS $$ SELECT 'b'::text $$ LANGUAGE SQL""")
-
     tok = tokenizer_factory()
     tok.init_new_db(test_config)
 
+    assert not temp_db_conn.index_exists('idx_word_word_id')
+
     tok.finalize_import(test_config)
 
-    temp_db_cursor.scalar('SELECT test()') == 'b'
+    assert temp_db_conn.index_exists('idx_word_word_id')
 
 
 def test_check_database(test_config, tokenizer_factory,
@@ -224,19 +219,20 @@ def test_check_database(test_config, tokenizer_factory,
     assert tok.check_database(test_config) is None
 
 
-def test_update_statistics_reverse_only(word_table, tokenizer_factory):
+def test_update_statistics_reverse_only(word_table, tokenizer_factory, test_config):
     tok = tokenizer_factory()
-    tok.update_statistics()
+    tok.update_statistics(test_config)
 
 
-def test_update_statistics(word_table, table_factory, temp_db_cursor, tokenizer_factory):
+def test_update_statistics(word_table, table_factory, temp_db_cursor,
+                           tokenizer_factory, test_config):
     word_table.add_full_word(1000, 'hello')
     table_factory('search_name',
                   'place_id BIGINT, name_vector INT[]',
                   [(12, [1000])])
     tok = tokenizer_factory()
 
-    tok.update_statistics()
+    tok.update_statistics(test_config)
 
     assert temp_db_cursor.scalar("""SELECT count(*) FROM word
                                     WHERE type = 'W' and

test/python/tokenizer/test_legacy.py (+4, -4)

@@ -238,19 +238,19 @@ def test_check_database_bad_setup(test_config, tokenizer_factory, monkeypatch,
     assert tok.check_database(False) is not None
 
 
-def test_update_statistics_reverse_only(word_table, tokenizer_factory):
+def test_update_statistics_reverse_only(word_table, tokenizer_factory, test_config):
    tok = tokenizer_factory()
-    tok.update_statistics()
+    tok.update_statistics(test_config)
 
 
-def test_update_statistics(word_table, table_factory, temp_db_cursor, tokenizer_factory):
+def test_update_statistics(word_table, table_factory, temp_db_cursor, tokenizer_factory, test_config):
     word_table.add_full_word(1000, 'hello')
     table_factory('search_name',
                   'place_id BIGINT, name_vector INT[]',
                   [(12, [1000])])
     tok = tokenizer_factory()
 
-    tok.update_statistics()
+    tok.update_statistics(test_config)
 
     assert temp_db_cursor.scalar("""SELECT count(*) FROM word
                                     WHERE word_token like ' %' and
