
Commit ad95ff1

Merge pull request #3424 from lonvia/importance-csc-import
Implement a lighter format for wikipedia importance tables
2 parents 5c70739 + 90eea6b commit ad95ff1

10 files changed: +240 -46 lines

lib-sql/functions/importance.sql

+59 -3

@@ -20,6 +20,54 @@ CREATE TYPE place_importance as (
   wikipedia TEXT
 );
 
+{% if 'wikimedia_importance' in db.tables %}
+
+CREATE OR REPLACE FUNCTION get_wikipedia_match(extratags HSTORE, country_code varchar(2))
+  RETURNS wikipedia_article_match
+  AS $$
+DECLARE
+  i INT;
+  wiki_article_title TEXT;
+  wiki_article_language TEXT;
+  result wikipedia_article_match;
+  entry RECORD;
+BEGIN
+  IF extratags ? 'wikipedia' and strpos(extratags->'wikipedia', ':') IN (3,4) THEN
+    wiki_article_language := lower(trim(split_part(extratags->'wikipedia', ':', 1)));
+    wiki_article_title := trim(substr(extratags->'wikipedia',
+                                      strpos(extratags->'wikipedia', ':') + 1));
+
+    FOR result IN
+      SELECT language, title, importance FROM wikimedia_importance
+        WHERE language = wiki_article_language
+              and title = replace(wiki_article_title, ' ', '_')
+    LOOP
+      RETURN result;
+    END LOOP;
+  END IF;
+
+  FOREACH wiki_article_language IN ARRAY ARRAY['ar','bg','ca','cs','da','de','en','es','eo','eu','fa','fr','ko','hi','hr','id','it','he','lt','hu','ms','nl','ja','no','pl','pt','kk','ro','ru','sk','sl','sr','fi','sv','tr','uk','vi','vo','war','zh']
+  LOOP
+    IF extratags ? ('wikipedia:' || wiki_article_language) THEN
+      wiki_article_title := extratags->('wikipedia:' || wiki_article_language);
+
+      FOR result IN
+        SELECT language, title, importance FROM wikimedia_importance
+          WHERE language = wiki_article_language
+                and title = replace(wiki_article_title, ' ', '_')
+      LOOP
+        RETURN result;
+      END LOOP;
+    END IF;
+
+  END LOOP;
+
+  RETURN NULL;
+END;
+$$
+LANGUAGE plpgsql IMMUTABLE;
+
+{% else %}
 
 -- See: http://stackoverflow.com/questions/6410088/how-can-i-mimic-the-php-urldecode-function-in-postgresql
 CREATE OR REPLACE FUNCTION decode_url_part(p varchar)
@@ -93,6 +141,7 @@ END;
 $$
 LANGUAGE plpgsql STABLE;
 
+{% endif %}
 
 CREATE OR REPLACE FUNCTION compute_importance(extratags HSTORE,
                                               country_code varchar(2),
@@ -118,9 +167,16 @@ BEGIN
 
   -- Nothing? Then try with the wikidata tag.
  IF result.importance is null AND extratags ? 'wikidata' THEN
-    FOR match IN SELECT * FROM wikipedia_article
-                   WHERE wd_page_title = extratags->'wikidata'
-                 ORDER BY language = 'en' DESC, langcount DESC LIMIT 1
+    FOR match IN
+      {% if 'wikimedia_importance' in db.tables %}
+        SELECT * FROM wikimedia_importance
+          WHERE wikidata = extratags->'wikidata'
+          LIMIT 1
+      {% else %}
+        SELECT * FROM wikipedia_article
+          WHERE wd_page_title = extratags->'wikidata'
+          ORDER BY language = 'en' DESC, langcount DESC LIMIT 1
+      {% endif %}
    LOOP
      result.importance := match.importance;
      result.wikipedia := match.language || ':' || match.title;
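The first branch of get_wikipedia_match splits a wikipedia=lang:Title tag at the first colon, normalises case and whitespace, and converts spaces to underscores before looking the article up in wikimedia_importance. As a reading aid, here is a hypothetical Python re-implementation of just that parsing step (the function name is invented; the authoritative logic is the PL/pgSQL above):

# Sketch of the tag parsing in get_wikipedia_match(); illustration only.
# SQL strpos() is 1-based, so strpos(...) IN (3,4) means the colon sits
# after a 2- or 3-letter language prefix ('en:...', 'war:...').
def parse_wikipedia_tag(value: str):
    pos = value.find(':')                  # 0-based, hence (2, 3) below
    if pos not in (2, 3):
        return None
    language = value[:pos].strip().lower()
    # Titles are stored with underscores in wikimedia_importance.
    title = value[pos + 1:].strip().replace(' ', '_')
    return language, title

assert parse_wikipedia_tag('en:Berlin') == ('en', 'Berlin')
assert parse_wikipedia_tag('de:Frankfurt am Main') == ('de', 'Frankfurt_am_Main')
assert parse_wikipedia_tag('not a wiki tag') is None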

lib-sql/tables.sql

+9 -22

@@ -273,28 +273,15 @@ GRANT SELECT ON import_polygon_delete TO "{{config.DATABASE_WEBUSER}}";
 DROP SEQUENCE IF EXISTS file;
 CREATE SEQUENCE file start 1;
 
--- null table so it won't error
--- deliberately no drop - importing the table is expensive and static, if it is already there better to avoid removing it
-CREATE TABLE IF NOT EXISTS wikipedia_article (
-    language text NOT NULL,
-    title text NOT NULL,
-    langcount integer,
-    othercount integer,
-    totalcount integer,
-    lat double precision,
-    lon double precision,
-    importance double precision,
-    osm_type character(1),
-    osm_id bigint,
-    wd_page_title text,
-    instance_of text
-);
-
-CREATE TABLE IF NOT EXISTS wikipedia_redirect (
-    language text,
-    from_title text,
-    to_title text
-);
+{% if 'wikimedia_importance' not in db.tables and 'wikipedia_article' not in db.tables %}
+-- create dummy tables here, if nothing was imported
+CREATE TABLE wikimedia_importance (
+    language TEXT NOT NULL,
+    title TEXT NOT NULL,
+    importance double precision NOT NULL,
+    wikidata TEXT
+) {{db.tablespace.address_data}};
+{% endif %}
 
 -- osm2pgsql does not create indexes on the middle tables for Nominatim
 -- Add one for lookup of associated street relations.
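The {% if %} guard is Jinja2: Nominatim renders its SQL files through a preprocessor that exposes the names of existing tables as db.tables, so the dummy table is only created on a database where no importance data has been imported yet. A minimal, self-contained sketch of how such a guard renders, with plain jinja2 standing in for Nominatim's actual SQLPreprocessor setup:

# How the {% if %} guard above behaves when rendered (sketch; assumes
# plain jinja2 rather than Nominatim's real SQLPreprocessor environment).
import jinja2

TEMPLATE = """\
{% if 'wikimedia_importance' not in db.tables %}
CREATE TABLE wikimedia_importance (language TEXT NOT NULL);
{% endif %}"""

env = jinja2.Environment()

# No importance table yet: the CREATE TABLE is emitted.
print(env.from_string(TEMPLATE).render(db={'tables': {'placex'}}))

# Table already imported: renders empty, the expensive static data is left alone.
print(env.from_string(TEMPLATE).render(db={'tables': {'wikimedia_importance'}}))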

nominatim/clicmd/refresh.py

+11 -8

@@ -89,6 +89,7 @@ def run(self, args: NominatimArgs) -> int: #pylint: disable=too-many-branches, t
         from ..tools import refresh, postcodes
         from ..indexer.indexer import Indexer
 
+        need_function_refresh = args.functions
 
         if args.postcodes:
             if postcodes.can_compute(args.config.get_libpq_dsn()):
@@ -131,13 +132,7 @@ def run(self, args: NominatimArgs) -> int: #pylint: disable=too-many-branches, t
                                                    args.project_dir) > 0:
                 LOG.fatal('FATAL: Cannot update secondary importance raster data')
                 return 1
-
-        if args.functions:
-            LOG.warning('Create functions')
-            with connect(args.config.get_libpq_dsn()) as conn:
-                refresh.create_functions(conn, args.config,
-                                         args.diffs, args.enable_debug_statements)
-                self._get_tokenizer(args.config).update_sql_functions(args.config)
+            need_function_refresh = True
 
         if args.wiki_data:
             data_path = Path(args.config.WIKIPEDIA_DATA_PATH
@@ -147,8 +142,16 @@ def run(self, args: NominatimArgs) -> int: #pylint: disable=too-many-branches, t
                                                  data_path) > 0:
                 LOG.fatal('FATAL: Wikipedia importance file not found in %s', data_path)
                 return 1
+            need_function_refresh = True
+
+        if need_function_refresh:
+            LOG.warning('Create functions')
+            with connect(args.config.get_libpq_dsn()) as conn:
+                refresh.create_functions(conn, args.config,
+                                         args.diffs, args.enable_debug_statements)
+                self._get_tokenizer(args.config).update_sql_functions(args.config)
 
-        # Attention: importance MUST come after wiki data import.
+        # Attention: importance MUST come after wiki data import and after functions.
         if args.importance:
             LOG.warning('Update importance values for database')
             with connect(args.config.get_libpq_dsn()) as conn:
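Both data imports can change which table variant the SQL functions must be generated against, so the function refresh is now deferred behind a flag and executed at most once. A schematic, self-contained sketch of the resulting control flow, with stand-in names rather than Nominatim's actual API:

# Deferred-refresh pattern from the diff above, reduced to a toy example.
from dataclasses import dataclass

@dataclass
class Args:
    functions: bool = False
    secondary_importance: bool = False
    wiki_data: bool = False

def run(args: Args) -> None:
    need_function_refresh = args.functions    # explicit --functions request
    if args.secondary_importance:
        print('import raster data')           # may create a new table ...
        need_function_refresh = True          # ... that the functions use
    if args.wiki_data:
        print('import importance data')       # may switch table formats
        need_function_refresh = True
    if need_function_refresh:
        print('create functions')             # once, however it was triggered

run(Args(functions=True, wiki_data=True))     # 'create functions' prints once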

nominatim/db/utils.py

+8

@@ -92,6 +92,11 @@ def __enter__(self) -> 'CopyBuffer':
         return self
 
 
+    def size(self) -> int:
+        """ Return the number of bytes the buffer currently contains.
+        """
+        return self.buffer.tell()
+
     def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
         if self.buffer is not None:
             self.buffer.close()
@@ -115,7 +120,10 @@ def add(self, *data: Any) -> None:
 
     def copy_out(self, cur: Cursor, table: str, columns: Optional[Iterable[str]] = None) -> None:
         """ Copy all collected data into the given table.
+
+            The buffer is empty and reusable after this operation.
         """
         if self.buffer.tell() > 0:
             self.buffer.seek(0)
             cur.copy_from(self.buffer, table, columns=columns)
+            self.buffer = io.StringIO()
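size() plus the buffer reset at the end of copy_out() let a caller stream arbitrarily large inputs through one CopyBuffer in bounded-memory batches. A standalone sketch of that usage pattern, using plain io.StringIO and psycopg2 directly (the DSN and table are placeholders, and values are assumed to contain no tabs or newlines):

# Batched COPY, mirroring what CopyBuffer.size()/copy_out() enable. Sketch only.
import io
import psycopg2

def copy_rows(dsn: str, rows) -> None:
    buf = io.StringIO()
    with psycopg2.connect(dsn) as conn:
        with conn.cursor() as cur:
            for language, title, importance in rows:
                buf.write(f'{language}\t{title}\t{importance}\n')
                if buf.tell() > 10_000_000:        # ~10 MB, as in refresh.py below
                    buf.seek(0)
                    cur.copy_from(buf, 'wikimedia_importance',
                                  columns=('language', 'title', 'importance'))
                    buf = io.StringIO()            # fresh buffer, like copy_out()
            if buf.tell() > 0:                     # flush the final partial batch
                buf.seek(0)
                cur.copy_from(buf, 'wikimedia_importance',
                              columns=('language', 'title', 'importance'))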

nominatim/tools/check_database.py

+4 -1

@@ -248,7 +248,10 @@ def check_existance_wikipedia(conn: Connection, _: Configuration) -> CheckResult
         return CheckState.NOT_APPLICABLE
 
     with conn.cursor() as cur:
-        cnt = cur.scalar('SELECT count(*) FROM wikipedia_article')
+        if conn.table_exists('wikimedia_importance'):
+            cnt = cur.scalar('SELECT count(*) FROM wikimedia_importance')
+        else:
+            cnt = cur.scalar('SELECT count(*) FROM wikipedia_article')
 
     return CheckState.WARN if cnt == 0 else CheckState.OK
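conn.table_exists is Nominatim's own connection helper; the check amounts to a standard information_schema lookup. A rough stand-in for readers unfamiliar with it (a sketch under that assumption, not the actual implementation):

# Approximate equivalent of Connection.table_exists(); an assumption,
# not the real Nominatim code.
def table_exists(conn, table: str) -> bool:
    with conn.cursor() as cur:
        cur.execute("""SELECT count(*) FROM information_schema.tables
                       WHERE table_name = %s AND table_schema = 'public'""",
                    (table,))
        return cur.fetchone()[0] > 0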

nominatim/tools/refresh.py

+75 -5

@@ -8,6 +8,8 @@
 Functions for bringing auxiliary data in the database up-to-date.
 """
 from typing import MutableSequence, Tuple, Any, Type, Mapping, Sequence, List, cast
+import csv
+import gzip
 import logging
 from textwrap import dedent
 from pathlib import Path
@@ -16,7 +18,7 @@
 
 from nominatim.config import Configuration
 from nominatim.db.connection import Connection, connect
-from nominatim.db.utils import execute_file
+from nominatim.db.utils import execute_file, CopyBuffer
 from nominatim.db.sql_preprocessor import SQLPreprocessor
 from nominatim.version import NOMINATIM_VERSION
 
@@ -132,21 +134,89 @@ def import_wikipedia_articles(dsn: str, data_path: Path, ignore_errors: bool = False
         Returns 0 if all was well and 1 if the importance file could not
         be found. Throws an exception if there was an error reading the file.
     """
-    datafile = data_path / 'wikimedia-importance.sql.gz'
+    if import_importance_csv(dsn, data_path / 'wikimedia-importance.csv.gz') == 0 \
+       or import_importance_sql(dsn, data_path / 'wikimedia-importance.sql.gz',
+                                ignore_errors) == 0:
+        return 0
 
-    if not datafile.exists():
+    return 1
+
+
+def import_importance_csv(dsn: str, data_file: Path) -> int:
+    """ Replace wikipedia importance table with data from a
+        single CSV file.
+
+        The file must be a gzipped CSV and have the following columns:
+        language, title, importance, wikidata_id
+
+        Other columns may be present but will be ignored.
+    """
+    if not data_file.exists():
+        return 1
+
+    # Only import the first occurrence of a wikidata ID.
+    # This keeps indexes and table small.
+    wd_done = set()
+
+    with connect(dsn) as conn:
+        with conn.cursor() as cur:
+            cur.drop_table('wikipedia_article')
+            cur.drop_table('wikipedia_redirect')
+            cur.drop_table('wikimedia_importance')
+            cur.execute("""CREATE TABLE wikimedia_importance (
+                             language TEXT NOT NULL,
+                             title TEXT NOT NULL,
+                             importance double precision NOT NULL,
+                             wikidata TEXT
+                           ) """)
+
+        with gzip.open(str(data_file), 'rt') as fd, CopyBuffer() as buf:
+            for row in csv.DictReader(fd, delimiter='\t', quotechar='|'):
+                wd_id = int(row['wikidata_id'][1:])
+                buf.add(row['language'], row['title'], row['importance'],
+                        None if wd_id in wd_done else row['wikidata_id'])
+                wd_done.add(wd_id)
+
+                if buf.size() > 10000000:
+                    with conn.cursor() as cur:
+                        buf.copy_out(cur, 'wikimedia_importance',
+                                     columns=['language', 'title', 'importance',
+                                              'wikidata'])
+
+            with conn.cursor() as cur:
+                buf.copy_out(cur, 'wikimedia_importance',
+                             columns=['language', 'title', 'importance', 'wikidata'])
+
+        with conn.cursor() as cur:
+            cur.execute("""CREATE INDEX IF NOT EXISTS idx_wikimedia_importance_title
+                           ON wikimedia_importance (title)""")
+            cur.execute("""CREATE INDEX IF NOT EXISTS idx_wikimedia_importance_wikidata
+                           ON wikimedia_importance (wikidata)
+                           WHERE wikidata is not null""")
+
+        conn.commit()
+
+    return 0
+
+
+def import_importance_sql(dsn: str, data_file: Path, ignore_errors: bool) -> int:
+    """ Replace wikipedia importance table with data from an SQL file.
+    """
+    if not data_file.exists():
         return 1
 
     pre_code = """BEGIN;
                   DROP TABLE IF EXISTS "wikipedia_article";
-                  DROP TABLE IF EXISTS "wikipedia_redirect"
+                  DROP TABLE IF EXISTS "wikipedia_redirect";
+                  DROP TABLE IF EXISTS "wikipedia_importance";
               """
     post_code = "COMMIT"
-    execute_file(dsn, datafile, ignore_errors=ignore_errors,
+    execute_file(dsn, data_file, ignore_errors=ignore_errors,
                  pre_code=pre_code, post_code=post_code)
 
     return 0
 
+
 def import_secondary_importance(dsn: str, data_path: Path, ignore_errors: bool = False) -> int:
     """ Replaces the secondary importance raster data table with new data.
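import_importance_csv expects wikimedia-importance.csv.gz to be a gzipped, tab-separated file with a header row, | as quote character, and at least the columns language, title, importance and wikidata_id. A self-contained sketch that writes such a file and reads it back with the same wikidata deduplication rule (the data rows are invented for illustration):

# Round-trip a toy wikimedia-importance.csv.gz in the format the importer expects.
import csv
import gzip

FIELDS = ['language', 'title', 'importance', 'wikidata_id']
rows = [
    {'language': 'en', 'title': 'Berlin', 'importance': '0.81', 'wikidata_id': 'Q64'},
    {'language': 'de', 'title': 'Berlin', 'importance': '0.81', 'wikidata_id': 'Q64'},
]

with gzip.open('wikimedia-importance.csv.gz', 'wt') as fd:
    writer = csv.DictWriter(fd, fieldnames=FIELDS, delimiter='\t', quotechar='|')
    writer.writeheader()
    writer.writerows(rows)

# Read it back the way import_importance_csv() does: only the first row for a
# given wikidata ID keeps the ID, which keeps the partial index small.
wd_done = set()
with gzip.open('wikimedia-importance.csv.gz', 'rt') as fd:
    for row in csv.DictReader(fd, delimiter='\t', quotechar='|'):
        wd_id = int(row['wikidata_id'][1:])        # 'Q64' -> 64
        wikidata = None if wd_id in wd_done else row['wikidata_id']
        wd_done.add(wd_id)
        print(row['language'], row['title'], row['importance'], wikidata)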

test/python/cli/test_cmd_refresh.py

+5 -1

@@ -28,6 +28,7 @@ def setup_cli_call(self, cli_call, temp_db, cli_tokenizer_mock):
                                               ('website', 'setup_website'),
                                              ])
     def test_refresh_command(self, mock_func_factory, command, func):
+        mock_func_factory(nominatim.tools.refresh, 'create_functions')
         func_mock = mock_func_factory(nominatim.tools.refresh, func)
 
         assert self.call_nominatim('refresh', '--' + command) == 0
@@ -71,6 +72,7 @@ def test_refresh_wikidata_file_not_found(self, monkeypatch):
 
         assert self.call_nominatim('refresh', '--wiki-data') == 1
 
+
     def test_refresh_secondary_importance_file_not_found(self):
         assert self.call_nominatim('refresh', '--secondary-importance') == 1
 
@@ -84,16 +86,18 @@ def test_refresh_secondary_importance_new_table(self, mock_func_factory):
         assert mocks[1].called == 1
 
 
-    def test_refresh_importance_computed_after_wiki_import(self, monkeypatch):
+    def test_refresh_importance_computed_after_wiki_import(self, monkeypatch, mock_func_factory):
         calls = []
         monkeypatch.setattr(nominatim.tools.refresh, 'import_wikipedia_articles',
                             lambda *args, **kwargs: calls.append('import') or 0)
         monkeypatch.setattr(nominatim.tools.refresh, 'recompute_importance',
                             lambda *args, **kwargs: calls.append('update'))
+        func_mock = mock_func_factory(nominatim.tools.refresh, 'create_functions')
 
         assert self.call_nominatim('refresh', '--importance', '--wiki-data') == 0
 
         assert calls == ['import', 'update']
+        assert func_mock.called == 1
 
     @pytest.mark.parametrize('params', [('--data-object', 'w234'),
                                         ('--data-object', 'N23', '--data-object', 'N24'),

test/python/mocks.py

+5 -4

@@ -54,16 +54,17 @@ class text,
 
     def add(self, osm_type='N', osm_id=None, cls='amenity', typ='cafe', names=None,
             admin_level=None, address=None, extratags=None, geom='POINT(10 4)',
-            country=None, housenumber=None):
+            country=None, housenumber=None, rank_search=30):
         with self.conn.cursor() as cur:
             psycopg2.extras.register_hstore(cur)
             cur.execute("""INSERT INTO placex (place_id, osm_type, osm_id, class,
                                                type, name, admin_level, address,
-                                               housenumber,
+                                               housenumber, rank_search,
                                                extratags, geometry, country_code)
-                           VALUES(nextval('seq_place'), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
+                           VALUES(nextval('seq_place'), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
                         (osm_type, osm_id or next(self.idseq), cls, typ, names,
-                         admin_level, address, housenumber, extratags, 'SRID=4326;' + geom,
+                         admin_level, address, housenumber, rank_search,
+                         extratags, 'SRID=4326;' + geom,
                          country))
         self.conn.commit()

test/python/tools/test_refresh.py

+1 -2

@@ -35,8 +35,7 @@ def test_refresh_import_secondary_importance_testdb(dsn, src_dir, temp_db_conn,
 @pytest.mark.parametrize("replace", (True, False))
 def test_refresh_import_wikipedia(dsn, src_dir, table_factory, temp_db_cursor, replace):
     if replace:
-        table_factory('wikipedia_article')
-        table_factory('wikipedia_redirect')
+        table_factory('wikimedia_importance')
 
     # use the small wikipedia file for the API testdb
     assert refresh.import_wikipedia_articles(dsn, src_dir / 'test' / 'testdb') == 0
