 Functions for bringing auxiliary data in the database up-to-date.
 """
 from typing import MutableSequence, Tuple, Any, Type, Mapping, Sequence, List, cast
+import csv
+import gzip
 import logging
 from textwrap import dedent
 from pathlib import Path
 
 from nominatim.config import Configuration
 from nominatim.db.connection import Connection, connect
-from nominatim.db.utils import execute_file
+from nominatim.db.utils import execute_file, CopyBuffer
 from nominatim.db.sql_preprocessor import SQLPreprocessor
 from nominatim.version import NOMINATIM_VERSION
 
@@ -132,21 +134,89 @@ def import_wikipedia_articles(dsn: str, data_path: Path, ignore_errors: bool = False) -> int:
     Returns 0 if all was well and 1 if the importance file could not
     be found. Throws an exception if there was an error reading the file.
     """
-    datafile = data_path / 'wikimedia-importance.sql.gz'
+    if import_importance_csv(dsn, data_path / 'wikimedia-importance.csv.gz') == 0 \
+       or import_importance_sql(dsn, data_path / 'wikimedia-importance.sql.gz',
+                                ignore_errors) == 0:
+        return 0
 
-    if not datafile.exists():
+    return 1
+
+
+def import_importance_csv(dsn: str, data_file: Path) -> int:
+    """ Replace wikipedia importance table with data from a
+        single CSV file.
+
+        The file must be a gzipped CSV and have the following columns:
+        language, title, importance, wikidata_id
+
+        Other columns may be present but will be ignored.
+    """
+    if not data_file.exists():
+        return 1
+
+    # Only import the first occurrence of a wikidata ID.
+    # This keeps the table and its indexes small.
+    wd_done = set()
+
+    with connect(dsn) as conn:
+        with conn.cursor() as cur:
+            cur.drop_table('wikipedia_article')
+            cur.drop_table('wikipedia_redirect')
+            cur.drop_table('wikimedia_importance')
+            cur.execute("""CREATE TABLE wikimedia_importance (
+                             language TEXT NOT NULL,
+                             title TEXT NOT NULL,
+                             importance double precision NOT NULL,
+                             wikidata TEXT
+                           ) """)
+
+        with gzip.open(str(data_file), 'rt') as fd, CopyBuffer() as buf:
+            for row in csv.DictReader(fd, delimiter='\t', quotechar='|'):
+                wd_id = int(row['wikidata_id'][1:])
+                buf.add(row['language'], row['title'], row['importance'],
+                        None if wd_id in wd_done else row['wikidata_id'])
+                wd_done.add(wd_id)
+
+                if buf.size() > 10000000:
+                    with conn.cursor() as cur:
+                        buf.copy_out(cur, 'wikimedia_importance',
+                                     columns=['language', 'title', 'importance',
+                                              'wikidata'])
+
+            with conn.cursor() as cur:
+                buf.copy_out(cur, 'wikimedia_importance',
+                             columns=['language', 'title', 'importance', 'wikidata'])
+
+        with conn.cursor() as cur:
+            cur.execute("""CREATE INDEX IF NOT EXISTS idx_wikimedia_importance_title
+                           ON wikimedia_importance (title)""")
+            cur.execute("""CREATE INDEX IF NOT EXISTS idx_wikimedia_importance_wikidata
+                           ON wikimedia_importance (wikidata)
+                           WHERE wikidata is not null""")
+
+        conn.commit()
+
+    return 0
+
+
+def import_importance_sql(dsn: str, data_file: Path, ignore_errors: bool) -> int:
+    """ Replace wikipedia importance table with data from an SQL file.
+    """
+    if not data_file.exists():
         return 1
 
     pre_code = """BEGIN;
                   DROP TABLE IF EXISTS "wikipedia_article";
-                  DROP TABLE IF EXISTS "wikipedia_redirect"
+                  DROP TABLE IF EXISTS "wikipedia_redirect";
+                  DROP TABLE IF EXISTS "wikipedia_importance";
                   """
     post_code = "COMMIT"
-    execute_file(dsn, datafile, ignore_errors=ignore_errors,
+    execute_file(dsn, data_file, ignore_errors=ignore_errors,
                  pre_code=pre_code, post_code=post_code)
 
     return 0
 
+
 def import_secondary_importance(dsn: str, data_path: Path, ignore_errors: bool = False) -> int:
     """ Replaces the secondary importance raster data table with new data.
 
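
For reference, a minimal sketch of how the new CSV path could be exercised; it is not part of the change set. The sample rows, the file name and the dsn are made up, and the import of import_importance_csv from nominatim.tools.refresh is an assumption about where this module lives. What is taken from the diff is the expected layout: a gzip-compressed, tab-separated file with a header row and the columns language, title, importance and wikidata_id.

    import gzip
    from pathlib import Path

    # Assumed module path for the file changed in this diff.
    from nominatim.tools.refresh import import_importance_csv

    # Hypothetical sample data in the documented layout (tab-separated,
    # header row, columns: language, title, importance, wikidata_id).
    rows = [
        "language\ttitle\timportance\twikidata_id",
        "en\tBerlin\t0.9267\tQ64",
        "de\tBerlin\t0.9267\tQ64",   # repeated wikidata id -> stored with wikidata = NULL
    ]

    with gzip.open('wikimedia-importance.csv.gz', 'wt') as fd:
        fd.write('\n'.join(rows) + '\n')

    # Requires a reachable PostgreSQL database; the dsn is a placeholder.
    import_importance_csv('dbname=nominatim', Path('wikimedia-importance.csv.gz'))

Note that import_wikipedia_articles() itself looks for data_path / 'wikimedia-importance.csv.gz' first and only falls back to the SQL dump when the CSV file is missing.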
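
A second hedged sketch, showing why the partial index on wikidata is enough: since only the first occurrence of a wikidata id keeps the id (duplicates are stored with wikidata = NULL), a lookup by wikidata id goes through a small index and yields at most one row. The dsn and the id Q64 are placeholders; connect() is the helper already imported at the top of the module.

    from nominatim.db.connection import connect

    with connect('dbname=nominatim') as conn:
        with conn.cursor() as cur:
            # Uses idx_wikimedia_importance_wikidata, which only covers
            # rows where wikidata IS NOT NULL.
            cur.execute("""SELECT language, title, importance
                             FROM wikimedia_importance
                            WHERE wikidata = %s""", ('Q64', ))
            print(cur.fetchone())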