7
7
"""
8
8
Main work horse for indexing (computing addresses) the database.
9
9
"""
10
- from typing import cast , List , Any
10
+ from typing import cast , List , Any , Optional
11
11
import logging
12
12
import time
13
13
@@ -83,9 +83,30 @@ async def index_boundaries(self, minrank: int, maxrank: int) -> int:
83
83
LOG .warning ("Starting indexing boundaries using %s threads" ,
84
84
self .num_threads )
85
85
86
+ minrank = max (minrank , 4 )
87
+ maxrank = min (maxrank , 25 )
88
+
89
+ # Precompute the number of rows to process for each rank
90
+ with connect (self .dsn ) as conn :
91
+ hstore_info = psycopg .types .TypeInfo .fetch (conn , "hstore" )
92
+ if hstore_info is None :
93
+ raise RuntimeError ('Hstore extension is requested but not installed.' )
94
+ psycopg .types .hstore .register_hstore (hstore_info )
95
+
96
+ with conn .cursor () as cur :
97
+ cur = conn .execute (""" SELECT rank_search, count(*)
98
+ FROM placex
99
+ WHERE rank_search between %s and %s
100
+ AND class = 'boundary' and type = 'administrative'
101
+ AND indexed_status > 0
102
+ GROUP BY rank_search""" ,
103
+ (minrank , maxrank ))
104
+ total_tuples = {row .rank_search : row .count for row in cur }
105
+
86
106
with self .tokenizer .name_analyzer () as analyzer :
87
- for rank in range (max (minrank , 4 ), min (maxrank , 26 )):
88
- total += await self ._index (runners .BoundaryRunner (rank , analyzer ))
107
+ for rank in range (minrank , maxrank + 1 ):
108
+ total += await self ._index (runners .BoundaryRunner (rank , analyzer ),
109
+ total_tuples = total_tuples .get (rank , 0 ))
89
110
90
111
return total
91
112
@@ -101,6 +122,23 @@ async def index_by_rank(self, minrank: int, maxrank: int) -> int:
101
122
LOG .warning ("Starting indexing rank (%i to %i) using %i threads" ,
102
123
minrank , maxrank , self .num_threads )
103
124
125
+ # Precompute the number of rows to process for each rank
126
+ with connect (self .dsn ) as conn :
127
+ hstore_info = psycopg .types .TypeInfo .fetch (conn , "hstore" )
128
+ if hstore_info is None :
129
+ raise RuntimeError ('Hstore extension is requested but not installed.' )
130
+ psycopg .types .hstore .register_hstore (hstore_info )
131
+
132
+ with conn .cursor () as cur :
133
+ cur = conn .execute (""" SELECT rank_address, count(*)
134
+ FROM placex
135
+ WHERE rank_address between %s and %s
136
+ AND indexed_status > 0
137
+ GROUP BY rank_address""" ,
138
+ (minrank , maxrank ))
139
+ total_tuples = {row .rank_address : row .count for row in cur }
140
+
141
+
104
142
with self .tokenizer .name_analyzer () as analyzer :
105
143
for rank in range (max (1 , minrank ), maxrank + 1 ):
106
144
if rank >= 30 :
@@ -109,11 +147,12 @@ async def index_by_rank(self, minrank: int, maxrank: int) -> int:
109
147
batch = 5
110
148
else :
111
149
batch = 1
112
- total += await self ._index (runners .RankRunner (rank , analyzer ), batch )
150
+ total += await self ._index (runners .RankRunner (rank , analyzer ),
151
+ batch = batch , total_tuples = total_tuples .get (rank , 0 ))
113
152
114
153
if maxrank == 30 :
115
154
total += await self ._index (runners .RankRunner (0 , analyzer ))
116
- total += await self ._index (runners .InterpolationRunner (analyzer ), 20 )
155
+ total += await self ._index (runners .InterpolationRunner (analyzer ), batch = 20 )
117
156
118
157
return total
119
158
@@ -123,7 +162,7 @@ async def index_postcodes(self) -> int:
123
162
"""
124
163
LOG .warning ("Starting indexing postcodes using %s threads" , self .num_threads )
125
164
126
- return await self ._index (runners .PostcodeRunner (), 20 )
165
+ return await self ._index (runners .PostcodeRunner (), batch = 20 )
127
166
128
167
129
168
def update_status_table (self ) -> None :
@@ -135,14 +174,20 @@ def update_status_table(self) -> None:
135
174
136
175
conn .commit ()
137
176
138
- async def _index (self , runner : runners .Runner , batch : int = 1 ) -> int :
177
+ async def _index (self , runner : runners .Runner , batch : int = 1 ,
178
+ total_tuples : Optional [int ] = None ) -> int :
139
179
""" Index a single rank or table. `runner` describes the SQL to use
140
180
for indexing. `batch` describes the number of objects that
141
- should be processed with a single SQL statement
181
+ should be processed with a single SQL statement.
182
+
183
+ `total_tuples` may contain the total number of rows to process.
184
+ When not supplied, the value will be computed using the
185
+ appropriate runner function.
142
186
"""
143
187
LOG .warning ("Starting %s (using batch size %s)" , runner .name (), batch )
144
188
145
- total_tuples = self ._prepare_indexing (runner )
189
+ if total_tuples is None :
190
+ total_tuples = self ._prepare_indexing (runner )
146
191
147
192
progress = ProgressLogger (runner .name (), total_tuples )
148
193
0 commit comments