@@ -166,15 +166,15 @@ def build_housenumber_search(self, sdata: dbf.SearchData, hnrs: List[Token],
166
166
sdata .lookups = [dbf .FieldLookup ('name_vector' , [t .token for t in hnrs ], lookups .LookupAny )]
167
167
expected_count = sum (t .count for t in hnrs )
168
168
169
- partials = [ t for trange in address
170
- for t in self .query .get_partials_list (trange )]
169
+ partials = { t . token : t . count for trange in address
170
+ for t in self .query .get_partials_list (trange )}
171
171
172
172
if expected_count < 8000 :
173
173
sdata .lookups .append (dbf .FieldLookup ('nameaddress_vector' ,
174
- [ t . token for t in partials ] , lookups .Restrict ))
175
- elif len (partials ) != 1 or partials [0 ]. count < 10000 :
174
+ list ( partials ) , lookups .Restrict ))
175
+ elif len (partials ) != 1 or list ( partials . values ()) [0 ] < 10000 :
176
176
sdata .lookups .append (dbf .FieldLookup ('nameaddress_vector' ,
177
- [ t . token for t in partials ] , lookups .LookupAll ))
177
+ list ( partials ) , lookups .LookupAll ))
178
178
else :
179
179
sdata .lookups .append (
180
180
dbf .FieldLookup ('nameaddress_vector' ,
@@ -208,18 +208,17 @@ def yield_lookups(self, name: TokenRange, address: List[TokenRange])\
208
208
are and tries to find a lookup that optimizes index use.
209
209
"""
210
210
penalty = 0.0 # extra penalty
211
- name_partials = self .query .get_partials_list (name )
212
- name_tokens = [t .token for t in name_partials ]
211
+ name_partials = {t .token : t for t in self .query .get_partials_list (name )}
213
212
214
213
addr_partials = [t for r in address for t in self .query .get_partials_list (r )]
215
- addr_tokens = [ t .token for t in addr_partials ]
214
+ addr_tokens = list ({ t .token for t in addr_partials })
216
215
217
- partials_indexed = all (t .is_indexed for t in name_partials ) \
216
+ partials_indexed = all (t .is_indexed for t in name_partials . values () ) \
218
217
and all (t .is_indexed for t in addr_partials )
219
- exp_count = min (t .count for t in name_partials ) / (2 ** (len (name_partials ) - 1 ))
218
+ exp_count = min (t .count for t in name_partials . values () ) / (2 ** (len (name_partials ) - 1 ))
220
219
221
220
if (len (name_partials ) > 3 or exp_count < 8000 ) and partials_indexed :
222
- yield penalty , exp_count , dbf .lookup_by_names (name_tokens , addr_tokens )
221
+ yield penalty , exp_count , dbf .lookup_by_names (list ( name_partials . keys ()) , addr_tokens )
223
222
return
224
223
225
224
# Partial term too frequent. Try looking up by rare full names first.
@@ -232,22 +231,25 @@ def yield_lookups(self, name: TokenRange, address: List[TokenRange])\
232
231
addr_tokens = [t .token for t in addr_partials if t .is_indexed ]
233
232
penalty += 1.2 * sum (t .penalty for t in addr_partials if not t .is_indexed )
234
233
# Any of the full names applies with all of the partials from the address
235
- yield penalty , fulls_count / (2 ** len (addr_partials )),\
234
+ yield penalty , fulls_count / (2 ** len (addr_tokens )),\
236
235
dbf .lookup_by_any_name ([t .token for t in name_fulls ],
237
- addr_tokens , fulls_count > 10000 )
236
+ addr_tokens ,
237
+ fulls_count > 30000 / max (1 , len (addr_tokens )))
238
238
239
239
# To catch remaining results, lookup by name and address
240
240
# We only do this if there is a reasonable number of results expected.
241
- exp_count = exp_count / (2 ** len (addr_partials )) if addr_partials else exp_count
242
- if exp_count < 10000 and all (t .is_indexed for t in name_partials ):
243
- lookup = [dbf .FieldLookup ('name_vector' , name_tokens , lookups .LookupAll )]
241
+ exp_count = exp_count / (2 ** len (addr_tokens )) if addr_tokens else exp_count
242
+ if exp_count < 10000 and all (t .is_indexed for t in name_partials . values () ):
243
+ lookup = [dbf .FieldLookup ('name_vector' , list ( name_partials . keys ()) , lookups .LookupAll )]
244
244
if addr_tokens :
245
245
lookup .append (dbf .FieldLookup ('nameaddress_vector' , addr_tokens , lookups .LookupAll ))
246
- penalty += 0.35 * max (0 , 5 - len (name_partials ) - len (addr_tokens ))
246
+ penalty += 0.35 * max (1 if name_fulls else 0.1 ,
247
+ 5 - len (name_partials ) - len (addr_tokens ))
247
248
yield penalty , exp_count , lookup
248
249
249
250
250
- def get_name_ranking (self , trange : TokenRange ) -> dbf .FieldRanking :
251
+ def get_name_ranking (self , trange : TokenRange ,
252
+ db_field : str = 'name_vector' ) -> dbf .FieldRanking :
251
253
""" Create a ranking expression for a name term in the given range.
252
254
"""
253
255
name_fulls = self .query .get_tokens (trange , TokenType .WORD )
@@ -256,7 +258,7 @@ def get_name_ranking(self, trange: TokenRange) -> dbf.FieldRanking:
256
258
# Fallback, sum of penalty for partials
257
259
name_partials = self .query .get_partials_list (trange )
258
260
default = sum (t .penalty for t in name_partials ) + 0.2
259
- return dbf .FieldRanking ('name_vector' , default , ranks )
261
+ return dbf .FieldRanking (db_field , default , ranks )
260
262
261
263
262
264
def get_addr_ranking (self , trange : TokenRange ) -> dbf .FieldRanking :
@@ -314,11 +316,9 @@ def get_search_data(self, assignment: TokenAssignment) -> Optional[dbf.SearchDat
314
316
sdata = dbf .SearchData ()
315
317
sdata .penalty = assignment .penalty
316
318
if assignment .country :
317
- tokens = self .query .get_tokens (assignment .country , TokenType .COUNTRY )
318
- if self .details .countries :
319
- tokens = [t for t in tokens if t .lookup_word in self .details .countries ]
320
- if not tokens :
321
- return None
319
+ tokens = self .get_country_tokens (assignment .country )
320
+ if not tokens :
321
+ return None
322
322
sdata .set_strings ('countries' , tokens )
323
323
elif self .details .countries :
324
324
sdata .countries = dbf .WeightedStrings (self .details .countries ,
@@ -332,24 +332,54 @@ def get_search_data(self, assignment: TokenAssignment) -> Optional[dbf.SearchDat
332
332
self .query .get_tokens (assignment .postcode ,
333
333
TokenType .POSTCODE ))
334
334
if assignment .qualifier :
335
- tokens = self .query .get_tokens (assignment .qualifier , TokenType .QUALIFIER )
336
- if self .details .categories :
337
- tokens = [t for t in tokens if t .get_category () in self .details .categories ]
338
- if not tokens :
339
- return None
335
+ tokens = self .get_qualifier_tokens (assignment .qualifier )
336
+ if not tokens :
337
+ return None
340
338
sdata .set_qualifiers (tokens )
341
339
elif self .details .categories :
342
340
sdata .qualifiers = dbf .WeightedCategories (self .details .categories ,
343
341
[0.0 ] * len (self .details .categories ))
344
342
345
343
if assignment .address :
346
- sdata .set_ranking ([self .get_addr_ranking (r ) for r in assignment .address ])
344
+ if not assignment .name and assignment .housenumber :
345
+ # housenumber search: the first item needs to be handled like
346
+ # a name in ranking or penalties are not comparable with
347
+ # normal searches.
348
+ sdata .set_ranking ([self .get_name_ranking (assignment .address [0 ],
349
+ db_field = 'nameaddress_vector' )]
350
+ + [self .get_addr_ranking (r ) for r in assignment .address [1 :]])
351
+ else :
352
+ sdata .set_ranking ([self .get_addr_ranking (r ) for r in assignment .address ])
347
353
else :
348
354
sdata .rankings = []
349
355
350
356
return sdata
351
357
352
358
359
def get_country_tokens(self, trange: TokenRange) -> List[Token]:
    """ Collect the country tokens found in the given token range.

        When the search details carry a list of countries, only
        tokens whose lookup word is part of that list are returned.
        Otherwise all country tokens of the range are returned.
    """
    found = self.query.get_tokens(trange, TokenType.COUNTRY)
    if not self.details.countries:
        # No country restriction requested: hand back everything.
        return found

    return [tok for tok in found if tok.lookup_word in self.details.countries]
369
+
370
+
371
def get_qualifier_tokens(self, trange: TokenRange) -> List[Token]:
    """ Collect the qualifier tokens found in the given token range.

        When the search details carry a list of categories, only
        tokens whose category is part of that list are returned.
        Otherwise all qualifier tokens of the range are returned.
    """
    found = self.query.get_tokens(trange, TokenType.QUALIFIER)
    if not self.details.categories:
        # No category restriction requested: hand back everything.
        return found

    return [tok for tok in found if tok.get_category() in self.details.categories]
381
+
382
+
353
383
def get_near_items (self , assignment : TokenAssignment ) -> Optional [dbf .WeightedCategories ]:
354
384
""" Collect tokens for near items search or use the categories
355
385
requested per parameter.
0 commit comments