Skip to content

Commit 3cc3e3b

Browse files
authored
Merge pull request #3321 from lonvia/remove-duplicate-partials
Improvements to query parsing
2 parents dcebea3 + f07f853 commit 3cc3e3b

File tree

3 files changed

+77
-31
lines changed

3 files changed

+77
-31
lines changed

nominatim/api/search/db_search_builder.py

+60-30
Original file line numberDiff line numberDiff line change
@@ -166,15 +166,15 @@ def build_housenumber_search(self, sdata: dbf.SearchData, hnrs: List[Token],
166166
sdata.lookups = [dbf.FieldLookup('name_vector', [t.token for t in hnrs], lookups.LookupAny)]
167167
expected_count = sum(t.count for t in hnrs)
168168

169-
partials = [t for trange in address
170-
for t in self.query.get_partials_list(trange)]
169+
partials = {t.token: t.count for trange in address
170+
for t in self.query.get_partials_list(trange)}
171171

172172
if expected_count < 8000:
173173
sdata.lookups.append(dbf.FieldLookup('nameaddress_vector',
174-
[t.token for t in partials], lookups.Restrict))
175-
elif len(partials) != 1 or partials[0].count < 10000:
174+
list(partials), lookups.Restrict))
175+
elif len(partials) != 1 or list(partials.values())[0] < 10000:
176176
sdata.lookups.append(dbf.FieldLookup('nameaddress_vector',
177-
[t.token for t in partials], lookups.LookupAll))
177+
list(partials), lookups.LookupAll))
178178
else:
179179
sdata.lookups.append(
180180
dbf.FieldLookup('nameaddress_vector',
@@ -208,18 +208,17 @@ def yield_lookups(self, name: TokenRange, address: List[TokenRange])\
208208
are and tries to find a lookup that optimizes index use.
209209
"""
210210
penalty = 0.0 # extra penalty
211-
name_partials = self.query.get_partials_list(name)
212-
name_tokens = [t.token for t in name_partials]
211+
name_partials = {t.token: t for t in self.query.get_partials_list(name)}
213212

214213
addr_partials = [t for r in address for t in self.query.get_partials_list(r)]
215-
addr_tokens = [t.token for t in addr_partials]
214+
addr_tokens = list({t.token for t in addr_partials})
216215

217-
partials_indexed = all(t.is_indexed for t in name_partials) \
216+
partials_indexed = all(t.is_indexed for t in name_partials.values()) \
218217
and all(t.is_indexed for t in addr_partials)
219-
exp_count = min(t.count for t in name_partials) / (2**(len(name_partials) - 1))
218+
exp_count = min(t.count for t in name_partials.values()) / (2**(len(name_partials) - 1))
220219

221220
if (len(name_partials) > 3 or exp_count < 8000) and partials_indexed:
222-
yield penalty, exp_count, dbf.lookup_by_names(name_tokens, addr_tokens)
221+
yield penalty, exp_count, dbf.lookup_by_names(list(name_partials.keys()), addr_tokens)
223222
return
224223

225224
# Partial term too frequent. Try looking up by rare full names first.
@@ -232,22 +231,25 @@ def yield_lookups(self, name: TokenRange, address: List[TokenRange])\
232231
addr_tokens = [t.token for t in addr_partials if t.is_indexed]
233232
penalty += 1.2 * sum(t.penalty for t in addr_partials if not t.is_indexed)
234233
# Any of the full names applies with all of the partials from the address
235-
yield penalty, fulls_count / (2**len(addr_partials)),\
234+
yield penalty, fulls_count / (2**len(addr_tokens)),\
236235
dbf.lookup_by_any_name([t.token for t in name_fulls],
237-
addr_tokens, fulls_count > 10000)
236+
addr_tokens,
237+
fulls_count > 30000 / max(1, len(addr_tokens)))
238238

239239
# To catch remaining results, lookup by name and address
240240
# We only do this if there is a reasonable number of results expected.
241-
exp_count = exp_count / (2**len(addr_partials)) if addr_partials else exp_count
242-
if exp_count < 10000 and all(t.is_indexed for t in name_partials):
243-
lookup = [dbf.FieldLookup('name_vector', name_tokens, lookups.LookupAll)]
241+
exp_count = exp_count / (2**len(addr_tokens)) if addr_tokens else exp_count
242+
if exp_count < 10000 and all(t.is_indexed for t in name_partials.values()):
243+
lookup = [dbf.FieldLookup('name_vector', list(name_partials.keys()), lookups.LookupAll)]
244244
if addr_tokens:
245245
lookup.append(dbf.FieldLookup('nameaddress_vector', addr_tokens, lookups.LookupAll))
246-
penalty += 0.35 * max(0, 5 - len(name_partials) - len(addr_tokens))
246+
penalty += 0.35 * max(1 if name_fulls else 0.1,
247+
5 - len(name_partials) - len(addr_tokens))
247248
yield penalty, exp_count, lookup
248249

249250

250-
def get_name_ranking(self, trange: TokenRange) -> dbf.FieldRanking:
251+
def get_name_ranking(self, trange: TokenRange,
252+
db_field: str = 'name_vector') -> dbf.FieldRanking:
251253
""" Create a ranking expression for a name term in the given range.
252254
"""
253255
name_fulls = self.query.get_tokens(trange, TokenType.WORD)
@@ -256,7 +258,7 @@ def get_name_ranking(self, trange: TokenRange) -> dbf.FieldRanking:
256258
# Fallback, sum of penalty for partials
257259
name_partials = self.query.get_partials_list(trange)
258260
default = sum(t.penalty for t in name_partials) + 0.2
259-
return dbf.FieldRanking('name_vector', default, ranks)
261+
return dbf.FieldRanking(db_field, default, ranks)
260262

261263

262264
def get_addr_ranking(self, trange: TokenRange) -> dbf.FieldRanking:
@@ -314,11 +316,9 @@ def get_search_data(self, assignment: TokenAssignment) -> Optional[dbf.SearchDat
314316
sdata = dbf.SearchData()
315317
sdata.penalty = assignment.penalty
316318
if assignment.country:
317-
tokens = self.query.get_tokens(assignment.country, TokenType.COUNTRY)
318-
if self.details.countries:
319-
tokens = [t for t in tokens if t.lookup_word in self.details.countries]
320-
if not tokens:
321-
return None
319+
tokens = self.get_country_tokens(assignment.country)
320+
if not tokens:
321+
return None
322322
sdata.set_strings('countries', tokens)
323323
elif self.details.countries:
324324
sdata.countries = dbf.WeightedStrings(self.details.countries,
@@ -332,24 +332,54 @@ def get_search_data(self, assignment: TokenAssignment) -> Optional[dbf.SearchDat
332332
self.query.get_tokens(assignment.postcode,
333333
TokenType.POSTCODE))
334334
if assignment.qualifier:
335-
tokens = self.query.get_tokens(assignment.qualifier, TokenType.QUALIFIER)
336-
if self.details.categories:
337-
tokens = [t for t in tokens if t.get_category() in self.details.categories]
338-
if not tokens:
339-
return None
335+
tokens = self.get_qualifier_tokens(assignment.qualifier)
336+
if not tokens:
337+
return None
340338
sdata.set_qualifiers(tokens)
341339
elif self.details.categories:
342340
sdata.qualifiers = dbf.WeightedCategories(self.details.categories,
343341
[0.0] * len(self.details.categories))
344342

345343
if assignment.address:
346-
sdata.set_ranking([self.get_addr_ranking(r) for r in assignment.address])
344+
if not assignment.name and assignment.housenumber:
345+
# housenumber search: the first item needs to be handled like
346+
# a name in ranking or penalties are not comparable with
347+
# normal searches.
348+
sdata.set_ranking([self.get_name_ranking(assignment.address[0],
349+
db_field='nameaddress_vector')]
350+
+ [self.get_addr_ranking(r) for r in assignment.address[1:]])
351+
else:
352+
sdata.set_ranking([self.get_addr_ranking(r) for r in assignment.address])
347353
else:
348354
sdata.rankings = []
349355

350356
return sdata
351357

352358

359+
def get_country_tokens(self, trange: TokenRange) -> List[Token]:
360+
""" Return the list of country tokens for the given range,
361+
optionally filtered by the country list from the details
362+
parameters.
363+
"""
364+
tokens = self.query.get_tokens(trange, TokenType.COUNTRY)
365+
if self.details.countries:
366+
tokens = [t for t in tokens if t.lookup_word in self.details.countries]
367+
368+
return tokens
369+
370+
371+
def get_qualifier_tokens(self, trange: TokenRange) -> List[Token]:
372+
""" Return the list of qualifier tokens for the given range,
373+
optionally filtered by the qualifier list from the details
374+
parameters.
375+
"""
376+
tokens = self.query.get_tokens(trange, TokenType.QUALIFIER)
377+
if self.details.categories:
378+
tokens = [t for t in tokens if t.get_category() in self.details.categories]
379+
380+
return tokens
381+
382+
353383
def get_near_items(self, assignment: TokenAssignment) -> Optional[dbf.WeightedCategories]:
354384
""" Collect tokens for near items search or use the categories
355385
requested per parameter.

nominatim/api/search/token_assignment.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,11 @@ def appendable(self, ttype: qmod.TokenType) -> Optional[int]:
132132

133133
# Name tokens are always acceptable and don't change direction
134134
if ttype == qmod.TokenType.PARTIAL:
135+
# qualifiers cannot appear in the middle of the query. They need
136+
# to be near the next phrase.
137+
if self.direction == -1 \
138+
and any(t.ttype == qmod.TokenType.QUALIFIER for t in self.seq[:-1]):
139+
return None
135140
return self.direction
136141

137142
# Other tokens may only appear once
@@ -385,7 +390,7 @@ def get_assignments(self, query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
385390
yield from self._get_assignments_address_backward(base, query)
386391

387392
# variant for special housenumber searches
388-
if base.housenumber:
393+
if base.housenumber and not base.qualifier:
389394
yield dataclasses.replace(base, penalty=self.penalty)
390395

391396

test/python/api/search/test_token_assignment.py

+11
Original file line numberDiff line numberDiff line change
@@ -337,3 +337,14 @@ def test_qualifier_after_housenumber():
337337
(BreakType.WORD, PhraseType.NONE, [(3, TokenType.PARTIAL)]))
338338

339339
check_assignments(yield_token_assignments(q))
340+
341+
342+
def test_qualifier_in_middle_of_phrase():
343+
q = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]),
344+
(BreakType.PHRASE, PhraseType.NONE, [(2, TokenType.PARTIAL)]),
345+
(BreakType.WORD, PhraseType.NONE, [(3, TokenType.QUALIFIER)]),
346+
(BreakType.WORD, PhraseType.NONE, [(4, TokenType.PARTIAL)]),
347+
(BreakType.PHRASE, PhraseType.NONE, [(5, TokenType.PARTIAL)]))
348+
349+
check_assignments(yield_token_assignments(q))
350+

0 commit comments

Comments
 (0)