Skip to content

Commit ace84ed

Browse files
committed
use address counts for improving index lookup
1 parent ff3230a commit ace84ed

File tree

2 files changed

+65
-19
lines changed

2 files changed

+65
-19
lines changed

nominatim/api/search/db_search_builder.py

+59-14
Original file line number | Diff line number | Diff line change
@@ -227,28 +227,73 @@ def yield_lookups(self, name: TokenRange, address: List[TokenRange])\
227227
if name_fulls:
228228
fulls_count = sum(t.count for t in name_fulls)
229229
if len(name_partials) == 1:
230-
penalty += min(1, max(0, (exp_count - 50 * fulls_count) / (1000 * fulls_count)))
231-
# At this point drop unindexed partials from the address.
232-
# This might yield wrong results, nothing we can do about that.
233-
if not partials_indexed:
234-
addr_tokens = [t.token for t in addr_partials if t.is_indexed]
230+
penalty += min(0.5, max(0, (exp_count - 50 * fulls_count) / (2000 * fulls_count)))
231+
if partials_indexed:
235232
penalty += 1.2 * sum(t.penalty for t in addr_partials if not t.is_indexed)
236-
# Any of the full names applies with all of the partials from the address
237-
yield penalty, fulls_count / (2**len(addr_tokens)),\
238-
dbf.lookup_by_any_name([t.token for t in name_fulls],
239-
addr_tokens,
240-
fulls_count > 30000 / max(1, len(addr_tokens)))
233+
234+
yield penalty,fulls_count / (2**len(addr_tokens)), \
235+
self.get_full_name_ranking(name_fulls, addr_partials,
236+
fulls_count > 30000 / max(1, len(addr_tokens)))
241237

242238
# To catch remaining results, lookup by name and address
243239
# We only do this if there is a reasonable number of results expected.
244240
exp_count = exp_count / (2**len(addr_tokens)) if addr_tokens else exp_count
245241
if exp_count < 10000 and all(t.is_indexed for t in name_partials.values()):
246-
lookup = [dbf.FieldLookup('name_vector', list(name_partials.keys()), lookups.LookupAll)]
247-
if addr_tokens:
248-
lookup.append(dbf.FieldLookup('nameaddress_vector', addr_tokens, lookups.LookupAll))
249242
penalty += 0.35 * max(1 if name_fulls else 0.1,
250243
5 - len(name_partials) - len(addr_tokens))
251-
yield penalty, exp_count, lookup
244+
yield penalty, exp_count,\
245+
self.get_name_address_ranking(list(name_partials.keys()), addr_partials)
246+
247+
248+
def get_name_address_ranking(self, name_tokens: List[int],
                             addr_partials: List[Token]) -> List[dbf.FieldLookup]:
    """ Build the list of field lookups for a search by name and address.

        The result always starts with a 'name_vector' lookup over
        'name_tokens' using lookups.LookupAll. Address partials that
        are not indexed are ignored. Of the indexed ones, tokens with
        an address count above 20000 are added as a
        'nameaddress_vector' lookups.Restrict term, all others as a
        'nameaddress_vector' lookups.LookupAll term. A term is only
        added when it has at least one token.
    """
    result = [dbf.FieldLookup('name_vector', name_tokens, lookups.LookupAll)]

    indexed = [t for t in addr_partials if t.is_indexed]
    frequent = [t.token for t in indexed if t.addr_count > 20000]
    rare = [t.token for t in indexed if not t.addr_count > 20000]

    # Same order as before: the restricting term first, then the lookup term.
    for tokens, lookup_type in ((frequent, lookups.Restrict),
                                (rare, lookups.LookupAll)):
        if tokens:
            result.append(dbf.FieldLookup('nameaddress_vector', tokens, lookup_type))

    return result
271+
272+
273+
def get_full_name_ranking(self, name_fulls: List[Token], addr_partials: List[Token],
                          use_lookup: bool) -> List[dbf.FieldLookup]:
    """ Create a ranking expression with full name terms and
        additional address lookup. When 'use_lookup' is true, then
        indexed address tokens with an address count of at most 20000
        are handed over as lookup tokens and the more frequent ones
        as restrict tokens. When it is false, all indexed address
        tokens are handed over as restrict tokens.
    """
    # Unindexed partials are dropped from the address at this point.
    # This might yield wrong results, nothing we can do about that.
    indexed = [t for t in addr_partials if t.is_indexed]

    if not use_lookup:
        addr_restrict_tokens = [t.token for t in indexed]
        addr_lookup_tokens = []
    else:
        addr_restrict_tokens = [t.token for t in indexed if t.addr_count > 20000]
        addr_lookup_tokens = [t.token for t in indexed if not t.addr_count > 20000]

    full_name_tokens = [t.token for t in name_fulls]
    return dbf.lookup_by_any_name(full_name_tokens,
                                  addr_restrict_tokens, addr_lookup_tokens)
252297

253298

254299
def get_name_ranking(self, trange: TokenRange,

nominatim/api/search/db_search_fields.py

+6-5
Original file line number | Diff line number | Diff line change
@@ -231,16 +231,17 @@ def lookup_by_names(name_tokens: List[int], addr_tokens: List[int]) -> List[Fiel
231231
return lookup
232232

233233

234-
def lookup_by_any_name(name_tokens: List[int], addr_restrict_tokens: List[int],
                       addr_lookup_tokens: List[int]) -> List[FieldLookup]:
    """ Create a lookup list where name tokens are looked up via index
        and only one of the name tokens must be present.
        Potential address tokens are used to restrict the search further:
        'addr_restrict_tokens' are added as a lookups.Restrict term and
        'addr_lookup_tokens' as a lookups.LookupAll term, each only when
        the respective list is not empty.
    """
    result = [FieldLookup('name_vector', name_tokens, lookups.LookupAny)]

    # The restricting term goes first, the lookup term second.
    for tokens, lookup_type in ((addr_restrict_tokens, lookups.Restrict),
                                (addr_lookup_tokens, lookups.LookupAll)):
        if tokens:
            result.append(FieldLookup('nameaddress_vector', tokens, lookup_type))

    return result
246247

0 commit comments

Comments
 (0)