Skip to content

Commit

Permalink
Fix duplicates algorithm
Browse files Browse the repository at this point in the history
  • Loading branch information
Zverik committed Mar 20, 2018
1 parent 87c6ce1 commit 284f952
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 12 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,13 @@

## master branch

## 1.3.1

_Released 2018-03-20_

* "Similar tags" now means at least 66% instead of 50%.
* Instead of removing all duplicates, conflate them and remove only the unmatched ones.

## 1.3.0

_Released 2018-03-15_
Expand Down
48 changes: 37 additions & 11 deletions conflate/conflate.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import os
import sys
from io import BytesIO
from collections import defaultdict
try:
from .version import __version__
except ImportError:
Expand Down Expand Up @@ -41,6 +42,7 @@ def __init__(self, pid, lat, lon, tags=None, category=None, remarks=None):
self.category = category
self.dist_offset = 0
self.remarks = remarks
self.exclusive_group = None

def distance(self, other):
"""Calculate distance in meters."""
Expand Down Expand Up @@ -774,7 +776,7 @@ def search_nn_fix(kd, point):
self.register_match(dist[0][1], osm_point.id)
osm_kd = osm_kd.remove(osm_point)
del dist[0]
for i in range(len(dist)-1, -1, -1):
for i in reversed(range(len(dist))):
if dist[i][2] == osm_point:
nearest, distance = search_nn_fix(osm_kd, self.dataset[dist[i][1]])
if nearest and distance <= max_distance:
Expand Down Expand Up @@ -811,9 +813,34 @@ def match(self):
if count_created > 0:
logging.info('Created %s audit-overridden dataset points', count_created)

# Prepare exclusive groups dict
exclusive_groups = defaultdict(set)
for p, v in self.dataset.items():
if v.exclusive_group is not None:
exclusive_groups[v.exclusive_group].add(p)

# Then find matches for unmatched dataset points
self.match_dataset_points_smart()

# Remove unmatched duplicates
count_duplicates = 0
for ids in exclusive_groups.values():
found = False
for p in ids:
if p not in self.dataset:
found = True
break
for p in ids:
if p in self.dataset:
if found:
count_duplicates += 1
del self.dataset[p]
else:
# Leave one element when not matched any
found = True
if count_duplicates > 0:
logging.info('Removed %s unmatched duplicates', count_duplicates)

# Add unmatched dataset points
logging.info('Adding %s unmatched dataset points', len(self.dataset))
for k in sorted(list(self.dataset.keys())):
Expand Down Expand Up @@ -1054,31 +1081,30 @@ def check_dataset_for_duplicates(profile, dataset, print_all=False):
diff_tags = [k for k in tags if tags[k] == '---']
kd = kdtree.create(list(dataset))
duplicates = set()
group = 0
for d in dataset:
if d.id in duplicates:
continue
group += 1
for alt, _ in kd.search_knn(d, 3): # The first one will be equal to d
if alt.data.id != d.id and alt.data.distance(d) < max_distance:
dist = alt.data.distance(d)
if alt.data.id != d.id and dist < max_distance:
tags_differ = 0
if alt.data.distance(d) > uncond_distance:
if dist > uncond_distance:
for k in diff_tags:
if alt.data.tags.get(k) != d.tags.get(k):
tags_differ += 1
if tags_differ <= len(diff_tags) / 2:
if tags_differ <= max(1, len(diff_tags) / 3):
duplicates.add(alt.data.id)
d.exclusive_group = group
alt.data.exclusive_group = group
if print_all or len(duplicates) <= 5:
is_duplicate = tags_differ <= 1
logging.error('Dataset points %s: %s and %s',
'duplicate each other' if is_duplicate else 'are too similar',
d.id, alt.data.id)
if duplicates:
remove = profile.get('remove_duplicates', True)
if remove:
for i in reversed(range(len(dataset))):
if dataset[i].id in duplicates:
del dataset[i]
logging.error('%s %s duplicates from the dataset',
'Removed' if remove else 'Found', len(duplicates))
logging.error('Found %s duplicates in the dataset', len(duplicates))
if found_duplicate_ids:
raise KeyError('Cannot continue with duplicate ids')

Expand Down
2 changes: 1 addition & 1 deletion conflate/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '1.3.0'
__version__ = '1.3.1'

0 comments on commit 284f952

Please sign in to comment.