diff --git a/CHANGELOG.md b/CHANGELOG.md
index 58d2af9..9d10b2f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,14 @@
 
 ## master branch
 
+## 1.3.2
+
+_Released 2018-04-19_
+
+* Fixed bug in categories building.
+* Fixed threshold for tags in duplicates check.
+* Now the script prints "Done" when finished, to better measure time.
+
 ## 1.3.1
 
 _Released 2018-03-20_
diff --git a/conflate/conflate.py b/conflate/conflate.py
index ddf7f84..b516eda 100755
--- a/conflate/conflate.py
+++ b/conflate/conflate.py
@@ -247,8 +247,15 @@ def construct_overpass_query(self, bboxes=None):
                 for t in ('node', 'way', 'relation["type"="multipolygon"]'):
                     query += t + tag_str + bbox_str + ';'
         if self.ref is not None:
-            for t in ('node', 'way', 'relation'):
-                query += t + '["' + self.ref + '"];'
+            if not self.profile.get('bounded_update', False):
+                for t in ('node', 'way', 'relation'):
+                    query += t + '["' + self.ref + '"];'
+            else:
+                for bbox in bboxes:
+                    bbox_str = '' if bbox is None else '(' + ','.join(
+                        [str(x) for x in bbox]) + ')'
+                    for t in ('node', 'way', 'relation'):
+                        query += t + '["' + self.ref + '"]' + bbox_str + ';'
         query += '); out meta qt center;'
         return query
 
@@ -368,30 +375,28 @@ def get_categories(self, tags):
         def match_query(tags, query):
             for tag in query:
                 if len(tag) == 1:
-                    if tag[0] in tags:
+                    return tag[0] in tags
+                else:
+                    value = tags.get(tag[0], None)
+                    if tag[1] is None or tag[1] == '':
+                        return value is None
+                    if value is None:
                         return False
-                elif tag[1] is None or tag[1] == '':
-                    if tag[0] not in tags:
-                        return False
-                else:
-                    value = tags.get(tag[0], None)
-                    if value is None:
-                        return False
-                    found = False
-                    for t2 in tag[1:]:
-                        if t2[0] == '~':
-                            m = re.search(t2[1:], value)
-                            if not m:
-                                return False
-                        elif t2[0] == '!':
-                            if t2[1:].lower() in value.lower():
-                                found = True
-                        elif t2 == value:
+                found = False
+                for t2 in tag[1:]:
+                    if t2[0] == '~':
+                        m = re.search(t2[1:], value)
+                        if not m:
+                            return False
+                    elif t2[0] == '!':
+                        if t2[1:].lower() in value.lower():
                             found = True
-                        if found:
-                            break
-                    if not found:
-                        return False
+                    elif t2 == value:
+                        found = True
+                    if found:
+                        break
+                if not found:
+                    return False
             return True
 
         def tags_to_query(tags):
@@ -1094,7 +1099,7 @@ def check_dataset_for_duplicates(profile, dataset, print_all=False):
                 for k in diff_tags:
                     if alt.data.tags.get(k) != d.tags.get(k):
                         tags_differ += 1
-                if tags_differ <= max(1, len(diff_tags) / 3):
+                if tags_differ <= len(diff_tags) / 3:
                     duplicates.add(alt.data.id)
                     d.exclusive_group = group
                     alt.data.exclusive_group = group
@@ -1255,6 +1260,8 @@ def run(profile=None):
         fc = {'type': 'FeatureCollection', 'features': conflator.changes}
         json.dump(fc, options.changes, ensure_ascii=False, sort_keys=True, indent=1)
 
+    logging.info('Done')
+
 
 if __name__ == '__main__':
     run()
diff --git a/conflate/version.py b/conflate/version.py
index 72837bd..e398332 100644
--- a/conflate/version.py
+++ b/conflate/version.py
@@ -1 +1 @@
-__version__ = '1.3.1'
+__version__ = '1.3.2'
diff --git a/profiles/moscow_addr.py b/profiles/moscow_addr.py
new file mode 100644
index 0000000..4f7a7de
--- /dev/null
+++ b/profiles/moscow_addr.py
@@ -0,0 +1,131 @@
+import json
+import logging
+
+source = 'dit.mos.ru'
+no_dataset_id = True
+query = [[('addr:housenumber',)], [('building',)]]
+max_distance = 50
+max_request_boxes = 2
+master_tags = ('addr:housenumber', 'addr:street')
+
+COMPLEX = False
+ADMS = {
+    '1': 'Северо-Западный административный округ',
+    '2': 'Северный административный округ',
+    '3': 'Северо-Восточный административный округ',
+    '4': 'Западный административный округ',
+    '5': 'Центральный административный округ',
+    '6': 'Восточный административный округ',
+    '7': 'Юго-Западный административный округ',
+    '8': 'Южный административный округ',
+    '9': 'Юго-Восточный административный округ',
+    '10': 'Зеленоградский административный округ',
+    '11': 'Троицкий административный округ',
+    '12': 'Новомосковский административный округ',
+}
+ADM = ADMS['2']
+if param:
+    if param[0] == 'c':
+        COMPLEX = True
+        param = param[1:]
+    if param in ADMS:
+        ADM = ADMS[param]
+
+
+def dataset(fileobj):
+    def find_center(geodata):
+        if not geodata:
+            return None
+        if 'center' in geodata:
+            return geodata['center'][0]
+        if 'coordinates' in geodata:
+            typ = geodata['type']
+            lonlat = [0, 0]
+            cnt = 0
+            if typ == 'Polygon':
+                for p in geodata['coordinates'][0]:
+                    lonlat[0] += p[0]
+                    lonlat[1] += p[1]
+                    cnt += 1
+            elif typ == 'LineString':
+                for p in geodata['coordinates']:
+                    lonlat[0] += p[0]
+                    lonlat[1] += p[1]
+                    cnt += 1
+            elif typ == 'Point':
+                p = geodata['coordinates']
+                lonlat[0] += p[0]
+                lonlat[1] += p[1]
+                cnt += 1
+            if cnt > 0:
+                return [lonlat[0]/cnt, lonlat[1]/cnt]
+        return None
+
+    global COMPLEX, ADM
+    import zipfile
+    zf = zipfile.ZipFile(fileobj)
+    data = []
+    no_geodata = 0
+    no_addr = 0
+    count = 0
+    for zname in zf.namelist():
+        source = json.loads(zf.read(zname).decode('cp1251'))
+        for el in source:
+            gid = el['global_id']
+            try:
+                adm_area = el['ADM_AREA']
+                if adm_area != ADM:
+                    continue
+                count += 1
+                lonlat = find_center(el.get('geoData'))
+                if not lonlat:
+                    no_geodata += 1
+                street = el.get('P7')
+                house = el.get('L1_VALUE')
+                htype = el.get('L1_TYPE')
+                corpus = el.get('L2_VALUE')
+                ctype = el.get('L2_TYPE')
+                stroenie = el.get('L3_VALUE')
+                stype = el.get('L3_TYPE')
+                if not street or not house:
+                    no_addr += 1
+                    continue
+                if not lonlat:
+                    continue
+                is_complex = False
+                housenumber = house.replace(' ', '')
+                if htype != 'дом':
+                    is_complex = True
+                    if htype in ('владение', 'домовладение'):
+                        housenumber = 'вл' + housenumber
+                    else:
+                        logging.warn('Unknown house number type: %s', htype)
+                        continue
+                if corpus:
+                    if ctype == 'корпус':
+                        housenumber += ' к{}'.format(corpus)
+                    else:
+                        logging.warn('Unknown corpus type: %s', ctype)
+                        continue
+                if stroenie:
+                    is_complex = True
+                    if stype == 'строение' or stype == 'сооружение':
+                        housenumber += ' с{}'.format(stroenie)
+                    else:
+                        logging.warn('Unknown stroenie type: %s', stype)
+                        continue
+                if is_complex != COMPLEX:
+                    continue
+                tags = {
+                    'addr:street': street,
+                    'addr:housenumber': housenumber,
+                }
+                data.append(SourcePoint(gid, lonlat[1], lonlat[0], tags))
+            except Exception as e:
+                logging.warning('PROFILE: Failed to get attributes for address %s: %s', gid, str(e))
+                logging.warning(json.dumps(el, ensure_ascii=False))
+
+    if no_addr + no_geodata > 0:
+        logging.warning('%.2f%% of data have no centers, and %.2f%% have no streets or house numbers',
+                        100*no_geodata/count, 100*no_addr/count)
+    return data