diff --git a/conflator/conflator.py b/conflator/conflator.py
index 8647b7c..46b0a8c 100755
--- a/conflator/conflator.py
+++ b/conflator/conflator.py
@@ -1,6 +1,6 @@
 #!/usr/bin/python3
 
-# Copyright (c) 2021, 2022, 2023 Humanitarian OpenStreetMap Team
+# Copyright (c) 2021, 2022, 2023, 2024 Humanitarian OpenStreetMap Team
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as
@@ -21,23 +21,29 @@ import os
 from sys import argv
 from osm_fieldwork.osmfile import OsmFile
-from geojson import Point, Feature, FeatureCollection, dump, Polygon
+from geojson import Point, Feature, FeatureCollection, dump, Polygon, load
 import geojson
-import psycopg2
 from shapely.geometry import shape, Polygon, mapping
 import shapely
 from shapely import wkt
-import xmltodict
 from progress.bar import Bar, PixelBar
 from progress.spinner import PixelSpinner
 from osm_fieldwork.convert import escape
+from osm_fieldwork.parsers import ODKParsers
 from osm_rawdata.postgres import PostgresClient, uriParser
+from geosupport import GeoSupport
+# from conflator.geosupport import GeoSupport
 from codetiming import Timer
 import concurrent.futures
 from cpuinfo import get_cpu_info
 from time import sleep
 from haversine import haversine, Unit
 from thefuzz import fuzz, process
+from pathlib import Path
+from spellchecker import SpellChecker
+# from deepdiff import DeepDiff
 
 # Instantiate logger
 log = logging.getLogger(__name__)
@@ -47,90 +53,64 @@
 cores = info['count']
 
+# The sort key function, returns the 'dist' value of an entry:
+def distSort(data: dict):
+    """
+    Args:
+        data (dict): The entry to sort by distance
+
+    Returns:
+        The 'dist' value, used as the sort key
+    """
+    return data['dist']
+
 class Conflator(object):
     def __init__(self,
-                 source: str,
+                 uri: str,
                  boundary: str = None,
                  ):
         """
         Initialize Input data source
 
         Args:
-            source (str): The source URI
+            uri (str): URI for the primary database
             boundary: str = None
 
         Returns:
-            (OdkMerge): An instance of this object
+            (Conflator): An instance of this object
         """
         self.postgres = list()
-        self.source = source
         self.tags = dict()
+        self.boundary = None
         # Distance in meters for conflating with postgis
         self.tolerance = 7
         self.data = dict()
-        self.analyze = ("name", "amenity", "landuse", "cuisine", "tourism", "leisure")
-        # PG: is the same prefix as ogr2ogr
-        # "[user[:password]@][netloc][:port][/dbname]"
-        if source[0:3] == "PG:":
-            uri = uriParser(source[3:])
-            # self.source = "underpass" is not support yet
-            # Each thread needs it's own connection to postgres to avoid problems.
-            for thread in range(0, cores + 1):
-                db = PostgresClient(dbhost=uri['dbhost'], dbname=uri['dbname'], dbuser=uri['dbuser'], dbpass=uri['dbpass'])
-                self.postgres.append(db)
-            if boundary:
-                self.clip(boundary, db)
-        else:
-            log.info("Opening data file: %s" % source)
-            src = open(source, "r")
-            self.data = geojson.load(src)
+        self.analyze = ("building", "name", "amenity", "landuse", "cuisine", "tourism", "leisure")
+        # uri = uriParser(source[3:])
+        # self.source = "underpass" is not supported yet
+        # Each thread needs its own connection to postgres to avoid problems.
+        for thread in range(0, cores + 1):
+            db = GeoSupport(uri)
+            self.postgres.append(db)
         if boundary:
-            self.clip(boundary)
-
-    def clip(self,
-             boundary: Polygon,
-             db: PostgresClient,
-             ):
-        """
-        Clip a data source by a boundary
-
-        Args:
-            boundary (str): The filespec of the project AOI
-            db (PostgresClient): A reference to the existing database connection
-
-        Returns:
-            (bool): If the region was clipped sucessfully
-        """
-        remove = list()
-        if not boundary:
-            return False
-
-        if 'features' in boundary:
-            poly = geom['features'][0]['geometry']
-        else:
-            poly = geom["geometry"]
-        ewkt = shape(poly)
-
-        if len(self.data) > 0:
-            # TODO: FMTM produces data extracts the exact size of the boundary
-            # polygon, so we don't need to clip it. In the future though we
-            # want this to produce a subset from a larger file.
-            for feature in self.data['features']:
-                entry = shapely.from_geojson(str(feature))
-                # if not shapely.contains(ewkt, entry):
-                #     log.debug(f"CONTAINS {entry}")
-                #     del self.data[self.data['features']]
-                pass
-        else:
-            # setup the postgres VIEWs with a dummy SQL query
-            sql = f"SELECT COUNT(osm_id) FROM nodes"
-            result = db.queryLocal(sql, ewkt)
-        return True
+            self.boundary = boundary
+            db.clipDB(boundary)
+
+        # log.info("Opening data file: %s" % source)
+        # toplevel = Path(source)
+        # if toplevel.suffix == ".geojson":
+        #     src = open(source, "r")
+        #     self.data = geojson.load(src)
+        # elif toplevel.suffix == ".osm":
+        #     src = open(source, "r")
+        #     osmin = OsmFile()
+        #     self.data = osmin.loadFile(source) # input file
+        #     if boundary:
+        #         gs = GeoSupport(source)
+        #         # self.data = gs.clipFile(self.data)
 
     def makeNewFeature(self,
                        attrs: dict = None,
                        tags: dict = None,
-                       ):
+                       ) -> dict:
         """
         Create a new feature with optional data
 
@@ -152,227 +132,262 @@ def makeNewFeature(self,
             newf['tags'] = dict()
         return newf
 
-    def conflateFile(self,
-                     feature: dict,
-                     ):
+    def checkTags(self,
+                  feature: Feature,
+                  osm: dict,
+                  ):
         """
-        Conflate a POI against all the features in a GeoJson file
+        Check tags between 2 features.
 
         Args:
-            feature (dict): The feature to conflate
+            feature (Feature): The feature from the external dataset
+            osm (dict): The result of the SQL query
 
         Returns:
-            (dict): The modified feature
+            (int): The number of tag matches
+            (dict): The updated tags
         """
-        # Most smartphone GPS are 5-10m off most of the time, plus sometimes
-        # we're standing in front of an amenity and recording that location
-        # instead of in the building.
-        gps_accuracy = 10
-        # this is the treshold for fuzzy string matching
+        tags = osm['tags']
+        hits = 0
         match_threshold = 80
-        # log.debug(f"conflateFile({feature})")
-        hits = False
-        data = dict()
-        geom = Point((float(feature["attrs"]["lon"]), float(feature["attrs"]["lat"])))
-        wkt = shape(geom)
-        for existing in self.data['features']:
-            id = int(existing['properties']['id'])
-            entry = shapely.from_geojson(str(existing))
-            if entry.geom_type != 'Point':
-                center = shapely.centroid(entry)
-            else:
-                center = entry
-            # dist = shapely.hausdorff_distance(center, wkt)
-            # if 'name' in existing['properties']:
-            #     print(f"DIST1: {dist}, {existing['properties']['name']}")
-            # x = shapely.distance(wkt, entry)
-            # haversine reverses the order of lat & lon from what shapely uses. We
-            # use this as meters is easier to deal with than cartesian coordinates.
-            x1 = (center.coords[0][1], center.coords[0][0])
-            x2 = (wkt.coords[0][1], wkt.coords[0][0])
-            dist = haversine(x1, x2, unit=Unit.METERS)
-            if dist < gps_accuracy:
-                # if 'name' in existing['properties']:
-                #     log.debug(f"DIST2: {dist}")
-                # log.debug(f"Got a Hit! {feature['tags']['name']}")
-                for key,value in feature['tags'].items():
-                    if key in self.analyze:
-                        if key in existing['properties']:
-                            result = fuzz.ratio(value, existing['properties'][key])
-                            if result > match_threshold:
-                                # log.debug(f"Matched: {result}: {feature['tags']['name']}")
-                                existing['properties']['fixme'] = "Probably a duplicate!"
-                                log.debug(f"Got a dup in file!!! {existing['properties']['name'] }")
-                                hits = True
-                                break
-        if hits:
-            version = int(existing['properties']['version'])
-            # coords = feature['geometry']['coordinates']
-            # lat = coords[1]
-            # lon = coords[0]
-            attrs = {'id': id, 'version': version, 'lat': feature['attrs']['lat'], 'lon': feature['attrs']['lon']}
-            tags = existing['properties']
-            tags['fixme'] = "Probably a duplicate!"
-            # Data extracts for ODK Collect
-            del tags['title']
-            del tags['label']
-            if 'building' in tags:
-                return {'attrs': attrs, 'tags': tags, 'refs': list()}
-            return {'attrs': attrs, 'tags': tags}
-        return dict()
-
-    def conflateWay(self,
-                    feature: dict,
-                    dbindex: int,
+        if osm['tags']['dist'] > float(self.tolerance):
+            return 0, osm['tags']
+        for key, value in feature['tags'].items():
+            if key in tags:
+                ratio = fuzz.ratio(value, tags[key])
+                if ratio > match_threshold:
+                    hits += 1
+            else:
+                if key != 'note':
+                    tags[f'old_{key}'] = value
+                tags[key] = value
+
+        return hits, tags
+
+    def conflateFiles(self,
+                      odkspec: str,
+                      osmspec: str,
+                      threshold: int = 10,
                     ):
         """
-        Conflate a POI against all the ways in a postgres view
+        Open the two source files and conflate them.
 
         Args:
-            feature (dict): The feature to conflate
-            dbindex (int): An index into the array of postgres connections
+            odkspec (str): The file of ODK features to conflate
+            osmspec (str): The file with the existing OSM data
+            threshold (int): Threshold for distance calculations
 
         Returns:
-            (dict): The modified feature
+            (list): The conflated output
         """
-        # log.debug(f"conflateWay({feature})")
-        hits = False
-        result = list()
-        geom = Point((float(feature["attrs"]["lon"]), float(feature["attrs"]["lat"])))
-        wkt = shape(geom)
-        for key, value in feature['tags'].items():
-            if key in self.analyze:
-                # Sometimes the duplicate is a polygon, really common for parking lots.
-                cleanval = escape(value)
-                query = f"SELECT osm_id,tags,version,ST_AsText(ST_Centroid(geom)) FROM ways_view WHERE ST_Distance(geom::geography, ST_GeogFromText(\'SRID=4326;{wkt.wkt}\')) < {self.tolerance} AND levenshtein(tags->>'{key}', '{cleanval}') <= 1"
-                # log.debug(query)
-                self.postgres[dbindex].dbcursor.execute(query)
-                try:
-                    result = self.postgres[dbindex].dbcursor.fetchall()
-                except:
-                    result = list()
-                    # log.warning(f"No results at all for {query}")
-                if len(result) > 0:
-                    hits = True
-                    break
-        if hits:
-            log.debug(f"Got a dup in ways!!! {feature['tags']['name']}")
-            # the result is a list from what we specify for SELECT
-            version = int(result[0][2]) + 1
-            attrs = {'id': int(result[0][0]), 'version': version}
-            tags = result[0][1]
-            tags[f'old_{key}'] = value
-            tags['fixme'] = "Probably a duplicate!"
-            geom = mapping(shapely.from_wkt(result[0][3]))
-            refs = list()
-            # FIXME: iterate through the points and find the existing nodes,
-            # which I'm not sure
-            # is possible
-            # SELECT osm_id,tags,version FROM nodes WHERE ST_Contains(geom, ST_GeomFromText('Point(-105.9918636 38.5360821)'));
-            # for i in geom['coordinates'][0]:
-            #     print(f"XXXXX: {i}")
-            return {'attrs': attrs, 'tags': tags, 'refs': refs}
-        return dict()
-
-    def conflateNode(self,
-                     feature: dict,
-                     dbindex: int,
-                     ):
+        odkdata = list()
+        osmdata = list()
+
+        # The collected data from ODK
+        odkpath = Path(odkspec)
+        if odkpath.suffix == '.geojson':
+            log.debug(f"Parsing GeoJson files {odkspec}")
+            odkfile = open(odkspec, 'r')
+            features = geojson.load(odkfile)
+            odkdata = features['features']
+        elif odkpath.suffix == '.osm':
+            log.debug(f"Parsing OSM XML files {odkspec}")
+            osmfile = OsmFile()
+            odkdata = osmfile.loadFile(odkspec)
+        elif odkpath.suffix == ".csv":
+            log.debug(f"Parsing csv files {odkspec}")
+            odk = ODKParsers()
+            for entry in odk.CSVparser(odkspec):
+                odkdata.append(odk.createEntry(entry))
+        elif odkpath.suffix == ".json":
+            log.debug(f"Parsing json files {odkspec}")
+            odk = ODKParsers()
+            for entry in odk.JSONparser(odkspec):
+                odkdata.append(odk.createEntry(entry))
+
+        # The data extract from OSM
+        osmpath = Path(osmspec)
+        if osmpath.suffix == '.geojson':
+            osmfile = open(osmspec, 'r')
+            features = geojson.load(osmfile)
+            osmdata = features['features']
+        elif osmpath.suffix == '.osm':
+            osmfile = OsmFile()
+            osmdata = osmfile.loadFile(osmspec)
+
+        return self.conflateFeatures(odkdata, osmdata, threshold)
+
+    def conflateFeatures(self,
+                         odkdata: list,
+                         osmdata: list,
+                         threshold: int = 1,
+                         spellcheck: bool = True,
+                         ):
         """
-        Conflate a POI against all the nodes in the view
+        Conflate features from ODK against all the features in OSM.
 
         Args:
-            feature (dict): The feature to conflate
-            dbindex (int): An index into the array of postgres connections
+            odkdata (list): The features from ODK to conflate
+            osmdata (list): The existing OSM data
+            threshold (int): Threshold for distance calculations
+            spellcheck (bool): Whether to also spell check string values
 
         Returns:
-            (dict): The modified feature
-        """
-        # log.debug(f"conflateNode({feature})")
-        hits = False
-        geom = Point((float(feature["attrs"]["lon"]), float(feature["attrs"]["lat"])))
-        wkt = shape(geom)
-        result = list()
-        ratio = 1
-        for key,value in feature['tags'].items():
-            if key in self.analyze:
-                # print("%s = %s" % (key, value))
-                # Use a Geography data type to get the answer in meters, which
-                # is easier to deal with than degress of the earth.
-                cleanval = escape(value)
-                query = f"SELECT osm_id,tags,version,ST_AsEWKT(geom) FROM nodes_view WHERE ST_Distance(geom::geography, ST_GeogFromText(\'SRID=4326;{wkt.wkt}\')) < {self.tolerance} AND levenshtein(tags->>'{key}', '{cleanval}') <= {ratio}"
-                # print(query)
-                # FIXME: this currently only works with a local database, not underpass yet
-                self.postgres[dbindex].dbcursor.execute(query)
-                try:
-                    result = self.postgres[dbindex].dbcursor.fetchall()
-                except:
-                    result = list()
-                    # log.warning(f"No results at all for {query}")
-                if len(result) > 0:
-                    hits = True
-                    break
-        if hits:
-            log.debug(f"Got a dup in nodes!!! {feature['tags']}")
{feature['tags']}") - version = int(result[0][2]) + 1 - coords = shapely.from_wkt(result[0][3][10:]) - lat = coords.y - lon = coords.x - attrs = {'id': int(result[0][0]), 'version': version, 'lat': lat, 'lon': lon} - tags = result[0][1] - tags[f'old_{key}'] = value - tags['fixme'] = "Probably a duplicate!" - return {'attrs': attrs, 'tags': tags} - return dict() - - def conflateById(self, - feature: dict, - dbindex: int, - ): + (list): The conflated output """ - Conflate a feature with existing ways using the OSM ID + timer = Timer(text="conflateFeatures() took {seconds:.0f}s") + timer.start() - Args: - feature (dict): The feature to conflate - dbindex (int): An index into the array of postgres connections + # ODK data is always a single node when mapping buildings, but the + # OSM data will be a mix of nodes and ways. For the OSM data, the + # building centroid is used. - Returns: - (dict): The modified feature - """ - log.debug(f"conflateById({feature})") - id = int(feature['attrs']['id']) - if id > 0: - if self.source[:3] != "PG:": - sql = f"SELECT osm_id,tags,version,ST_AsText(geom) FROM ways_view WHERE tags->>'id'='{id}'" - # log.debug(sql) - self.postgres[0].dbcursor.execute(sql) - result = self.postgres[0].dbcursor.fetchone() - if result: - version = int(result[0][2]) + 1 - attrs = {'id': int(result[0][0]), 'version': version} - tags = result[0][1] - # tags[f'old_{key}'] = value - tags['fixme'] = "Probably a duplicate!" - geom = mapping(shapely.from_wkt(result[0][3])) - return {'attrs': attrs, 'tags': tags} + # Most smartphone GPS are 5-10m off most of the time, plus sometimes + # we're standing in front of an amenity and recording that location + # instead of in the building. + # gps_accuracy = 10 + # this is the treshold for fuzzy string matching + match_threshold = 80 + data = list() + # New features not in OSM always use negative IDs + odkid = -100 + osmid = 0 + nodes = dict() + version = 0 + for entry in odkdata: + confidence = 0 + maybe = list() + odktags = dict() + osmtags = dict() + feature = dict() + newtags = dict() + geom = None + if 'attrs' in entry: + # The source came from an OSM XML file + geom = Point((float(entry["attrs"]["lon"]), float(entry["attrs"]["lat"]))) + odktags = entry['tags'] + elif 'coordinates' in entry: + # The source came from a GeoJson file + gps = entry['coordinates'] + geom = Point(float(gps[0]), float(gps[1])) + odktags = entry['properties'] + wkt = shape(geom) + for existing in osmdata: + # We could probably do this using GeoPandas or gdal, but that's + # going to do the same brute force thing anyway. + if 'geometry' in existing: + geom = existing['geometry'] + osmwkt = shape(geom) + if osmwkt.geom_type != 'Point': + center = shapely.centroid(osmwkt) else: - sql = f"SELECT osm_id,tags,version,ST_AsText(geom) FROM ways_view WHERE tags->>'id'='{id}'" - # log.debug(sql) - self.postgres[dbindex].dbcursor.execute(sql) - result = self.postgres[dbindex].dbcursor.fetchone() - if result: - version = int(result[0][2]) + 1 - attrs = {'id': int(result[0][0]), 'version': version} - tags = result[0][1] - # tags[f'old_{key}'] = value - tags['fixme'] = "Probably a duplicate!" 
-                    geom = mapping(shapely.from_wkt(result[0][3]))
-                    return {'attrs': attrs, 'tags': tags, 'refs': refs}
-        else:
-            for key, value in self.data.items():
-                if key == id:
-                    return value
-        return dict()
+                    center = shape(osmwkt)
+                # dist = shapely.hausdorff_distance(center, wkt)
+                dist = wkt.distance(center)
+                if dist < threshold:
+                    # cache all OSM features within our threshold distance
+                    # These are needed by ODK, but duplicates of other fields,
+                    # so they aren't needed and just add more clutter.
+                    maybe.append({"dist": dist, "odk": entry, "osm": existing})
+
+            # Compare tags for everything that got cached
+            hits = 0
+            if len(maybe) > 0:
+                # cache the refs to use in the OSM XML output file
+                refs = list()
+                odk = dict()
+                osm = dict()
+                # After sorting, the first entry is the closest feature
+                maybe.sort(key=distSort)
+                # make consistent data structures from different input formats
+                if 'properties' in maybe[0]["odk"]:
+                    odk['tags'] = maybe[0]["odk"]['properties']
+                    # GeoJson coordinates are in lon, lat order
+                    gps = maybe[0]["odk"]['coordinates']
+                    odk['attrs'] = {'id': odkid, 'lat': gps[1], 'lon': gps[0]}
+                    # These are added by ODK Collect, and not relevant for OSM
+                    if 'title' in odk['tags']:
+                        del odk['tags']['title']
+                    if 'label' in odk['tags']:
+                        del odk['tags']['label']
+                elif 'attrs' in maybe[0]["odk"]:
+                    odk['tags'] = maybe[0]["odk"]['tags']
+                    odk['attrs'] = maybe[0]["odk"]['attrs']
+
+                if 'properties' in maybe[0]["osm"]:
+                    osm['tags'] = maybe[0]["osm"]['properties']
+                    if 'title' in osm['tags']:
+                        del osm['tags']['title']
+                    if 'label' in osm['tags']:
+                        del osm['tags']['label']
+                    gps = maybe[0]['osm']['geometry']['coordinates']
+                    osm['attrs'] = {'id': osm['tags']['id'], 'lat': gps[1], 'lon': gps[0]}
+                elif 'attrs' in maybe[0]["osm"]:
+                    osm['tags'] = maybe[0]["osm"]['tags']
+                    osm['attrs'] = maybe[0]["osm"]['attrs']
+                    version = int(osm['attrs']['version']) + 1
+                    if 'refs' in maybe[0]['osm']:
+                        refs = eval(maybe[0]['osm']['refs'])
+                    nodes[osm['attrs']['id']] = osm
+
+                for key, value in odk['tags'].items():
+                    if key[:4] == "name":
+                        # log.debug(f"Comparing: {value} == {osm['tags'].get(key)}")
+                        if key in osm['tags']:
+                            result = fuzz.ratio(value, osm['tags'][key])
+                            if result > match_threshold:
+                                log.debug(f"Matched: {result}: {key} = {value}")
+                                log.debug(f"Got a dup in file!!! {odktags}")
+                                hits += 1
+                                confidence = result
+                                # FIXME: if 100%, perfect match, less than
+                                # that probably contains a spelling mistake.
+                    else:
+                        if odk['tags'] == osm['tags']:
+                            # this would be an exact match in tags between odk and osm.
+                            # unlikely though.
+                            hits += 1
+                        else:
+                            # diff = DeepDiff(osm['tags'], odk['tags'])
+                            # see if the ODK key exists in the OSM tags
+                            if key in osm['tags']:
+                                hits += 1
+
+            if hits > 0:
+                # log.debug(f"HITS: {hits}")
+                # If there have been hits, it's probably a duplicate
+                attrs = {"id": osm['attrs']["id"], "version": version, 'lat': osm['attrs']['lat'], 'lon': osm['attrs']['lon']}
+                # Merge the two sets of tags, the ODK values win on conflict
+                newtags = osm['tags'] | odktags
+                # These are added by ODK Collect, and not relevant for OSM
+                # del newtags['id']
+                if "refs" in newtags:
+                    del newtags['refs']
+                newtags['fixme'] = "Probably a duplicate!"
+                newtags['confidence'] = hits
+                if len(refs) == 0:
+                    feature = {"attrs": attrs, "version": version, "tags": newtags}
+                else:
+                    feature = {"attrs": attrs, "version": version, "refs": refs, "tags": newtags}
+
+            if hits == 0:
+                # If no hits, it's new data. ODK data is always just a POI for now
+                feature["attrs"] = {"id": odkid, "lat": entry["attrs"]["lat"], "lon": entry["attrs"]["lon"], "version": version, "timestamp": entry["attrs"]["timestamp"]}
+                feature["tags"] = odktags
+                # print(f"{odkid}: {odktags}")
+                odkid -= 1
+            data.append(feature)
+
+        timer.stop()
+        return data
 
     def cleanFeature(self,
                      feature: dict,
@@ -405,27 +420,41 @@ def dump(self):
         # print(f"{k}(v{self.versions[k]}) = {v}")
 
     def conflateData(self,
-                     odkdata: list,
-                     ):
+                     source: str,
+                     ) -> dict:
         """
         Conflate all the data. This the primary interfacte for conflation.
 
         Args:
-            odkdata (list): A list of all the entries in the OSM XML input file
+            source (str): The source file to conflate
 
         Returns:
-            (dict): The modified features
+            (dict): The conflated features
         """
         timer = Timer(text="conflateData() took {seconds:.0f}s")
         timer.start()
+
+        log.info("Opening data file: %s" % source)
+        toplevel = Path(source)
+        if toplevel.suffix == ".geojson":
+            src = open(source, "r")
+            self.data = geojson.load(src)
+        elif toplevel.suffix == ".osm":
+            src = open(source, "r")
+            osmin = OsmFile()
+            self.data = osmin.loadFile(source) # input file
+            if self.boundary:
+                gs = GeoSupport(source)
+                # self.data = gs.clipFile(self.data)
+
         # Use fuzzy string matching to handle minor issues in the name column,
         # which is often used to match an amenity.
         if len(self.data) == 0:
-            self.postgres[0].dbcursor.execute("CREATE EXTENSION IF NOT EXISTS fuzzystrmatch")
+            self.postgres[0].query("CREATE EXTENSION IF NOT EXISTS fuzzystrmatch")
 
-        log.debug(f"OdkMerge::conflateData() called! {len(odkdata)} features")
+        # log.debug(f"OdkMerge::conflateData() called! {len(odkdata)} features")
 
         # A chunk is a group of threads
-        chunk = round(len(odkdata) / cores)
+        chunk = round(len(self.data) / cores)
 
         # cycle = range(0, len(odkdata), chunk)
 
@@ -434,11 +463,15 @@ def conflateData(self,
         future = None
         result = None
         index = 0
+        if True: # DEBUGGING HACK ALERT!
+            result = conflateThread(self.data, self, index)
+            return dict()
+
         with concurrent.futures.ThreadPoolExecutor(max_workers=cores) as executor:
             i = 0
             subset = dict()
             futures = list()
-            for key, value in odkdata.items():
+            for key, value in self.data.items():
                 subset[key] = value
                 if i == chunk:
                     i = 0
@@ -457,16 +490,27 @@ def conflateData(self,
         return newdata
         # return alldata
 
-    def outputOSM(self,
-                  data: FeatureCollection,
-                  ):
-        """
-        Output in OSM XML format
+    def writeOSM(self,
+                 data: dict,
+                 filespec: str,
+                 ):
+        """
+        Write the conflated output to an OSM XML file.
 
-        Args:
-            data (FeatureCollection): The data to convert
-        """
-        pass
+        Args:
+            data (dict): The conflated output
+            filespec (str): The file to write
+        """
+        osm = OsmFile(filespec)
+        for entry in data:
+            out = str()
+            # Entries that reference existing nodes are ways, the rest are POIs
+            if 'refs' in entry and len(entry['refs']) > 0:
+                out = osm.createWay(entry, True)
+            else:
+                out = osm.createNode(entry, True)
+            if len(out) > 0:
+                osm.write(out)
 
+    def writeGeoJson(self,
+                 data: dict,
+                 filespec: str,
+                 ):
+        """
+        Write the conflated output to a GeoJson file. A minimal sketch:
+        each entry becomes a Point, with the tags as the properties.
+        """
+        features = list()
+        for entry in data:
+            if 'lat' in entry['attrs']:
+                geom = Point((float(entry['attrs']['lon']), float(entry['attrs']['lat'])))
+                features.append(Feature(geometry=geom, properties=entry['tags']))
+        dump(FeatureCollection(features), open(filespec, 'w'))
 
 def conflateThread(features: dict,
                    source: str,
@@ -479,8 +523,8 @@ def conflateThread(features: dict,
         feature (dict): The feature to conflate
         source (str): The data source for conflation, file or database
         dbindex (int): An index into the array of postgres connections
 
     Returns:
+        (list): the conflated data output
     """
     timer = Timer(text="conflateThread() took {seconds:.0f}s")
     timer.start()
@@ -504,12 +548,9 @@ def conflateThread(features: dict,
             # using geopoint in the XLSForm.
             result = source.conflateById(value, dbindex)
         elif id < 0:
-            if source.source[:3] != "PG:":
-                result = source.conflateFile(value)
-            else:
-                result = source.conflateNode(value, dbindex)
-                if len(result) == 0:
-                    result = source.conflateWay(value, dbindex)
+            result = source.conflateNode(value, dbindex)
+            if len(result) == 0:
+                result = source.conflateWay(value, dbindex)
         if result and len(result) > 0:
             # Merge the tags and attributes together, the OSM data and ODK data.
             # If no match is found, the ODK data is used to create a new feature.
@@ -534,25 +575,31 @@ def main():
     parser = argparse.ArgumentParser(
         prog="conflator",
        formatter_class=argparse.RawDescriptionHelpFormatter,
-        description="This program conflates external data with existing features from OSM.",
+        description="This program conflates external data with existing features in OSM.",
        epilog="""
-    This program conflates external datasets with OSM data using a postgresql database.
+    This program conflates external datasets with OSM data. It can use a postgres
+database, or GeoJson and OSM XML files as the input sources.
+
+    Examples:
+        To conflate two files:
+            ./conflator.py -v -s camping-2024_06_14.osm -e extract.geojson
+
+        To conflate a file using postgres:
+            ./conflator.py -v -s camping-2024_06_14.geojson -u localhost/usa -b utah.geojson
+        """,
     )
     parser.add_argument("-v", "--verbose", action="store_true", help="verbose output")
+    parser.add_argument("-u", "--uri", help="OSM Database URI")
+    parser.add_argument("-e", "--extract", help="The OSM data extract")
+    parser.add_argument("-s", "--source", required=True, help="The ODK data to conflate")
+    parser.add_argument("-t", "--threshold", default=1, help="Threshold for distance calculations")
     parser.add_argument("-o", "--outfile", help="Output file from the conflation")
-    parser.add_argument("-i", "--infile", help="GeoJson or OSM XML file to conflate")
-    parser.add_argument("-b", "--boundary", help="Boundary polygon to limit the data size")
+    parser.add_argument("-b", "--boundary", help="Optional boundary polygon to limit the data size")
 
-    args, unknown = parser.parse_known_args()
-    osmdata = None
+    args = parser.parse_args()
+    indata = None
     source = None
-    if len(unknown) < 2:
-        parser.print_help()
-        quit()
-    else:
-        osmdata = unknown[0]
-        source = unknown[1]
 
     # if verbose, dump to the terminal.
     if args.verbose:
@@ -565,50 +612,30 @@ def main():
         ch.setFormatter(formatter)
         log.addHandler(ch)
 
+    if not args.extract and not args.uri:
+        parser.print_help()
+        log.error("You must supply a database URI or a data extract file!")
+        quit()
+
+    toplevel = Path(args.source)
     if args.outfile:
         outfile = args.outfile
     else:
-        outfile = os.path.basename(osmdata.replace('.osm', '-foo.osm'))
+        outfile = f"{toplevel.stem}-out.osm"
 
-    # This is the existing OSM data, a database or a file
-    if args.boundary:
-        extract = OdkMerge(source, args.boundary)
-    else:
-        extract = OdkMerge(source)
+    conflate = Conflator(args.uri)
 
-    if extract:
-        odkf = OsmFile(outfile) # output file
-        osm = odkf.loadFile(osmdata) # input file
-        #odkf.dump()
-    else:
-        log.error("No ODK data source specified!")
-        parser.print_help()
-        quit()
+    if args.extract is not None and len(args.extract) > 0:
+        data = conflate.conflateFiles(args.source, args.extract, int(args.threshold))
+    else:
+        # FIXME: conflating directly against the database isn't wired up here yet
+        log.error("Database conflation is not implemented here yet!")
+        quit()
 
-    # This returns a list of lists of dictionaries. Each thread returns
-    # a list of the features, and len(data) is thre number of CPU cores.
-    data = extract.conflateData(osm)
-    out = list()
-    #print(data)
-    for entry in data:
-        # if 'refs' in feature or 'building' in feature['tags']:
-        for feature in entry:
-            if 'refs' in feature:
-                feature['refs'] = list()
-                out.append(odkf.createWay(feature, True))
-            else:
-                out.append(odkf.createNode(feature, True))
-
-    # out = ""
-    # for id, feature in osm.items():
-    #     result = extract.conflateFile(feature)
-    #     if len(result) > 0:
-    #         node = odkf.featureToNode(result)
-    #     else:
-    #         node = feature
-    #     out += odkf.createNode(node, True)
-    odkf.write(out)
-    log.info(f"Wrote {outfile}")
+    jsonout = f"{toplevel.stem}-out.geojson"
+    osmout = outfile
+
+    conflate.writeOSM(data, osmout)
+    conflate.writeGeoJson(data, jsonout)
+
+    log.info(f"Wrote {osmout}")
+    log.info(f"Wrote {jsonout}")
 
 if __name__ == "__main__":
     """This is just a hook so this file can be run standlone during development."""
diff --git a/docs/odkconflation.md b/docs/odkconflation.md
new file mode 100644
index 0000000..96df751
--- /dev/null
+++ b/docs/odkconflation.md
@@ -0,0 +1,236 @@
+# Conflating OpenDataKit with OpenStreetMap
+
+Typically conflation is done when doing data imports, but not always.
+Data collected in the field can be considered an import. Conflating
+buildings or POIs from external data is relatively easy, as it has
+already been cleaned up and validated. When you are doing field
+mapping, you have to clean up and validate the data during
+conflation. This is a time consuming process, even with good
+conflation software.
+
+I've worked with multiple conflation programs over the years:
+[Hootenanny](https://github.com/ngageoint/hootenanny),
+[OpenJump](http://www.openjump.org/) (later forked into RoadMatcher),
+etc., which are now dead projects. Conflation is a hard technical
+challenge, and often the results are poor and unsatisfying. For
+smaller datasets it's often easier to do manual conflation using
+[JOSM](https://josm.openstreetmap.de/) or
+[Qgis](https://qgis.org/en/site/). This project tries to simplify the
+problem by focusing on OpenStreetMap data.
+
+## Smartphone Data Collection
+
+While commercial organizations may use expensive GPS devices, most of
+us who collect data as volunteers or for an NGO use our smartphones.
+There is a variety of smartphone apps for data collection that fall
+into two categories. The first category are apps like
+[Vespucci](http://vespucci.io/),
+[StreetComplete](https://streetcomplete.app/), and [Organic
+Maps](https://organicmaps.app/), which upload directly to
+[OpenStreetMap](https://www.openstreetmap.org). These are great for
+the casual mapper who only adds data occasionally, and is limited to a
+POI. For example, a casual mapper may want to add the restaurant they
+are currently eating in when they notice it's not in
+OpenStreetMap. In addition, they probably have a cell phone
+connection, so the data gets added right away.
+
+The other category are apps like [ODK Collect](https://getodk.org/),
+[QField](https://qfield.org/), and [ArcGIS Field
+Maps](https://www.arcgis.com/apps/fieldmaps/), which are oriented to
+larger scale mapping projects, often offline without any cellular
+connection. These collect a lot of data that then needs to get
+processed later. And conflation is part of this process.
+
+All of these smartphone based data collection apps suffer from poor
+GPS location accuracy. Modern smartphones (2024) are often 5-9 meters
+off the actual location, sometimes worse. In addition, when collecting
+field data you can't always record the actual location you want; you
+can only record where you are standing.
+
+You can improve the location data somewhat if you have a good quality
+basemap, for example when you see a building within a courtyard wall
+while you are standing in the street. If you have a basemap, typically
+satellite imagery, you can touch the location on the basemap and use
+that instead of where you are standing. Then later, conflation has a
+much higher chance of being less painful.
+
+## OpenDataKit
+
+[OpenDataKit](https://opendatakit.org/software/) is a
+format for data import forms used to collect custom data. The source
+file is a spreadsheet, called an
+[XLSForm](https://xlsform.org/en/). This gets used by the mobile app
+for the question and answer process defined by the XLSForm. There are
+multiple apps and projects using XLSForms, so it's well supported and
+maintained.
+
+The XLS source file syntax is a bit weird at first, being a
+spreadsheet, so the osm-fieldwork project contains tested XLSForm
+templates for a variety of mapping project goals. These can be used to
+create efficient XForms that are easy to convert to OSM. The primary
+task when manually converting ODK collected data into OSM format is
+converting the tags. If the XLSForm is created with a focus towards
+OSM, this becomes a much simpler process, which [is
+detailed](https://www.senecass.com/projects/Mapping/tech/ImproveXLSForms.html)
+more in this document. Simply stated, what is in the *name* column in
+the XLSForm becomes the *name* of the tag in OSM, and the response
+from the choices sheet becomes the value.
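+As a sketch of that convention, a submission keyed by the XLSForm
+*name* columns needs no translation table at all when the form
+follows OSM tagging. The values here are hypothetical:
+
+```python
+# A hypothetical submission from ODK Collect, keyed by the XLSForm
+# 'name' columns, with values taken from the choices sheet.
+submission = {"amenity": "restaurant", "cuisine": "pizza"}
+
+# Because the form uses OSM tagging, every entry is already an OSM tag.
+tags = dict(submission)
+print(tags)  # {'amenity': 'restaurant', 'cuisine': 'pizza'}
+```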
+### ODK Collect & Central
+
+[ODK Collect](https://getodk.org/) is a mobile app for data collection
+using XLSForms. Its server side is [ODK
+Central](https://docs.getodk.org/central-intro/), which replaces the
+older [ODK Aggregate](https://docs.getodk.org/aggregate-intro/). ODK
+Central manages the XLSForms downloaded to your phone, as well as the
+submissions uploaded from your phone when back online.
+
+A related project for processing ODK data and working remotely with
+Central is [osm-fieldwork](https://hotosm.github.io/osm-fieldwork/).
+This Python project handles conversion of the various data files from
+Collect or Central into OSM XML and GeoJson for future processing via
+editing or conflation. This is heavily used in the FMTM backend.
+
+## Field Data Collection
+
+Collecting data in the field is the best way to add data to
+OpenStreetMap. Whether done by casual mappers adding POIs or by more
+dedicated mappers, what is reality at that moment is the key to
+keeping OSM fresh and updated. When it comes to improving the metadata
+for buildings, many have been imported with **building=yes** from remote
+mapping using the [HOT Tasking Manager](https://tasks.hotosm.org/) to
+trace buildings from satellite imagery.
+
+But ground-truthing what kind of building it is improves the map. It
+may be a medical clinic, restaurant, residence, etc. Who knows, until
+somebody stands in front of the building to collect more information
+about it? This may be identifying it as a clinic or residence, adding
+the building material, what the roof is made of, whether its power is
+non-existent, or whether there are solar panels or a generator. Some
+humanitarian mapping collects data on public toilets and community
+water sources for future improvements.
+
+Knowing there is a building on the map is useful, but better yet is
+knowing what the building is used for. What is it made of? Does it
+have AC or DC power? Is water available? All of these details improve
+the map to make it more useful to others.
+
+### Field Mapping Tasking Manager
+
+The [Field Mapping Tasking Manager](fmtm.hotosm.org) (FMTM) is a
+project to organize large scale data collection using ODK Collect and
+ODK Central. It uses the osm-fieldwork project for much of the backend
+processing of the ODK data, but is designed for large scale field
+mapping involving many people. It uses ODK Collect and ODK Central as
+the primary tools. One of the final steps in processing ODK data to
+import into OSM is conflating it with existing data. This can be done
+manually of course, but with a large number of data submissions this
+becomes tedious and time consuming. FMTM aggregates all the data for an
+entire project, and may have thousands of submissions. This is where
+conflation is critical.
+
+# The Algorithm
+
+Currently conflation is focused on ODK with OSM. This uses the
+conflator.py program, which can conflate between the ODK data and an
+OSM data extract. There are other conflation programs in this project
+for other external datasets, but they use a postgres database instead
+of two files.
+
+## The Conflator() Class
+
+This is the primary interface for conflating files. It has two primary
+endpoints. The top level endpoint is **Conflator.conflateFiles()**,
+which is used when the conflator program is run standalone. It opens
+the two disk files, parses the various formats, and generates a data
+structure used for conflation. This class uses the **Parsers()** class
+from osm-fieldwork, which can parse the JSON or CSV files downloaded
+from ODK Central, or the ODK XML "instance" files when working
+offline. OSM XML or GeoJson files are also supported. Each entry in
+the files is turned into a list of Python dicts to make it easier to
+compare the data.
+
+Once the two files are read, the **Conflator.conflateFeatures()**
+endpoint takes the two lists of data and does the actual
+conflation. There is an additional parameter passed to this endpoint,
+which is the threshold distance. This is used to find all features in
+the OSM data extract within that distance. Note that this distance is
+in decimal degrees, the units of the coordinates, not meters, so
+distance calculations are a bit fuzzy.
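+Driving those two endpoints from Python looks like this. A minimal
+sketch, assuming a local postgres URI and hypothetical file names:
+
+```python
+from conflator.conflator import Conflator
+
+# The database URI and the file names here are hypothetical.
+conflate = Conflator("localhost/usa")
+data = conflate.conflateFiles("submissions.csv", "extract.geojson")
+
+# Write both output formats, as the standalone program does.
+conflate.writeOSM(data, "submissions-out.osm")
+conflate.writeGeoJson(data, "submissions-out.geojson")
+```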
+This is a brute force conflation algorithm: not fast, but it tries to
+be complete. It consists of two loops. The top level loops through the
+ODK data. For each ODK data entry, it finds all the OSM features
+within the threshold distance. The inner loop then takes the closest
+feature and compares the tags. This is where things get
+interesting. If there is a *name* tag in the ODK data, it is string
+compared with the name of the closest OSM feature. Fuzzy string
+matching is used to handle minor spelling differences. Sometimes the
+misspelling is in the OSM data, but often when entering names of
+features on your smartphone, mistyping occurs. If there is a 100%
+match in the name tags, then chances are the feature already exists in
+OSM.
+
+If there is no *name* tag in the ODK data, then the other tags are
+compared to try to find a possible duplicate feature. For example, a
+public toilet at a trailhead has no name, but if both ODK and OSM have
+**amenity=toilet**, then it's very likely a duplicate. If no tags
+match, then the ODK data is probably a new feature.
+
+Any time a possible duplicate is found, it is not automatically
+merged. Instead, a **fixme** tag is added to the feature in the output
+file with a statement that it is potentially a duplicate. When the
+output file is loaded into JOSM, you can search for this tag to
+manually decide if it is a duplicate.
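+The core of that comparison can be sketched with the same shapely and
+thefuzz libraries the program uses. The features, coordinates, and the
+distance cutoff here are hypothetical; 80% is the same fuzzy match
+threshold the code uses:
+
+```python
+from shapely.geometry import shape
+from thefuzz import fuzz
+
+odk = {"geometry": {"type": "Point", "coordinates": [-105.99, 38.53]},
+       "properties": {"name": "Kilroy's Kafe"}}
+osm = {"geometry": {"type": "Point", "coordinates": [-105.99, 38.53]},
+       "properties": {"name": "Kilroys Cafe"}}
+
+# Distance in decimal degrees between the ODK POI and the OSM centroid.
+dist = shape(odk["geometry"]).distance(shape(osm["geometry"]).centroid)
+
+# Fuzzy-compare the names; a minor misspelling still scores above 80.
+ratio = fuzz.ratio(odk["properties"]["name"], osm["properties"]["name"])
+if dist < 0.001 and ratio > 80:
+    print(f"Probably a duplicate! ({ratio}% name match)")
+```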
+## XLSForm Design
+
+Part of the key to improving conflation is a carefully created
+XLSForm. There is much more detailed information on
+[XLSForm
+design](https://hotosm.github.io/osm-fieldwork/about/xlsforms/), but
+briefly: whatever is in the *name* column in the *survey* sheet becomes
+the name of the tag, and whatever is in the *name* column in the
+*choices* sheet becomes the value. If you want a relatively smooth
+conflation, make sure your XLSForm uses OSM tagging schemas.
+
+If you don't follow OSM tagging, then conflation will assume all your
+ODK data is a new feature, and you'll have to manually conflate the
+results using JOSM. That's OK for small datasets, but quickly becomes
+very tedious for the larger datasets that FMTM collects.
+
+## The Output File
+
+The output file must be in OSM XML to enable updating the ways. If the
+OSM data is a POI, viewing it in JOSM is easy. If the OSM data is a
+polygon, the polygons won't appear at first when loaded into
+JOSM. Since the OSM way created by conflation has preserved the *refs*
+used by OSM XML to reference the nodes, doing *update modified* in
+JOSM then pulls down the nodes, and all the polygons will appear.
+
+## Conflicts
+
+There are some interesting issues to fix post conflation. ODK data is
+usually a single POI, whereas in OSM it may be a polygon. Sometimes,
+though, the POI is already in OSM. Remote mapping or building footprint
+imports often have only a polygon with a single **building=yes** tag,
+while the POI we collected in ODK has more data, for example that this
+building is a restaurant serving pizza and is made of brick.
+
+In OSM there is sometimes a POI for an amenity as well as a building
+polygon, added at different times by different people. The key detail
+for conflation is whether any of the tags and values from the new data
+match the existing data.
+
+FMTM downloads a data extract from OSM using
+[osm-rawdata](https://hotosm.github.io/osm-rawdata/), and then
+filters the data extract based on what is on the choices
+sheet of the XLSForm. Otherwise Collect won't launch. Because this
+data extract does not contain all the tags that are in OSM, it creates
+conflicts. This problem is FMTM specific, and can be improved by
+making a more complete data extract from OSM.
+
+When the only tag in the OSM data is **building=**, any tags from ODK
+are merged with the building polygon when possible. If the OSM feature
+has other tags, JOSM will flag this as a conflict. Then you have to
+manually merge the tags in JOSM.
diff --git a/mkdocs.yml b/mkdocs.yml
index 8b2e15a..b2db0d4 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -65,7 +65,7 @@ plugins:
 nav:
   - Home: index.md
   - About: about.md
-  - Conflation Guide: conflation.md
+  - Conflation Guide: odkconflation.md
   - Utilities:
       - Conflator: conflator.md
      - Utilities: utilities.md