From 4fbb9c3ba52471b69ba1487f5200ee4434c3d87a Mon Sep 17 00:00:00 2001 From: Rob Savoye Date: Mon, 19 Aug 2024 13:31:28 -0600 Subject: [PATCH 1/5] fix: Add program to clean up the name and reference tags in OSM data to fit the standard --- utilities/fixnames.py | 272 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 272 insertions(+) create mode 100755 utilities/fixnames.py diff --git a/utilities/fixnames.py b/utilities/fixnames.py new file mode 100755 index 0000000..c088840 --- /dev/null +++ b/utilities/fixnames.py @@ -0,0 +1,272 @@ +#!/usr/bin/python3 + +# Copyright (c) 2021, 2022, 2023, 2024 Humanitarian OpenStreetMap Team +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +import argparse +import logging +import sys +import os +import re +from sys import argv +from osm_fieldwork.osmfile import OsmFile +from geojson import Point, Feature, FeatureCollection, dump, Polygon, load +import geojson +from shapely.geometry import shape, LineString, Polygon, mapping +import shapely +from shapely.ops import transform +from shapely import wkt +from progress.bar import Bar, PixelBar +from progress.spinner import PixelSpinner +from osm_fieldwork.osmfile import OsmFile +from osm_fieldwork.parsers import ODKParsers +import pyproj +import asyncio +from codetiming import Timer +import concurrent.futures +from cpuinfo import get_cpu_info +from time import sleep +from haversine import haversine, Unit +from thefuzz import fuzz, process +from pathlib import Path +from osm_fieldwork.parsers import ODKParsers +from pathlib import Path +from spellchecker import SpellChecker +from osm_rawdata.pgasync import PostgresClient +from tqdm import tqdm +import tqdm.asyncio +import xmltodict +import math +from threading import Thread + +# Instantiate logger +log = logging.getLogger(__name__) + +# The number of threads is based on the CPU cores +info = get_cpu_info() +# Try doubling the number of cores, since the CPU load is +# still reasonable. +cores = info['count'] + + +async def main(): + """This main function lets this class be run standalone by a bash script""" + parser = argparse.ArgumentParser( + prog="fix names", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument("-v", "--verbose", action="store_true", help="verbose output") + parser.add_argument("-o", "--outfile", default="out.geojson", help="Output file from the conflation") + parser.add_argument("-i", "--infile", required=True, help="Input file to fix") + + args = parser.parse_args() + + # if verbose, dump to the terminal. 
+ if args.verbose: + log.setLevel(logging.DEBUG) + ch = logging.StreamHandler(sys.stdout) + ch.setLevel(logging.DEBUG) + formatter = logging.Formatter( + "%(threadName)10s - %(name)s - %(levelname)s - %(message)s" + ) + ch.setFormatter(formatter) + log.addHandler(ch) + + path = Path(args.infile) + if path.suffix == ".geojson": + file = open(args.infile, "r") + indata = geojson.load(file) + data = indata["features"] + file.close() + elif path.suffix == ".osm": + osm = OsmFile() + indata = osm.loadFile(args.infile) + data = indata + else: + log.error("Unsupport file type!") + + features = list() + for feature in data: + tags = {"name": None, "ref:usfs": None, "ref": None} + matched = False + name = None + ref = None + if "properties" in feature: + if "name" in feature["properties"]: + name = feature["properties"]["name"] + else: + continue + if "ref" in feature["properties"]: + ref = feature["properties"]["ref"] + elif "tags" in feature: + if "name" in feature["tags"]: + name = feature["tags"]["name"] + if "ref" in feature["tags"]: + ref = feature["tags"]["ref"] + + if ref is not None: + if ref.find(';') > 0: + tmp = ref.split(';') + log.debug(f"REF: {ref}") + tags["ref"] = tmp[0] + tags["ref:usfs"] = tmp[1] + + if name is None: + continue + + # log.debug(f"NAME: {name}") + + # if name == "415.3A": + # breakpoint() + ref = "[0-9]+[.a-z]" + pat = re.compile(ref) + if pat.match(name.lower()): + # log.debug(f"MATCHED: {pat.pattern}") + tags["ref:usfs"] = f"FR {name.title()}" + matched = True + + pat = re.compile(f"fire road") + if pat.match(name.lower()) and not matched: + # log.debug(f"MATCHED: {pat.pattern}") + tmp = name.split(' ') + tags["ref:usfs"] = f"FR {tmp[2].title()}" + matched = True + + pat = re.compile(f"county road") + if pat.match(name.lower()) and not matched: + # log.debug(f"MATCHED: {pat.pattern}") + tmp = name.split(' ') + tags["ref"] = f"CR {tmp[2].title()}" + matched = True + + pat = re.compile(f"fs road") + if pat.match(name.lower()) and not 
matched: + # log.debug(f"MATCHED: {pat.pattern}") + tmp = name.split(' ') + tags["ref:usfs"] = f"FR {tmp[2].title()}" + matched = True + + pat = re.compile(f"fr road") + if pat.match(name.lower()) and not matched: + # log.debug(f"MATCHED: {pat.pattern}") + tmp = name.split(' ') + tags["ref:usfs"] = f"FR {tmp[2].title()}" + matched = True + + pat = re.compile(f"usfs road") + if pat.match(name.lower()) and not matched: + # log.debug(f"MATCHED: {pat.pattern}") + tmp = name.split(' ') + tags["ref:usfs"] = f"FR {tmp[2].title()}" + matched = True + + pat = re.compile(f".*forest service road") + if pat.match(name.lower()) and not matched: + # log.debug(f"MATCHED: {pat.pattern}") + tmp = name.split(' ') + if len(tmp) == 3: + tags["ref:usfs"] = f"FR {tmp[2].title()}" + elif len(tmp) == 4: + tags["ref:usfs"] = f"FR {tmp[3].title()}" + matched = True + + pat = re.compile(f"fr ") + if pat.match(name.lower()) and not matched: + # log.debug(f"MATCHED: {pat.pattern}") + tags["ref:usfs"] = f"FR {tmp[1].title()}" + matched = True + + pat = re.compile(f"fs ") + if pat.match(name.lower()) and not matched: + # log.debug(f"MATCHED: {pat.pattern}") + tmp = name.split(' ') + tags["ref:usfs"] = f"FR {tmp[1].title()}" + matched = True + + pat = re.compile(f"usfs trail ") + if pat.match(name.lower()) and not matched: + # log.debug(f"MATCHED: {pat.pattern}") + tmp = name.split(' ') + tags["ref:usfs"] = f"FR {tmp[2].title()}" + matched = True + + pat = re.compile(f".*fsr{ref}") + if pat.match(name.lower()) and not matched: + # log.debug(f"MATCHED: {pat.pattern}") + tmp = name.split(' ') + tags["ref:usfs"] = f"FR {tmp[2].title()}" + + if matched: + for key, value in tags.items(): + if value is not None: + if "properties" in feature: + del feature["properties"]["name"] + feature["properties"][key] = value + elif "tags" in feature: + if "name" in feature["tags"]: + del feature["tags"]["name"] + feature["tags"][key] = value + features.append(feature) + else: + features.append(feature) + + # 
log.debug(f"\t{tags}") + + if path.suffix == ".geojson": + outdata = FeatureCollection(features) + file = open(args.outfile, "w") + geojson.dump(outdata, file) + file.close() + log.info(f"Wrote {args.outfile}") + else: + path = Path(args.outfile) + outosm = OsmFile(f"{path.stem}.osm") + out = list() + for entry in features: + if "tiger:cfcc" in entry["tags"]: + del entry["tags"]["tiger:cfcc"] + if "tiger:county" in entry["tags"]: + del entry["tags"]["tiger:county"] + if "tiger:name_base" in entry["tags"]: + del entry["tags"]["tiger:name_base"] + if "tiger:name_base_1" in entry["tags"]: + del entry["tags"]["tiger:name_base_1"] + if "tiger:name_type" in entry["tags"]: + del entry["tags"]["tiger:name_type"] + if "tiger:name_type_1" in entry["tags"]: + del entry["tags"]["tiger:name_type_1"] + if "tiger:reviewed" in entry["tags"]: + del entry["tags"]["tiger:reviewed"] + if "tiger:tlid" in entry["tags"]: + del entry["tags"]["tiger:tlid"] + if "tiger:source" in entry["tags"]: + del entry["tags"]["tiger:source"] + if "tiger:separated" in entry["tags"]: + del entry["tags"]["tiger:separated"] + if "tiger:upload_uuid" in entry["tags"]: + del entry["tags"]["tiger:upload_uuid"] + if "refs" in entry: + out.append(osm.createWay(entry, True)) + else: + out.append(osm.createNode(entry, True)) + outosm.write(out) + log.info(f"Wrote {path.stem}.osm") + + +if __name__ == "__main__": + """This is just a hook so this file can be run standlone during development.""" + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + loop.run_until_complete(main()) From c215c5c7a0220121dc262335d72932b88f75a291 Mon Sep 17 00:00:00 2001 From: Rob Savoye Date: Mon, 19 Aug 2024 13:42:35 -0600 Subject: [PATCH 2/5] fix: Add section on the new fixname.py program --- docs/utilities.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/utilities.md b/docs/utilities.md index 4b2240b..1b055dc 100644 --- a/docs/utilities.md +++ b/docs/utilities.md @@ -56,3 +56,16 @@ 
Map](https://prd-tnm.s3.amazonaws.com/index.html?prefix=StagedProducts/TopoMapVe use the Shapefiles, as the different categories are in separate files inside the zip. Each one covers a 7.5 quad square on a topo map. These have to be merged together into a single file to be practical. + +## fixnames.py + +On the OSM wiki, there is a list of [incorrect +tagging](https://wiki.openstreetmap.org/wiki/United_States_roads_tagging#National_Forest_Road_System) +for forest highway names. Basically the name shouldn't be something +like *"Forest Service Road 123.4A"*. That's actually a reference +number, not a name. This is primarily a problem with existing OSM +data. These would all have to get manually fixed when validating in +JOSM, so this program automates the process so you only have to +validate, and not edit the feature. After conflation, process the +output file with this utility to produce an improved version. + From 2cd6a43484dbf5f17b5858ff9a021bd6c0d614ff Mon Sep 17 00:00:00 2001 From: Rob Savoye Date: Mon, 19 Aug 2024 14:34:48 -0600 Subject: [PATCH 3/5] fix: Add section on Forest road naming and the TODO plugin --- docs/highways.md | 93 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 87 insertions(+), 6 deletions(-) diff --git a/docs/highways.md b/docs/highways.md index 16e001b..f289ca5 100644 --- a/docs/highways.md +++ b/docs/highways.md @@ -36,7 +36,8 @@ Although most of the fields in these datasets aren't useful for OSM, some are like is it a seasonal road, various off road vehicle access permissions, etc... since this is also useful for navigation. Any tags added or edited will follow the [OSM Tagging -Guidelines](https://wiki.openstreetmap.org/wiki/United_States_roads_tagging#Tagging_Forest_Roads) +Guidelines](https://wiki.openstreetmap.org/wiki/United_States_roads_tagging#Tagging_Forest_Roads) + for forest roads. # The Datasets @@ -58,6 +59,8 @@ geometries.
Conflation then is mostly working through the name and reference fields between multiple files, which sometimes don't agree on the proper name. +And OpenStreetMap of course. + ## Processing The Datasets Since the files are very large with different schemas, a critical @@ -112,11 +115,73 @@ clipped properly at the boundary. These task boundary polygons can then be used to create the project in the Tasking Manager, which will furthur split that into the size you want for mapping. +### The OpenStreetMap Extract + +This step is unnecessary if you plan to manually conflate with a +GeoJson file, so jump ahead to the next section. + +To conflate against OSM data with the goal of automatically merging +the tags into the feature you have to prepare the dataset. Each +feature needs to be validated anyway, merging tags is more efficient +than cut & paste. Since this project is processing data from multiple +US states, it exceeds the [Overpass](https://overpass-turbo.eu/) data +size. + +I download the states I want to conflate from +[Geofabrik](http://download.geofabrik.de/north-america.html), and then +use [osmium +merge](https://docs.osmcode.org/osmium/latest/osmium-merge.html) to +turn it into one big file. I have to do this because most of the +national forest cross state lines. You'll get duplicate ID errors if +you download these files on different days, so grab all the ones you +plan to merge at the same time. Geofabrik updates every 24 hours. When +dealing with files too large for JOSM or QGIS, +[osmium](https://osmcode.org/osmium-tool/) is the tool to use. There +is also [osmfilter](https://wiki.openstreetmap.org/wiki/Osmfilter) and +[osmconvert](https://wiki.openstreetmap.org/wiki/Osmconvert) which can +be used as well. [Ogr2ogr](https://gdal.org/programs/ogr2ogr.html) +can't be used as it can't write the OSM XML format. 
To merge multiple +files with osmium, do this: + + osmium merge --overwrite -o outdata.osm *.osm.pbf + +The next step is to delete everything but highways from the OSM XML +file. When conflating highways, we don't care about amenities or +waterways. + + osmium tags-filter --overwrite --remove-tags -o outdata.osm indata.osm w/highway=track,service,unclassified,primary,tertiary,secondary,path,residential,abandoned,footway,motorway,trunk + +Finally I clip this large file into separate datasets, one for each +national forest. + + osmium extract --overwrite --polygon boundary.geojson -o outdata-roads.osm + Then the real fun starts after the drudgery of getting ready to do conflation. ![Blank Sign](images/20200726_103229.jpg){width=300 height=200} +#### Forest Road Names + +The names and reference numbers in OSM now have a wide variety of +[incorrect +tagging](https://wiki.openstreetmap.org/wiki/United_States_roads_tagging#National_Forest_Road_System) +when it comes to names. *"Forest Service Road 123.4A"* is not a name, +it is a reference number. Same for *"County Road 43"*. The +[fixnames.py](https://github.com/hotosm/osm-merge/blob/tagging/utilities/fixnames.py) +utility scans the OSM extract and when it sees incorrect tagging, +corrects it to the OSM standard. Since the external datasets already +follow the same guidelines, this increases the chance of a good match +when conflating, since comparing names is part of the process. + +#### TIGER Tag Deletion + +There is community consensus that the *tiger:* tags added back +in 2008 when the TIGER data was imported are meaningless, so they should be +deleted as bloat. The *fixnames.py* utility used for correcting the name +also deletes these from each feature so you don't have to manually do +it. + ### MVUM Roads This is all the highways in National Forests. The data contains @@ -232,9 +297,24 @@ the JOSM validators find many existing issues.
I fix anything that is an error, and mostly ignore all the warning as that's a whole other project. -There are two primary ways to validate the conflation -output. Depending on the amount of data, sometimes one way is better -than the other, so it's good to be flexible. +If you are editing with the OSM XML file produced by conflation, when +the file is opened, there will be some conflicts. This is usually due +to things like the incorrect forest road name getting deleted, since +now it's a proper *ref:usfs* reference number. And the tiger tags are +gone as well if the *fixnames.py* utility is used. To fix the +conflicts, I just select them all, and click on *resolve to my +version*. Then I load all the ways into the +[TODO](https://wiki.openstreetmap.org/wiki/JOSM/Plugins/TODO_list) +plugin. + +Using the plugin to validate a feature all I have to do is click on the +entry. Sometimes there will be issues that need to be manually +fixed. If conflation has changed the name, the old one is still in the +feature so a manual comparison can be done. Sometimes there are weird +typos that have slipped through the process. But many times for these +remote highways you can just mark it as done, and go on to the next +one. This lets you validate a large number of features relatively +quickly without sacrificing quality. #### Editing OSM XML @@ -245,10 +325,11 @@ initially visible. If you go to the File menu, go down and execute *update modified*. This will download all the nodes for the ways, and all the highways will become visible. Highways that have multiple tags already in OSM will become a conflict. These can be resolved easier in -JOSM using the conflict dialog box. No geometries have change, just +JOSM using the conflict dialog box. No geometries have changed, just tags, so you have to manually select the tags to be merged. Features without tags beyond **highway=something** merge automatically. which -makes validating these features quick and easy.
+makes validating these features quick and easy. Note that every +feature needs to be validated individually. #### Editing GeoJson From ef258eeef36e82ae59b235fdcdf400fae51f5b31 Mon Sep 17 00:00:00 2001 From: Rob Savoye Date: Mon, 19 Aug 2024 14:37:55 -0600 Subject: [PATCH 4/5] fix: Add bit on update modified in JOSM --- docs/highways.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/highways.md b/docs/highways.md index f289ca5..b2b8545 100644 --- a/docs/highways.md +++ b/docs/highways.md @@ -282,7 +282,11 @@ Unfortunately manually validating the data is very time consuming, but it's important to get it right. I use the *TODO* plugin and also a data filter so I just select highways. With the TODO plugin, I add the selected features, ideally the entire task. Then I just go through all -the features one at a time. +the features one at a time. When the OSM XML dataset is loaded, +nothing will appear in JOSM. This is because the OSM XML file produced +by conflation has the refs for the way, but lacks the nodes. All it +takes is selecting the *update modified* menu item under the *File* +menu and all the nodes get downloaded, and the highways appear.
I often have the original datasets loaded as layers, since sometimes it's useful to refer back to when you find issues with the From f64dfde5f0584ca6b48996dea657ef53ac8c60db Mon Sep 17 00:00:00 2001 From: Rob Savoye Date: Mon, 19 Aug 2024 14:39:34 -0600 Subject: [PATCH 5/5] fix: Improved tag merging during conflation --- osm_merge/conflator.py | 191 ++++++++++++++++++++--------------------- 1 file changed, 95 insertions(+), 96 deletions(-) diff --git a/osm_merge/conflator.py b/osm_merge/conflator.py index c1742d1..d4e99fa 100755 --- a/osm_merge/conflator.py +++ b/osm_merge/conflator.py @@ -19,6 +19,7 @@ import logging import sys import os +import re from sys import argv from osm_fieldwork.osmfile import OsmFile from geojson import Point, Feature, FeatureCollection, dump, Polygon, load @@ -74,8 +75,8 @@ def distSort(data: list): """ return data['dist'] -def conflateThread(odkdata: list, - osmdata: list, +def conflateThread(primary: list, + secondary: list, threshold: float = 7.0, spellcheck: bool = True, ) -> list: @@ -83,8 +84,8 @@ def conflateThread(odkdata: list, Conflate features from ODK against all the features in OSM. 
Args: - odkdata (list): The features from ODK to conflate - osmdata (list): The existing OSM data + primary (list): The external dataset to conflate + seconday (list): The secondzry dataset, probably existing OSM data threshold (int): Threshold for distance calculations spellcheck (bool): Whether to also spell check string values @@ -115,23 +116,21 @@ def conflateThread(odkdata: list, cutils = Conflator() i = 0 # Progress bar - pbar = tqdm.tqdm(odkdata) + pbar = tqdm.tqdm(primary) for entry in pbar: - # for entry in odkdata: + # for entry in primary: i += 1 timer.start() confidence = 0 maybe = list() - # if "attrs" in entry: - # odk = cutils.osmToFeature(entry) - # else: - # odk = entry - for existing in osmdata: + for existing in secondary: odktags = dict() osmtags = dict() feature = dict() newtags = dict() + # log.debug(f"ENTRY: {entry["properties"]}") + # log.debug(f"EXISTING: {existing["properties"]}") geom = None # We could probably do this using GeoPandas or gdal, but that's # going to do the same brute force thing anyway. @@ -157,9 +156,9 @@ def conflateThread(odkdata: list, #breakpoint() if dist <= threshold: - # log.debug(f"DIST: {dist / 1000}km. {dist}m") - # log.debug(f"ENTRY: {entry["properties"]}") - # log.debug(f"EXISTING: {existing["properties"]}") + log.debug(f"DIST: {dist / 1000}km. {dist}m") + log.debug(f"PRIMARY: {entry["properties"]}") + log.debug(f"SECONDARY : {existing["properties"]}") if dist <= 0.0: # Probably an exact hit, likely from data imported # into OSM from the same source. @@ -194,7 +193,8 @@ def conflateThread(odkdata: list, if maybe[0]["dist"] >= 0.0: hits, tags = cutils.checkTags(maybe[0]["odk"], maybe[0]["osm"]) # log.debug(f"TAGS: {hits}: {tags}") - tags['fixme'] = "Don't upload this to OSM without validation!" + # osmfile.py add this as a note tag + # tags['fixme'] = "Don't upload this to OSM without validation!" 
if "refs" in maybe[0]["osm"]["properties"]: tags["refs"] = maybe[0]["osm"]["properties"]["refs"] geom = maybe[0]["odk"]["geometry"] @@ -210,6 +210,7 @@ def conflateThread(odkdata: list, # If no hits, it's new data. ODK data is always just a POI for now else: entry["properties"]["version"] = 1 + entry["properties"]["informal"] = "yes" entry["properties"]["fixme"] = "New features should be imported following OSM guidelines." newdata.append(entry) @@ -312,78 +313,61 @@ def checkTags(self, (dict): The updated tags """ match_threshold = 80 - keep = ["surface", - "name", - "ref", - "ref:usfs", - "smoothness", - "highway", - "tracktype"] + match = ["name", "ref", "ref:usfs"] hits = 0 props = dict() id = 0 version = 0 - for key, value in extfeat['properties'].items(): - if key not in keep: - # Reduce the amount of tags needed for a match since some of - # the OSM data is bloated. + props = extfeat['properties'] | osm['properties'] + # ODK Collect adds these two tags we don't need. + if "title" in props: + del props["title"] + if "label" in props: + del props["label"] + + if "id" in props: + # External data not from an OSM source always has + # negative IDs to distinguish it from current OSM data. + id = int(props["id"]) + else: + id -= 1 + props["id"] = id + + if "version" in props: + # Always use the OSM version if it exists, since it gets + # incremented so JOSM see it's been modified. + props["version"] = int(version) + # Name may also be name:en, name:np, etc... There may also be + # multiple name:* values in the tags. + else: + props["version"] = 1 + + for key in match: + if "highway" in osm["properties"]: + # Always use the value in the secondary, which is + # likely in OSM. + props["highway"] = osm["properties"]["highway"] + if key not in props: continue - if key in osm["properties"]: - if key == "title" or key == "label": - # ODK data extracts have an title and image tags, - # which is usually just a duplicate of the name, - # so drop those. 
- continue - elif key == "osm_id" or key == "id": - # External data not from an OSM source always has - # negative IDs to distinguish it from current OSM data. - if value <= 0: - id = int(osm["properties"][key]) - else: - id = int(value) - props["id"] = id - continue - elif key == "version": - # Always use the OSM version, since it gets incremented - # so JOSM see it's been modified. - version = int(osm["properties"][key]) - props["version"] = version - continue - # Name may also be name:en, name:np, etc... There may also be - # multiple name:* values in the tags. - elif key[:4] == "name": - # Usually it's the name field that has the most variety in - # in trying to match strings. This often is differences in - # capitalization, singular vs plural, and typos from using - # your phone to enter the name. Course names also change - # too so if it isn't a match, use the new name from the - # external dataset. - ratio = fuzz.ratio(value.lower(), osm["properties"][key].lower()) - if ratio > match_threshold: - hits += 1 - props["ratio"] = ratio - props[key] = value - if value != osm["properties"][key]: - props[f"old_{key}"] = osm["properties"][key] - else: - if key != 'note': - props[key] = value - else: - # All the other keys are usually a defined OSM tag. - # Course the new value is probably more up to data - # than what is in OSM. Keep both in the properties - # for debugging tag conflation. - if key == "title" or key == "label": - continue - props[key] = value - if value != osm["properties"][key]: + # Usually it's the name field that has the most variety in + # in trying to match strings. This often is differences in + # capitalization, singular vs plural, and typos from using + # your phone to enter the name. Course names also change + # too so if it isn't a match, use the new name from the + # external dataset. 
+ if key in osm["properties"] and key in extfeat["properties"]: + ratio = fuzz.ratio(extfeat["properties"][key].lower(), osm["properties"][key].lower()) + if ratio > match_threshold: + hits += 1 + props["ratio"] = ratio + props[key] = extfeat["properties"][key] + if ratio != 100: + # For a fuzzy match, cache the value from the + # secondary dataset and use the value in the + # primary dataset. props[f"old_{key}"] = osm["properties"][key] - else: - hits += 1 - else: - # Add the tag from the new data since it isn't in OSM yet. - props[key] = value + # print(props) return hits, props def loadFile( @@ -428,6 +412,10 @@ def loadFile( if "tag" in node: for tag in node["tag"]: if type(tag) == dict: + # Drop all the TIGER tags based on + # https://wiki.openstreetmap.org/wiki/TIGER_fixup + if properties[tag["@k"]][:7] == "tiger:": + continue properties[tag["@k"]] = tag["@v"].strip() # continue else: @@ -593,9 +581,9 @@ async def conflateData(self, tasks = list() # Make threading optional for easier debugging - single = False + single = False # True if single: - conflateThread(odkdata, osmdata) + alldata = conflateThread(odkdata, osmdata) else: futures = list() with concurrent.futures.ProcessPoolExecutor(max_workers=cores) as executor: @@ -617,7 +605,9 @@ async def conflateData(self, return alldata def dump(self): - """Dump internal data""" + """ + Dump internal data for debugging. + """ print(f"Data source is: {self.dburi}") print(f"There are {len(self.data)} existing features") # if len(self.versions) > 0: @@ -627,6 +617,15 @@ def dump(self): def parseFile(self, filespec: str, ) ->list: + """ + Parse the input file based on it's format. 
+ + Args: + filespec (str): The file to parse + + Returns: + (list): The parsed data from the file + """ odkpath = Path(filespec) odkdata = list() if odkpath.suffix == '.geojson': @@ -852,10 +851,10 @@ async def main(): """, ) parser.add_argument("-v", "--verbose", action="store_true", help="verbose output") - parser.add_argument("-e", "--extract", help="The base OSM data") + parser.add_argument("-s", "--secondary", help="The secondary dataset") parser.add_argument("-q", "--query", help="Custom SQL when using a database") parser.add_argument("-c", "--config", default="highway", help="The config file for the SQL query") - parser.add_argument("-s", "--source", required=True, help="The external data to conflate") + parser.add_argument("-p", "--primary", required=True, help="The primary dataset") parser.add_argument("-t", "--threshold", default=1, help="Threshold for distance calculations") parser.add_argument("-o", "--outfile", default="conflated.geojson", help="Output file from the conflation") parser.add_argument("-b", "--boundary", help="Optional boundary polygon to limit the data size") @@ -875,7 +874,7 @@ async def main(): ch.setFormatter(formatter) log.addHandler(ch) - if not args.extract and not args.uri: + if not args.secondary and not args.uri: parser.print_help() log.error("You must supply a database URI or a data extract file!") quit() @@ -891,24 +890,24 @@ async def main(): else: toplevel = Path(args.source) - conflate = Conflator(args.extract, args.boundary) - if args.extract[:3].lower() == "pg:": - await conflate.initInputDB(args.config, args.extract[3:]) + conflate = Conflator(args.secondary, args.boundary) + if args.secondary[:3].lower() == "pg:": + await conflate.initInputDB(args.config, args.secondary[3:]) - if args.source[:3].lower() == "pg:": - await conflate.initInputDB(args.config, args.source[3:]) + if args.primary[:3].lower() == "pg:": + await conflate.initInputDB(args.config, args.secondary[3:]) - data = await 
conflate.conflateData(args.source, args.extract, float(args.threshold)) + data = await conflate.conflateData(args.primary, args.secondary, float(args.threshold)) - path = Path(args.outfile) - jsonout = f"{path.stem}-out.geojson" - osmout = f"{path.stem}-out.osm" + # path = Path(args.outfile) + jsonout = args.outfile.replace(".geojson", "-out.geojson") + osmout = args.outfile.replace(".geojson", "-out.osm") conflate.writeOSM(data, osmout) conflate.writeGeoJson(data, jsonout) log.info(f"Wrote {osmout}") - log.info(f"Wrote {args.outfile}") + log.info(f"Wrote {jsonout}") if __name__ == "__main__": """This is just a hook so this file can be run standlone during development."""