diff --git a/README.md b/README.md
index e562b01..1f83b6d 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,6 @@
 # OSM Merge
-

- HOT -

Merge features and tags into existing OSM data.

@@ -51,20 +48,54 @@
 with the ultimate goal of importing it into
 [OpenStreetMap](https://www.openstreetmap.org). It is oriented
 towards processing non-OSM external datasets.
 
+This project uses a huge amount of data (and disk space) if you start
+from the original nationwide datasets, which are too large to
+edit. There is a contrib script in the git sources that I use to
+break the huge files down into manageable pieces.
+
+The goal of this project is two-fold. One goal is to support field
+data collection using OpenDataKit. The
+[osm-fieldwork](https://hotosm.github.io/osm-fieldwork/) project can
+be used to convert the ODK data files into GeoJson and OSM XML. This
+project then supports conflating that field-collected data with
+current OpenStreetMap data, which is otherwise a time-consuming
+process to do manually.
+
+The other goal is focused on emergency access in remote areas, namely
+improving the Motor Vehicle Use Map (MVUM) datasets of all highways
+(mostly jeep trails) in OpenStreetMap. The current data in OSM was
+often imported complete with bugs from the original dataset, or the
+only detail is *highway=track*. All of these highways have a US
+Forest Service reference number and name. Adding those makes it much
+easier to communicate a location.
+
+![Blank Sign](assets/20210913_113539.jpg){width=300 height=200}
+
 ## Programs
 
-### conflateBuildings.py
+### conflator.py
+
+This program doesn't require a database, unlike the other conflation
+programs. It is focused on [conflating OpenDataKit](odkconflation.md)
+data with OpenStreetMap, as well as conflating [rural
+highways](highways.md). It'll conflate any two datasets in either
+GeoJson or OSM XML format. It is currently under heavy development
+and debugging, processing large amounts of data to track down all the
+obscure bugs in the original datasets and in the conflation process.
+
+### The Data
 
-This looks for duplicate buildings both in the
-external dataset, and in OSM. This has been used with
-multiple datasets, namely the Microsoft ML Building
-footprints, Overture, and others.
+Much of the process of conflation is splitting huge datasets into
+files of a manageable size. Since that process is [mostly
+automated](https://github.com/hotosm/osm-merge/tree/main/contrib), I
+have a collection of files where I have done that part. Since
+conflation also requires converting the original datasets, the
+original files are included, along with the files converted to OSM
+XML & GeoJson, and the results of conflation. Not all the national
+forests and parks have been conflated yet, but the data is there for
+others who may wish to try. The [Map
+Data](https://www.senecass.com/projects/Mapping/SourceData/States/)
+is on a slow server, sorry. Disk space is cheaper than network
+bandwidth.
 
-### conflatePOI.py
+![Blank Sign](assets/skippedsegments.png){width=300 height=200}
 
-This looks to find a building when the data is only a POI. Many
-external datasets are a list of buildings, like schools or
-hospitals. In OSM the only metadata for the feature may be
-_building=yes_. Also field collected data with ODK Collect is also a
-POI, and we want the data that was collected to be merged with any
-existing OSM features.
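+
+As a rough sketch of what the contrib script automates, clipping one
+national forest out of a huge source dataset looks something like
+this (a minimal example using the same ogr2ogr and osmium invocations
+as the contrib script; the input file names are hypothetical
+placeholders):
+
+```bash
+# Clip the nationwide MVUM GeoJson to one forest boundary.
+ogr2ogr -explodecollections -makevalid \
+    -clipsrc Boundaries/NationalForests/Dixie_National_Forest.geojson \
+    Dixie_National_Forest_MVUM.geojson Road_MVUM-out.geojson
+
+# Clip an OSM extract to the same boundary.
+osmium extract -s smart --overwrite \
+    --polygon Boundaries/NationalForests/Dixie_National_Forest.geojson \
+    -o Dixie_National_Forest_OSM_Highways.osm us-highways.osm.pbf
+```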
diff --git a/contrib/update.sh b/contrib/update.sh
index 755027a..23e9242 100755
--- a/contrib/update.sh
+++ b/contrib/update.sh
@@ -22,11 +22,15 @@
 states="Utah Colorado Wyoming"
 
 # Top level for boundaries, obviously system specific
-boundaries="../../Boundaries/"
+boundaries="/play/MapData/Boundaries/"
+
+# The git branch, usually main except when debugging
+branch="nomatch"
+root="${HOME}/projects/HOT/osm-merge.git"
 
 # FIXME: script in the utilities directory don't get install by pip
-fixname="~/projects/HOT/osm-merge.git/trails/utilities/fixnames.py"
-tmsplitter="/home/rob/projects/HOT/osm-merge.git/test/utilities/tm-splitter.py"
+fixnames="${root}/${branch}/utilities/fixnames.py"
+tmsplitter="${root}/${branch}/utilities/tm-splitter.py"
 
 dryrun=""	# echo
 
@@ -36,13 +40,19 @@
 mvumhighways="../Road_MVUM-out.geojson"
 mvumtrails="../Trail_MVUM-out.geojson"
 topohighways="../USGS_Topo_Roads-out.geojson"
 topotrails="../USGS_Topo_Trails-out.geojson"
+npstrails="../National_Park_Service_Trails-out.geojson"
+usfstrails="../USFS_Trails-out.geojson"
 
 utah="Dixie_National_Forest \
       Bryce_Canyon_National_Park \
       Zion_National_Park \
      Capitol_Reef_National_Park \
      Arches_National_Park \
-      Manti_La_Sal_National_Forest"
+      Manti_La_Sal_National_Forest \
+      Canyonlands_National_Park \
+      Uinta_Wasatch_Cache_National_Forest \
+      Fishlake_National_Forest \
+      Ashley_National_Forest"
 
 colorado="Arapaho_and_Roosevelt_National_Forests \
 	  Medicine_Bow_Routt_National_Forest \
@@ -68,23 +78,21 @@
 datasets["Wyoming"]="${wyoming}"
 # declare -p ${datasets}
 
-source="USFS_MVUM_Roads \
-	USFS_MVUM_Trails \
-	USFS_Trails \
-	USFS_MVUM \
-	USGS_Topo_Roads \
-	USGS_Topo_Trails \
-	OSM_Highways"
-
 basesets="no"
 
 split_aoi() {
-    # fmtm-spliiter is available from here: https://hotosm.github.io/fmtm-splitter/
+    # $1 is an optional state to be the only one processed
+    # $2 is an optional national forest or park to be the only one processed
+    # These are mostly useful for debugging, by default everything is processed
+    region="${1:-${states}}"
+    dataset="${2:-all}"
    tmmax=70000
-    echo "Splitting ${1} into squares with ${tmmax} per side"
-    for state in ${states}; do
-	echo "Processing ${state} public lands..."
+    for state in ${region}; do
+	echo "Splitting ${state} into squares with ${tmmax} per side"
 	for land in ${datasets["${state}"]}; do
+	    if test x"${dataset}" != x"all" -a x"${dataset}" != x"${land}"; then
+		continue
+	    fi
 	    echo "    Making TM sized projects for ${land}"
 	    if test $(echo ${land} | grep -c "_Park" ) -gt 0; then
		aoi="${boundaries}/NationalParks/${land}.geojson"
@@ -108,12 +116,20 @@ make_sub_tasks() {
     # a small TM task. These are used to make small task sized
     # data extracts of the post conflated data to avoid lots of
     # cut & paste.
+    # $1 is an optional state to be the only one processed
+    # $2 is an optional national forest or park to be the only one processed
+    # These are mostly useful for debugging, by default everything is processed
+    region="${1:-${states}}"
+    dataset="${2:-all}"
     tmmax=8000
-    for state in ${states}; do
+    for state in ${region}; do
	echo "Processing public lands in ${state}..."
	for land in ${datasets["${state}"]}; do
+	    if test x"${dataset}" != x"all" -a x"${dataset}" != x"${land}"; then
+		continue
+	    fi
	    echo "    Making task boundaries for clipping to ${land}"
-	    for task in ${state}/${land}_Tasks/${land}*_Tasks*.geojson; do
+	    for task in ${state}/${land}_Tasks/${land}_Tasks*.geojson; do
		num=$(echo ${task} | grep -o "Tasks_[0-9]*" | sed -e "s/Tasks/Task/")
		base=$(echo ${task} | cut -d '.' -f 1)
		short=$(basename ${base} | sed -e "s/National.*Tasks/Task/")
@@ -123,9 +139,10 @@ make_sub_tasks() {
		# fmtm-splitter -v -b ${task} -m ${tmmax}
		echo "    Making sub task boundaries for ${task}"
		${tmsplitter} --grid --infile ${task} --threshold 0.1
-		ogr2ogr -nlt POLYGON -makevalid -clipsrc ${task} ${base}/${short}_Tasks.geojson output.geojson
+		ogr2ogr -nlt MULTIPOLYGON -makevalid -clipsrc ${task} ${base}/${short}_Tasks.geojson output.geojson
		${tmsplitter} -v --split --infile ${base}/${short}_Tasks.geojson
		mv -f ${short}*.geojson ${base}
	    done
	done
    done
@@ -134,12 +151,27 @@ make_tasks() {
     # Split the multipolygon from fmtm-splitter into indivigual files
     # for ogr2ogr and osmium.
-    for state in ${states}; do
-	echo "Processing public lands in ${state}..."
+    # $1 is an optional state to be the only one processed
+    # $2 is an optional national forest or park to be the only one processed
+    # These are mostly useful for debugging, by default everything is processed
+    region="${1:-${states}}"
+    dataset="${2:-all}"
+    for state in ${region}; do
+	echo "Making task boundaries for ${state}..."
	for land in ${datasets["${state}"]}; do
+	    if test x"${dataset}" != x"all" -a x"${dataset}" != x"${land}"; then
+		continue
+	    fi
+	    if test ! -e ${state}/${land}_Tasks; then
+		mkdir ${state}/${land}_Tasks
+	    fi
	    echo "    Making task boundaries for clipping to ${land}"
-	    ${tmsplitter} -v -s -i ${state}/${land}_Tasks.geojson
-	    mv ${land}_Tasks*.geojson ${state}/${land}_Tasks/
+	    ${tmsplitter} -v -s -i ${state}/${land}_Tasks.geojson
+	    for task in ${state}/${land}_Tasks/${land}_Tasks_*.geojson; do
+		num=$(echo ${task} | grep -o "Tasks_[0-9]*" | sed -e "s/Tasks/Task/")
+		rm -f ${state}/${land}_Tasks/${num}.geojson
+		mv -f ${task} ${state}/${land}_Tasks/${num}.geojson
+		echo "Wrote ${state}/${land}_Tasks/${num}.geojson ..."
+	    done
	done
    done
}
@@ -187,16 +219,75 @@ make_sub_osm() {
 }
 
 make_nps_extract() {
-    echo "make_nps_extract() - unimplemented"
-}
+    # Make the data extract for the public land from the NPS trail data
+    # $1 is whether to make the huge data extract for all tasks
+    # $2 is whether to make smaller task extracts from the big one
+    base="${1-yes}"
+    tasks="${2-yes}"
+    for state in ${states}; do
+	echo "Processing NPS data in ${state}..."
+	for land in ${datasets["${state}"]}; do
+	    if test $(echo ${land} | grep -c "_Forest" ) -gt 0; then
+		continue
+	    fi
+	    if test x"${base}" == x"yes"; then
+		echo "    Making ${land}_NPS_Trails.geojson"
+		rm -f ${state}/${land}_Tasks/${land}_NPS_Trails.geojson
+		${dryrun} ogr2ogr -explodecollections -makevalid -clipsrc ${boundaries}/NationalParks/${land}.geojson ${state}/${land}_Tasks/${land}_NPS_Trails.geojson ${npstrails}
+	    fi
 
-make_sub_nps() {
-    echo "make_sub_nps() unimplemented"
+	    if test x"${tasks}" == x"yes"; then
+		for task in ${state}/${land}_Tasks/*_Tasks*.geojson; do
+		    echo "    Processing NPS task ${task}..."
+		    num=$(echo ${task} | grep -o "Tasks_[0-9]*" | sed -e "s/Tasks/Task/")
+		    rm -f ${state}/${land}_Tasks/${land}_NPS_${num}.geojson
+		    ${dryrun} ogr2ogr -explodecollections -makevalid \
+			-clipsrc ${task} ${state}/${land}_Tasks/${land}_NPS_${num}.geojson \
+			${state}/${land}_Tasks/${land}_NPS_Trails.geojson
+		done
+	    fi
+	done
+    done
 }
 
 make_topo_extract() {
-    echo "make_topo_extract() unimplemented"
+    # Make the data extract for the public land from the USGS topographical data
+    # $1 is whether to make the huge data extract for all tasks
+    # $2 is whether to make smaller task extracts from the big one
+    base="${1-yes}"
+    tasks="${2-yes}"
+    for state in ${states}; do
+	echo "Processing Topo data in ${state}..."
+	for land in ${datasets["${state}"]}; do
+	    if test $(echo ${land} | grep -c "_Park" ) -gt 0; then
+		clip="NationalParks"
+	    else
+		clip="NationalForests"
+	    fi
+	    if test x"${base}" == x"yes"; then
+		echo "    Making ${land}_USGS_Topo_Roads.geojson"
+		rm -f ${state}/${land}_Tasks/${land}_USGS_Topo_Roads.geojson
+		${dryrun} ogr2ogr -explodecollections -makevalid -clipsrc ${boundaries}/${clip}/${land}.geojson ${state}/${land}_Tasks/${land}_USGS_Topo_Roads.geojson ${topohighways}
+
+		echo "    Making ${land}_USGS_Topo_Trails.geojson"
+		rm -f ${state}/${land}_Tasks/${land}_USGS_Topo_Trails.geojson
+		${dryrun} ogr2ogr -explodecollections -makevalid -clipsrc ${boundaries}/${clip}/${land}.geojson ${state}/${land}_Tasks/${land}_USGS_Topo_Trails.geojson ${topotrails}
+	    fi
+
+	    if test x"${tasks}" == x"yes"; then
+		for task in ${state}/${land}_Tasks/*_Tasks*.geojson; do
+		    echo "    Processing Topo task ${task}..."
+		    num=$(echo ${task} | grep -o "Tasks_[0-9]*" | sed -e "s/Tasks/Task/")
+		    rm -f ${state}/${land}_Tasks/${land}_USGS_Topo_${num}.geojson
+		    ${dryrun} ogr2ogr -explodecollections -makevalid -clipsrc ${task} ${state}/${land}_Tasks/${land}_USGS_Topo_${num}.geojson ${state}/${land}_Tasks/${land}_USGS_Topo_Roads.geojson
+		done
+	    fi
+	done
+    done
+}
+
+make_sub_nps() {
+    echo "make_sub_nps() unimplemented"
 }
 
 make_sub_topo() {
@@ -207,11 +298,21 @@ make_mvum_extract() {
     # Make the data extract for the public land from MVUM
     # $1 is whether to make the huge data extract for all tasks
     # $2 is whether to make smaller task extracts from the big one
-    base="${1-yes}"
-    tasks="${2-yes}"
-    for state in ${states}; do
+    # $1 is an optional state to be the only one processed
+    # $2 is an optional national forest or park to be the only one processed
+    # These are mostly useful for debugging, by default everything is processed
+    region="${1:-${states}}"
+    dataset="${2:-all}"
+    # base=${3:-no}
+    base="yes"
+    # tasks=${4:-no}
+    tasks="yes"
+    for state in ${region}; do
	echo "Processing MVUM data in ${state}..."
	for land in ${datasets["${state}"]}; do
+	    if test x"${dataset}" != x"all" -a x"${dataset}" != x"${land}"; then
+		continue
+	    fi
	    if test $(echo ${land} | grep -c "_Park" ) -gt 0; then
		clip="NationalParks"
	    else
@@ -247,23 +348,35 @@ make_osm_extract() {
     # Make the data extract for the public land from OSM
-    # $1 is whether to make the huge data extract for all tasks
-    # $2 is whether to make smaller task extracts from the big one
-    base="${1-yes}"
-    tasks="${2-yes}"
-    for state in ${states}; do
+    # $1 is an optional state to be the only one processed
+    # $2 is an optional national forest or park to be the only one processed
+    # These are mostly useful for debugging, by default everything is processed
+    region="${1:-${states}}"
+    dataset="${2:-all}"
+    # base=${3:-no}
+    base="yes"
+    # tasks=${4:-no}
+    tasks="yes"
+    defaults="extract -s smart --overwrite "
+    for state in ${region}; do
	echo "Extracting ${state} public lands from OSM..."
	for land in ${datasets["${state}"]}; do
-	    # if test ! -e ${state}/${land}_Tasks; then
-	    # 	mkdir -p ${state}/${land}_Tasks
-	    # fi
+	    if test x"${dataset}" != x"all" -a x"${dataset}" != x"${land}"; then
+		continue
+	    fi
+	    if test ! -e ${state}/${land}_Tasks; then
+		mkdir -p ${state}/${land}_Tasks
+	    fi
	    if test x"${base}" == x"yes"; then
-		echo "    Clipping OSM data for ${land}..."
+		# echo "    Clipping OSM data for ${land}..."
		if test $(echo ${land} | grep -c "_Park" ) -gt 0; then
-		    ${dryrun} osmium extract -s smart --overwrite --polygon ${boundaries}/NationalParks/${land}.geojson -o ${state}/${land}_Tasks/${land}_OSM_Highways.osm ${osmdata}
+		    ${dryrun} osmium ${defaults} --polygon ${boundaries}/NationalParks/${land}.geojson -o ${state}/${land}_Tasks/${land}_OSM_Highways.osm ${osmdata}
		else
-		    ${dryrun} osmium extract -s smart --overwrite --polygon ${boundaries}/NationalForests/${land}.geojson -o ${state}/${land}_Tasks/${land}_OSM_Highways.osm ${osmdata}
+		    ${dryrun} osmium ${defaults} --polygon ${boundaries}/NationalForests/${land}.geojson -o ${state}/${land}_Tasks/${land}_OSM_Highways.osm ${osmdata}
		fi
+		# Fix the names & refs in the OSM data
+		${fixnames} -v -i ${state}/${land}_Tasks/${land}_OSM_Highways.osm
+		mv out-out.osm ${state}/${land}_Tasks/${land}_OSM_Highways.osm
	    fi
 
	    if test x"${tasks}" == x"yes"; then
@@ -271,9 +384,9 @@
		    echo "    Processing OSM task ${task}..."
		    num=$(echo ${task} | grep -o "Tasks_[0-9]*" | sed -e "s/Tasks/Task/")
		    if test $(echo ${land} | grep -c "_Park" ) -gt 0; then
-			${dryrun} osmium extract -s smart --overwrite --polygon ${boundaries}/NationalParks/${land}.geojson -o ${state}/${land}_Tasks/${land}_OSM_Highways_${num}.osm ${state}/${land}_Tasks/${land}_OSM_Highways.osm
+			${dryrun} osmium ${defaults} --polygon ${boundaries}/NationalParks/${land}.geojson -o ${state}/${land}_Tasks/${land}_OSM_Highways_${num}.osm ${state}/${land}_Tasks/${land}_OSM_Highways.osm
		    else
-			${dryrun} osmium extract -s smart --overwrite --polygon ${boundaries}/NationalForests/${land}.geojson -o ${state}/${land}_Tasks/${land}_OSM_Highways_${num}.osm ${state}/${land}_Tasks/${land}_OSM_Highways.osm
+			${dryrun} osmium ${defaults} --polygon ${boundaries}/NationalForests/${land}.geojson -o ${state}/${land}_Tasks/${land}_OSM_Highways_${num}.osm ${state}/${land}_Tasks/${land}_OSM_Highways.osm
		    fi
		done
	    fi
@@ -301,102 +414,11 @@ make_baseset() {
     done
 }
 
-# process_forests() {
-#     for forest in ${forests}; do
-# 	if test "x${basesets}" = "xyes"; then
-# 	    echo "    Making ${forest}_Tasks/${forest}_USFS_MVUM_Roads.geojson"
-# 	    rm -f ${forest}_Tasks/${forest}_USFS_MVUM_Roads.geojson
-# 	    ogr2ogr -explodecollections -makevalid -clipsrc ${forest}.geojson ${forest}_Tasks/${forest}_USFS_MVUM_Roads.geojson /play/MapData/SourceData/Road_MVUM-out.geojson
-#
-# 	    echo "    Making ${forest}_Tasks/${forest}_USFS_MVUM_Trails.geojson"
-# 	    rm -f ${forest}_Tasks/${forest}_USFS_MVUM_Trails.geojson
-# 	    ogr2ogr -explodecollections -makevalid -clipsrc ${forest}.geojson ${forest}_Tasks/${forest}_USFS_MVUM_Trails.geojson /play/MapData/SourceData/Trail_MVUM-out.geojson
-#
-# 	    # Merge the MVUM roads and trails together, since in OSM they
-# 	    # are both.
-# 	    rm -f ${forest}_Tasks/${forest}_USFS_MVUM.geojson
-# 	    ogrmerge.py -nln mvum -o ${forest}_Tasks/mvum.geojson ${forest}_Tasks/${forest}_USFS_MVUM_Trails.geojson
-# 	    ogrmerge.py -nln mvum -append -o ${forest}_Tasks/mvum.geojson ${forest}_Tasks/${forest}_USFS_MVUM_Roads.geojson
-#
-# 	    echo "    Making ${forest}_Tasks/${forest}_USFS_Trails.geojson"
-# 	    rm -f ${forest}_Tasks/${forest}_USFS_Trails.geojson
-# 	    ogr2ogr -explodecollections -makevalid -clipsrc ${forest}.geojson ${forest}_Tasks/${forest}_USFS_Trails.geojson /play/MapData/SourceData/USFS_Trails-out.geojson
-#
-# 	    # echo "    Making ${forest}_Tasks/${forest}_USFS_MVUM.geojson"
-# 	    # rm -f ${forest}_Tasks/${forest}_USFS_Trails.geojson
-# 	    # ogr2ogr -explodecollections -makevalid -clipsrc ${forest}.geojson ${forest}_Tasks/${forest}_USFS_MVUM.geojson ${forest}.geojson ${forest}_Tasks/mvum.geojson
-#
-# 	    echo "    Making ${forest}_Tasks/${forest}_USGS_Topo_Roads.geojson"
-# 	    rm -f ${forest}_Tasks/${forest}_USGS_Topo_Roads.geojson
-# 	    ogr2ogr -explodecollections -makevalid -clipsrc ${forest}.geojson ${forest}_Tasks/${forest}_USGS_Topo_Roads.geojson /play/MapData/SourceData/USGS_Topo_Roads-out.geojson
-#
-# 	    echo "    Making ${forest}_Tasks/${forest}_USGS_Topo_Trails.geojson"
-# 	    rm -f ${forest}_Tasks/${forest}_USGS_Topo_Trails.geojson
-# 	    ogr2ogr -explodecollections -makevalid -clipsrc ${forest}.geojson ${forest}_Tasks/${forest}_USGS_Topo_Trails.geojson /play/MapData/SourceData/USGS_Topo_Trails-out.geojson
-# 	    osmium extract -s smart --overwrite --polygon ${forest}.geojson -o ${forest}_Tasks/${forest}_OSM_Roads.osm ../wy-co-ut.osm
-# 	fi
-
-# 	for task in ${forest}_Tasks/*Tasks_[0-9].geojson; do
-# 	    base=$(echo ${task} | sed -e 's/Tasks_[0-9]*.geojson//')
-# 	    num=$(echo ${task} | grep -o "Tasks_[0-9]*")
-# 	    echo "    Processing task ${task}"
-# 	    for dataset in ${source}; do
-# 		echo "    Processing dataset ${dataset}..."
-# 		new="${base}_${num}_${dataset}.geojson"
-# 		rm -f ${new}
-# 		if test ${dataset} == "OSM_Roads"; then
-# 		    osmium extract -s smart --overwrite --polygon ${task} -o ${base}_${num}_OSM_Roads.osm ${base}_${dataset}.osm
-# 		elif test ${dataset} == "USFS_MVUM"; then
-# 		    ogr2ogr -explodecollections -clipsrc ${task} ${new} mvum.geojson
-# 		else
-# 		    ogr2ogr -explodecollections -clipsrc ${task} ${new} ${base}_${dataset}.geojson
-# 		fi
-# 	    done
-# 	done
-#     done
-# }
-
-# process_parks() {
-#     source="National_Park_Service Trails USGS_Topo_Trails"
-#     for park in ${parks}; do
-# 	echo "Processing national park ${park}, making base datasets..."
-# 	if test "x${basesets}" = "xyes"; then
-# 	    rm -f ${park}_Tasks/${park}_USGS_Topo_Trails-out.geojson
-# 	    ogr2ogr -explodecollections -makevalid -clipsrc ${park}.geojson ${park}_Tasks/${park}_USGS_Topo_Trails.geojson /play/MapData/SourceData/USGS_Topo_Trails-out.geojson
-#
-# 	    rm -f ${park}_Tasks/${park}_NPS_Trails.geojson
-# 	    ogr2ogr -explodecollections -makevalid -clipsrc ${park}.geojson ${park}_Tasks/${park}_NPS_Trails.geojson /play/MapData/SourceData/National_Park_Service_Trails-out.geojson
-#
-# 	    osmium extract -s smart --overwrite --polygon ${park}.geojson -o ${park}_Tasks/${park}_OSM_Trails.osm ../wy-co-ut.osm
-# 	fi
-#
-# 	for task in ${park}_Tasks/*Tasks_[0-9].geojson; do
-# 	    base=$(echo ${task} | sed -e 's/Tasks_[0-9]*.geojson//')
-# 	    num=$(echo ${task} | grep -o "Tasks_[0-9]*")
-# 	    echo "    Processing task ${task}"
-# 	    for dataset in ${source}; do
-# 		echo "    Processing dataset ${dataset}..."
-# 		new="${base}_${num}_${dataset}.geojson"
-# 		rm -f ${park}_Tasks/${park}_${num}_NPS_Trails.geojson
-# 		ogr2ogr -explodecollections -makevalid -clipsrc ${park}.geojson ${park}_Tasks/${park}_${num}_NPS_Trails.geojson ${park}_Tasks/${park}_NPS_Trails.geojson
-#
-# 		rm -f ${park}_Tasks/${park}_${num}_USGS_Topo_Trails.geojson
-# 		ogr2ogr -explodecollections -makevalid -clipsrc ${park}.geojson ${park}_Tasks/${park}_${num}_USGS_Topo_Trails.geojson ${park}_Tasks/${park}_USGS_Topo_Trails.geojson
-#
-# 		# osmium extract --no-progress --overwrite --polygon ${task} -o ${base}_${num}_OSM_Trails.osm ${base}_OSM_Trails.osm
-# 		osmium extract -s smart --overwrite --polygon ${task} -o ${num}_OSM_Trails.osm ${base}_OSM_Trails.osm
-# 		osmium tags-filter --overwrite -o ${base}_${num}_OSM_Trails.osm ${num}_OSM_Trails.osm w/highway=path
-# 		# rm -f ${num}_OSM_Trails.osm
-# 	    done
-# 	done
-#     done
-# }
-
 fixnames() {
     # Fix the OSM names
     osm=$(find -name \*.osm)
     for area in ${osm}; do
-	${fixnames}h -v -i ${area}
+	${fixnames} -v -i ${area}
     done
 }
 
@@ -431,53 +453,66 @@ while test $# -gt 0; do
	    exit 0
	    ;;
	-b|--base)
-	    basesets="yes"
-	    make_baseset
+	    basedata="yes"
+	    # make_baseset
	    break
	    ;;
+	-o|--only)
+	    shift
+	    region=$1
+	    ;;
	-s|--split)
-	    split_aoi
+	    split_aoi ${region} ${dataset}
	    break
	    ;;
+	-d|--datasets)
+	    shift
+	    dataset=$1
+	    ;;
	-t|--tasks)
-	    make_tasks
-	    make_sub_tasks
+	    make_tasks ${region} ${dataset}
+	    # make_sub_tasks ${region} ${dataset}
	    break
	    ;;
-	-o|--only)
-	    states=$1
-	    ;;
	-f|--forests)
	    make_sub_mvum
	    break
	    # process_forests
	    ;;
-	-d|--datasets)
-	    basesets="no"
-	    ;;
	-c|--clean)
	    clean_tasks
	    break
	    ;;
	-e|--extract)
-	    # make_osm_extract ${basesets} yes
-	    make_sub_osm ${basesets} yes
-	    # make_mvum_extract ${basesets} yes
+	    # This runs for a long time.
+	    make_osm_extract ${region} ${dataset} ${basedata}
+	    # make_sub_osm ${basesets} yes
+	    make_mvum_extract ${region} ${dataset} ${basedata}
	    # make_sub_mvum
-	    make_nps_extract
-	    make_sub_nps
-	    make_topo_extract
-	    make_sub_topo
+	    # make_nps_extract
+	    # make_sub_nps
+	    # make_topo_extract
+	    # make_sub_topo
	    break
	    ;;
	-a|--all)
+	    # The kitchen sink, do everything
	    split_aoi
	    make_tasks
	    make_sub_tasks
	    make_osm_extract ${basesets} yes
-	    # make_sub_osm
+	    make_sub_osm ${basesets} yes
	    make_mvum_extract ${basesets} yes
-	    # make_sub_mvum
+	    make_sub_mvum
+	    make_nps_extract
+	    make_sub_nps
+	    make_topo_extract
+	    make_sub_topo
+	    break
+	    ;;
+	-w)
+	    make_nps_extract
+	    make_topo_extract
	    ;;
	*)
	    # process_forests
diff --git a/docs/calculations.md b/docs/calculations.md
new file mode 100644
index 0000000..2d4305a
--- /dev/null
+++ b/docs/calculations.md
@@ -0,0 +1,103 @@
+# Conflation Calculations
+
+Part of the fun of external datasets, especially ones that have been
+around a long time like the MVUM data, is the variety of
+inconsistencies in the data. While OpenStreetMap itself is a bit
+overly flexible at times, so is external data. And some of the old
+data has been converted from other formats several times, with bugs
+getting introduced each time.
+
+## Geometries
+
+OpenStreetMap has relations, which are a collection of references to
+other features. External data may have LineStrings, MultiLineStrings
+or a GeometryCollection, all in the same file! For all calculations
+the MultiLineStrings and GeometryCollections are taken apart, so the
+calculations are between the OSM data and each segment of the external
+data. Since this may produce multiple values, those need to be
+evaluated and the most likely one returned.
+
+## Distance
+
+A simple distance calculation is performed after transforming the
+coordinate system from global degrees to meters. The result is
+compared to a threshold distance, and any feature within that
+threshold is added to a list of possible matches. After a few features
+are found within the required distance, matching stops and the next
+feature to be conflated starts the same process.
+
+If the highway is a GeometryCollection or MultiLineString, then it's
+split into segments, and each one is checked for distance. The closest
+one is what is returned.
+
+## Slope and Angle
+
+The distance check often returns features that are close to each
+other, but they are often spur roads that split off from a more major
+one. So when two highway segments are found close to each other, the
+angle between them is calculated. This works well to differentiate
+between the more major highway and the spur road that splits off from
+it.
+
+If the highway is a GeometryCollection or MultiLineString, then it's
+split into segments, and each one is checked for the angle. The
+closest one is what is returned.
+
+## Tag Checking
+
+Once there is at least one candidate within the parameters of distance
+and angle, the tags are checked for matches. The tags we are primarily
+interested in are the name(s) and reference number(s) of each MVUM
+road or trail. Some of the existing features in OpenStreetMap may
+be inaccurate as to the proper name and reference. And of course each
+feature may have an *alt_name* or both a *ref* and a *ref:usfs*. Due
+to the wonders of inconsistent data, a fuzzy string comparison is
+done. This handles most of the basic issues, like capitalization, one
+or two characters of difference, etc. Anything above the threshold is
+considered a probable match, and increments a counter. This value is
+included in the conflated results, and is often between 1-3.
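+
+As a rough illustration of the fuzzy comparison (a minimal sketch,
+assuming the *thefuzz* package, which provides the `fuzz.ratio()`
+call used in conflator.py; the names here are made up):
+
+```python
+from thefuzz import fuzz
+
+# Capitalization and a transposed character still score high.
+ratio = fuzz.ratio("piute pass road", "Puite Pass Road".lower())
+print(ratio)  # roughly 93, above the match threshold of 85
+```
+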
This value is +included in the conflated results, and is often between 1-3. + +The reference numbers between the two datasets is also compared. There +is often a reference number in OSM already, but no name. The external +dataset has the name, so we want to update OSM with that. In addition, +the external datasets often have access information. Seasonal access, +private land, or different types of vehicles which can be added to +OSM. + +### Tag Merging + +The conflation process for merging tags uses the concept of primary +and secondary datasets. The primary is considered to have the *true* +value for a highway or trail. For example, if the name in the two +datasets doesn't match, the secondary will then rename the current +value to *old_something*. The primary's version becomes the same. Some +with reference numbers. + +Other tags from the primary can also be merged, overriding what is +currently in OSM. Once again, the old values are renamed, not +deleted. When validating in JOSM, you can see both versions and make a +final determination as to what is the correct value. Often it's just +spelling differences. + +For all the features in OSM that only have a *highway=something* as a +tag, all the desired tags from the primary dataset are added. + +For some tags like *surface* and *smoothness*, the value in OSM is +potentially more recent, so those are not updated. For any highway +feature lacking those tags, they get added. + +Optionally the various access tags for *private*, *atv*, *horse*, +*motorcycle*, etc... are set in the post conflation dataset if they +have a value in the external dataset. + +### Debug Tags + +Currently a few tags are added to each feature to aid in validating +and debugging the conflation process. These should obviously be +removed before uploading to OSM. They'll be removed at a future date +after more validation. These are: + +* hits - The number of matching tags in a feature +* ratio - The ratio for name matching if not 100% +* dist - The distance between features +* angle - The angle between two features +* slope - The slope between two features diff --git a/docs/results.md b/docs/results.md deleted file mode 100644 index a01d0f4..0000000 --- a/docs/results.md +++ /dev/null @@ -1,43 +0,0 @@ -# Conflation Results In Kenya - -## Project 12007 - -Good quality conflation over all. There seems to be a parking lot of -semi-trucks misidentifed as buildings. Another is a pond identified as -a building. -### 90 new buildings - -## Project 12012 - -Good quality conflation over all. Some fenced yards are being -identified as a building. -### 374 new buildings - -## Project 12013 - -Good quality conflation over all. In dense area there is enough space -between buildings. -### 177 new buildings - -## Project 12057 - -## Project 12091 - -## Project 12092 -FIXME: Already imported, need new data file - - -## Project 12093 -An entire village was missing, so lots of new buildings. Any buildings -added to OSM after the file at Geofabrik was created. - -### 34976 new buildings - -## Project 12340 - -## Project 12351 - -A round feature is misidentified as a square building. Any buildings -added to OSM after the file at Geofabrik was created. An entire -village was missing, so lots of new buildings. 
-## 4566 new buildings
diff --git a/mkdocs.yml b/mkdocs.yml
index f87b258..0ab73cf 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -74,8 +74,11 @@ nav:
     - General: conflation.md
     - ODK: odkconflation.md
     - Highways: highways.md
+    - Calculations: calculations.md
   - Utilities:
     - OSM Merge: osm-merge.md
     - Utilities: utilities.md
+  - Data Conversion:
+    - MVUM: mvum.md
+    - Trails: trails.md
   - API: api.md
-  - Class Hierarchy: apidocs/html/classes.html
diff --git a/osm_merge/conflator.py b/osm_merge/conflator.py
index ac05910..0e50815 100755
--- a/osm_merge/conflator.py
+++ b/osm_merge/conflator.py
@@ -24,9 +24,9 @@
 from osm_fieldwork.osmfile import OsmFile
 from geojson import Point, Feature, FeatureCollection, dump, Polygon, load
 import geojson
-from shapely.geometry import shape, LineString, Polygon, mapping
+from shapely.geometry import shape, LineString, MultiLineString, Polygon, mapping
 import shapely
-from shapely.ops import transform
+from shapely.ops import transform, nearest_points
 from shapely import wkt
 from progress.bar import Bar, PixelBar
 from progress.spinner import PixelSpinner
@@ -170,31 +170,36 @@ def conflateThread(primary: list,
             if dist <= threshold:
                 # slope, angle = cutils.getSlope(entry, existing)
                 angle = 0.0
-                slope = cutils.getSlope(entry, existing)
-                # FIXME: this is a hack till we figure out why a single value
-                # is returned instead of an array.
-                if type(slope) == float:
-                    slope = slope
+                try:
+                    slope, angle = cutils.getSlope(entry, existing)
+                except Exception as e:
+                    log.error(f"getSlope() failed: {e}")
+                    slope = 0.0
+                # log.debug(f"PRIMARY: {entry["properties"]}")
+                # log.debug(f"SECONDARY : {existing["properties"]}")
+                #if entry["properties"]["ref:usfs"] == "FR 550.1":
+                #    breakpoint()
+                # Probably an exact hit, likely from data imported
+                # into OSM from the same source.
+                #geom = maybe[0]["odk"]["geometry"]
+                hits, tags = cutils.checkTags(entry, existing)
+                if hits == 0 and abs(angle) == 0.0 and abs(slope) == 0.0:
+                    if len(existing["properties"]) == 1:
+                        log.debug(f"OSM lacks tags for {entry["properties"]["ref:usfs"]}")
+
+                # if hits == 0 and abs(angle) > 3.0 and abs(slope) > 3.0:
+                if hits == 0 and abs(angle) > 3.0:
+                    # log.debug(f"Too far away! {angle}")
+                    # breakpoint()
                     continue
-                else:
-                    if abs(slope[0]) > 2.0 and abs(slope[1]) > 3.0:
-                        continue
-                # print(f"SLOPE: {abs(slope)}")
-                log.debug(f"PRIMARY: {entry["properties"]}")
-                log.debug(f"SECONDARY : {existing["properties"]}")
-                if dist >= 0.0:
-                    # Probably an exact hit, likely from data imported
-                    # into OSM from the same source.
-                    maybe.append({"dist": dist, "slope": slope, "odk": entry, "osm": existing})
-                    #geom = maybe[0]["odk"]["geometry"]
-                    hits, tags = cutils.checkTags(maybe[0]["odk"], maybe[0]["osm"])
-                    tags["hits"] = hits
-                    tags["dist"] = str(dist)
-                    tags["slope"] = str(slope[0])
-                    tags["angle"] = str(slope[1])
-                    data.append(Feature(geometry=geom, properties=tags))
-                    # FIXME: don't process more existing features to
-                    # improve performance.
+                maybe.append({"dist": dist, "angle": angle, "slope": slope, "odk": entry, "osm": existing})
+                tags["hits"] = str(hits)
+                tags["dist"] = str(dist)
+                tags["slope"] = str(slope)
+                tags["angle"] = str(angle)
+                data.append(Feature(geometry=geom, properties=tags))
+                # FIXME: don't process more existing features to
+                # improve performance.
+                if dist == 0.0:
+                    break
         # else:
             # entry["properties"]["version"] = 1
@@ -205,7 +210,7 @@
             # cache all OSM features within our threshold distance
             # These are needed by ODK, but duplicates of other fields,
             # so they aren't needed and just add more clutter.
-            log.debug(f"DIST: {dist / 1000}km. {dist}m")
+            # log.debug(f"DIST: {dist / 1000}km. {dist}m")
             maybe.append({"dist": dist, "slope": slope, "angle": angle, "hits": hits, "odk": entry, "osm": existing})
             # don't keep checking every highway, although testing seems
             # to show 99% only have one distance match within range.
@@ -243,8 +248,10 @@
                 tags["version"] = maybe[0]["osm"]["properties"]["version"]
             else:
                 tags["version"] = 1
-            tags["dist"] = maybe[0]["dist"]
-            #tags["slope"] = maybe[0]["slope"]
+            tags["hits"] = str(hits)
+            tags["dist"] = str(maybe[0]["dist"])
+            tags["slope"] = str(maybe[0]["slope"])
+            tags["angle"] = str(maybe[0]["angle"])
             data.append(Feature(geometry=geom, properties=tags))
         # If no hits, it's new data. ODK data is always just a POI for now
         else:
@@ -299,55 +306,72 @@ def getSlope(self,
         #old = numpy.array(olddata["geometry"]["coordinates"])
         oldline = shape(olddata["geometry"])
         angle = 0.0
-        newline = LineString()
-        if newdata["geometry"]["type"] == "MultiLineString":
-            new = shape(newdata["geometry"])
-
-            line = shapely.line_merge(new)
-
-        #new = numpy.array(newdata["geometry"]["coordinates"])
         newline = shape(newdata["geometry"])
-        points = shapely.get_num_points(newline)
-        offset = 2
-        # Get slope of the new line
-        start = shapely.get_point(newline, offset)
-        if not start:
-            return float()
-        x1 = start.x
-        y1 = start.y
-        end = shapely.get_point(newline, points - offset)
-        x2 = end.x
-        y2 = end.y
-        slope1 = (y2 - y1) / (x2 - x1)
-
-        # Get slope of the old line
-        start = shapely.get_point(oldline, offset)
-
-        if not start:
-            return float()
-        x1 = start.x
-        y1 = start.y
-        end = shapely.get_point(oldline, shapely.get_num_points(oldline) - offset)
-        x2 = end.x
-        y2 = end.y
-        slope2 = (y2 - y1) / (x2 - x1)
-        # timer.stop()
-        slope = slope1 - slope2
-        angle = math.degrees(math.atan((slope2-slope1)/(1+(slope2*slope1))))
-        if math.isnan(angle):
-            angle = 0.0
-        name1 = "None"
-        name2 = "None"
-        if "name" in newdata["properties"]:
-            name1 = newdata["properties"]["name"]
-        if "name" in olddata["properties"]:
-            name2 = olddata["properties"]["name"]
-        print(f"SLOPE: {slope}: {angle} - {name1} == {name2}")
-        if math.isnan(slope):
-            slope = 0.0
-        if math.isnan(angle):
-            angle = 0.0
-        return slope, angle
+        if newline.type == "MultiLineString":
+            lines = newline.geoms
+        elif newline.type == "GeometryCollection":
+            lines = newline.geoms
+        else:
+            lines = MultiLineString([newline]).geoms
+
+        bestslope = None
+        bestangle = None
+        for segment in lines:
+            #new = numpy.array(newdata["geometry"]["coordinates"])
+            #newline = shape(newdata["geometry"])
+            points = shapely.get_num_points(segment)
+            if points == 0:
+                return -0.1, -0.1
+            # log.debug(f"POINTS: {shapely.get_num_points(segment)} vs {shapely.get_num_points(oldline)}")
+            offset = 2
+            # Get slope of the new line
+            start = shapely.get_point(segment, offset)
+            if not start:
+                return float(), float()
+            x1 = start.x
+            y1 = start.y
+            end = shapely.get_point(segment, points - offset)
+            x2 = end.x
+            y2 = end.y
+            slope1 = (y2 - y1) / (x2 - x1)
+
+            # Get slope of the old line
+            start = shapely.get_point(oldline, offset)
+
+            if not start:
+                return float(), float()
+            x1 = start.x
+            y1 = start.y
+            end = shapely.get_point(oldline, shapely.get_num_points(oldline) - offset)
+            x2 = end.x
+            y2 = end.y
+            slope2 = (y2 - y1) / (x2 - x1)
+            # timer.stop()
+            slope = slope1 - slope2
+
+            # Calculate the angle between the linestrings
+            angle = math.degrees(math.atan((slope2-slope1)/(1+(slope2*slope1))))
+            name1 = "None"
+            name2 = "None"
+            if math.isnan(angle):
+                angle = 0.0
+            if "name" in newdata["properties"]:
+                name1 = newdata["properties"]["name"]
+            if "name" in olddata["properties"]:
+                name2 = olddata["properties"]["name"]
+            log.debug(f"SLOPE: {slope}: {angle} - {name1} == {name2}")
+            if math.isnan(slope):
+                slope = 0.0
+            if math.isnan(angle):
+                angle = 0.0
+            # Keep the values from the closest segment
+            if bestangle is None or abs(angle) < abs(bestangle):
+                bestangle = angle
+                bestslope = slope
+
+        return bestslope, bestangle
 
     def getDistance(self,
                     newdata: Feature,
@@ -377,41 +401,48 @@
         newobj = transform(project.transform, shape(newdata["geometry"]))
         oldobj = transform(project.transform, shape(olddata["geometry"]))
 
-        # l1 = newdata["geometry"]["coordinates"]
-        # l2 = olddata["geometry"]["coordinates"]
-        # m1 = (l1[1][1]-l1[0][1])/(l1[1][0]-l1[0][0])
-        # m2 = (l2[1][1]-l2[0][1])/(l2[1][0]-l2[0][0])
-        # angle_rad = abs(math.atan(m1) - math.atan(m2))
-        # angle_deg = angle_rad*180/PI
-        # log.debug(f"ANGLE: {angle_rad} : {angle_deg}")
-        # angle_deg = angle_rad*180/PI
-
-        # log.debug(f"{oldobj} : {newobj}")
-        # if oldobj.geom_type == "MultiLineString" and newobj.geom_type == "MultiLineString":
-        #     log.debug("MULTILINESTRING!!")
-        if oldobj.geom_type == "LineString" and newobj.geom_type == "LineString":
-            # Compare two highways
-            dist = newobj.distance(oldobj)
-        elif oldobj.geom_type == "Point" and newobj.geom_type == "LineString":
-            # We only want to compare LineStrings, so force the distance check
-            # to be False
-            dist = 12345678.9
-        elif oldobj.geom_type == "Point" and newobj.geom_type == "Point":
-            dist = newobj.distance(oldobj)
-        elif oldobj.geom_type == "Polygon" and newobj.geom_type == "Polygon":
-            # compare two buildings
-            pass
-        elif oldobj.geom_type == "Polygon" and newobj.geom_type == "Point":
-            # Compare a point with a building, used for ODK Collect data
-            center = shapely.centroid(oldobj)
-            dist = newobj.distance(center)
-        elif oldobj.geom_type == "Point" and newobj.geom_type == "LineString":
-            dist = newobj.distance(oldobj)
-        elif oldobj.geom_type == "LineString" and newobj.geom_type == "Point":
-            dist = newobj.distance(oldobj)
+        if newobj.type == "MultiLineString":
+            lines = newobj.geoms
+        elif newobj.type == "GeometryCollection":
+            lines = newobj.geoms
+        else:
+            lines = MultiLineString([newobj]).geoms
+
+        # dists = list()
+        best = None
+        for segment in lines:
+            if oldobj.geom_type == "LineString" and segment.geom_type == "LineString":
+                # Compare two highways
+                if oldobj.within(segment):
+                    log.debug("CONTAINS")
+                dist = segment.distance(oldobj)
+            elif oldobj.geom_type == "Point" and segment.geom_type == "LineString":
+                # We only want to compare LineStrings, so force the distance check
+                # to be False
+                dist = 12345678.9
+            elif oldobj.geom_type == "Point" and segment.geom_type == "Point":
+                dist = segment.distance(oldobj)
+            elif oldobj.geom_type == "Polygon" and segment.geom_type == "Polygon":
+                # comparing two buildings is not implemented yet, so force no match
+                dist = 12345678.9
+            elif oldobj.geom_type == "Polygon" and segment.geom_type == "Point":
+                # Compare a point with a building, used for ODK Collect data
+                center = shapely.centroid(oldobj)
+                dist = segment.distance(center)
+            elif oldobj.geom_type == "LineString" and segment.geom_type == "Point":
+                dist = segment.distance(oldobj)
+
+            # Find the closest segment
+            if best is None:
+                best = dist
+            elif dist < best:
+                # log.debug(f"BEST: {best} < {dist}")
+                best = dist
 
         # timer.stop()
-        return dist # * 111195
+        return best
 
     def checkTags(self,
                   extfeat: Feature,
@@ -428,7 +459,7 @@
             (int): The number of tag matches
             (dict): The updated tags
         """
-        match_threshold = 80
+        match_threshold = 85
         match = ["name", "ref", "ref:usfs"]
         hits = 0
         props = dict()
@@ -473,22 +504,36 @@
         # too so if it isn't a match, use the new name from the
         # external dataset.
         if key in osm["properties"] and key in extfeat["properties"]:
+            # Sometimes there will be a word match, which returns a
+            # ratio in the low 80s. In that case they should be
+            # a similar length.
+            length = len(extfeat["properties"][key]) - len(osm["properties"][key])
             ratio = fuzz.ratio(extfeat["properties"][key].lower(), osm["properties"][key].lower())
-            if ratio > match_threshold:
+            if ratio > match_threshold and abs(length) <= 3:
                 hits += 1
                 props["ratio"] = ratio
                 props[key] = extfeat["properties"][key]
                 if ratio != 100:
-                    # if key[:3] == "ref":
-                    # Many minor changes of FS to FR don't require
-                    # caching the exising value as it's only the
-                    # prefix that changed.
-                    # try:
-                    #     if osm["properties"][ref][:3] == "FS " and ratio > 80 and ratio < 90 and len(osm["properties"][ref]) == len(extfeat["properties"][ref]):
-                    #         log.debug(f"Ignoring old ref {osm["properties"][ref]}")
-                    #         continue
-                    # except:
-                    #     breakpoint()
+                    # Often the only difference is using FR or FS as the
+                    # prefix. In that case, see if the ref matches.
+                    if key[:3] == "ref":
+                        # This assumes all the data has been converted
+                        # by one of the utility programs, which enforce
+                        # using the ref:usfs tag.
+                        tmp = extfeat["properties"]["ref:usfs"].split(' ')
+                        extref = tmp[1].upper()
+                        tmp = osm["properties"]["ref:usfs"].split(' ')
+                        newref = tmp[1].upper()
+                        # log.debug(f"REFS: {extref} vs {newref}: {extref == newref}")
+                        if extref == newref:
+                            hits += 1
+                            # Many minor changes of FS to FR don't require
+                            # caching the existing value as it's only the
+                            # prefix that changed. It always stays in this
+                            # range.
+                            if osm["properties"]["ref:usfs"][:3] == "FS " and ratio > 80 and ratio < 90:
+                                log.debug(f"Ignoring old ref {osm["properties"]["ref:usfs"]}")
+                                continue
                     # For a fuzzy match, cache the value from the
                     # secondary dataset and use the value in the
                     # primary dataset.
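
The ref comparison above boils down to stripping the FS/FR prefix and
comparing only the numeric part. A minimal standalone sketch of that
logic (hypothetical helper and values, assuming refs were already
normalized to the `ref:usfs` style used by the utility programs):

```python
def same_usfs_ref(ext_ref: str, osm_ref: str) -> bool:
    """Compare two USFS refs, ignoring the FS vs FR prefix difference."""
    # "FS 550.1" and "FR 550.1" both split into a prefix and a number.
    ext_num = ext_ref.split(' ')[1].upper()
    osm_num = osm_ref.split(' ')[1].upper()
    return ext_num == osm_num

assert same_usfs_ref("FR 550.1", "FS 550.1")
assert not same_usfs_ref("FR 550.1", "FS 551")
```
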
diff --git a/utilities/fixnames.py b/utilities/fixnames.py
index f9e5fce..6a6bd40 100755
--- a/utilities/fixnames.py
+++ b/utilities/fixnames.py
@@ -136,6 +136,13 @@ def main():
             tags["ref:usfs"] = f"FR {tmp[2].title()}"
             matched = True
 
+        pat = re.compile(f"usfsr ")
+        if pat.match(name.lower()) and not matched:
+            # log.debug(f"MATCHED: {pat.pattern}")
+            tmp = name.split(' ')
+            tags["ref:usfs"] = f"FR {tmp[1].title()}"
+            matched = True
+
         pat = re.compile(f"fs[hr] ")
         if pat.match(name.lower()) and not matched:
             # log.debug(f"MATCHED: {pat.pattern}")
diff --git a/utilities/mvum.py b/utilities/mvum.py
index a32b354..29a023c 100755
--- a/utilities/mvum.py
+++ b/utilities/mvum.py
@@ -128,10 +128,13 @@ def convert(self,
                     name += f" {word} "
             if len(name) == 0:
                 name = title
+            newname = str()
             if name.find(" Road") <= 0:
-                props["name"] = f"{name} Road".replace('  ', ' ').strip()
+                newname = f"{name} Road".replace('  ', ' ').strip()
             else:
-                props["name"] = name.replace('  ', ' ').strip()
+                newname = name.replace('  ', ' ').strip()
+            # the < causes osmium to choke, so use the XML entity instead
+            props["name"] = newname.replace("<50", "&lt;50")
             # log.debug(f"NAME: {props["name"]}")
 
 # https://www.fs.usda.gov/Internet/FSE_DOCUMENTS/stelprd3793545.pdf
diff --git a/utilities/tm-splitter.py b/utilities/tm-splitter.py
index ab0327a..c3209f3 100755
--- a/utilities/tm-splitter.py
+++ b/utilities/tm-splitter.py
@@ -128,7 +128,7 @@ def ogrgrid(outputGridfn: str,
     if os.path.exists(outputGridfn):
         os.remove(outputGridfn)
     outDataSource = outDriver.CreateDataSource(outputGridfn)
-    outLayer = outDataSource.CreateLayer(outputGridfn,geom_type=ogr.wkbPolygon )
+    outLayer = outDataSource.CreateLayer(outputGridfn, geom_type=ogr.wkbMultiPolygon)
     featureDefn = outLayer.GetLayerDefn()
 
     # create grid cells
@@ -250,9 +250,11 @@ async def main():
             index += 1
             feature["properties"]["name"] = name
             feature["boundary"] = "administrative"
-            file = open(f"{name}.geojson", 'w')
+            outfile = f"{path.parent}/{path.stem}/{path.stem}_{index}.geojson"
+            file = open(outfile, 'w')
             geojson.dump(FeatureCollection([feature]), file)
             file.close()
+            log.debug(f"Wrote {outfile} ...")
     elif args.grid:
         log.debug(f"Generating the grid may take a long time...")
         path = Path(args.outfile)
@@ -275,7 +277,7 @@
         outdata = driver.CreateDataSource(outfile)
         if os.path.exists(outfile):
             os.remove(outfile)
-        outlayer = outdata.CreateLayer("tasks", geom_type=ogr.wkbMultiPolygon)
+        outlayer = outdata.CreateLayer("tasks", geom_type=ogr.wkbPolygon)
         boundary = inlayer.GetNextFeature()
         poly = boundary.GetGeometryRef()