datasciencecampus · CBROWN-ONS · Jun 25, 2024 · Jun 5, 2024 · Jun 5, 2024 · Jun 6, 2024
diff --git a/docs/tutorials/osm/index.qmd b/docs/tutorials/osm/index.qmd
@@ -128,7 +128,7 @@ xmin, ymin, xmax, ymax format. Call the list `BBOX_LIST`.
 BBOX_LIST = [<INSERT_VALUES_HERE>]
 ```
 
-### Exercise
+### Solution
 
 ```{python}
 BBOX_LIST = [-3.002175, 51.587035, -2.994271, 51.59095]
@@ -353,6 +353,14 @@ To read more on `osmosis` filtering strategies, refer to the `completeWays` and
 `completeRelations` flag descriptions in the
 [Osmosis detailed usage documentation](https://wiki.openstreetmap.org/wiki/Osmosis/Detailed_Usage_0.48).
 
+
+Note that additional metadata can be added to the map by setting `include_tags=True`. Adding this rich contextual data to the map can be useful but is also expensive. This operation should be avoided for large osm files, for example anything over 500 KB.
+
+```{python}
+loc_finder.plot_ids(id_finder.id_dict["way_ids"], feature_type="way", include_tags=True)
+
+```
+
 ## Conclusion
 
 Congratulations, you have successfully completed this tutorial on OpenStreetMap

diff --git a/requirements.txt b/requirements.txt
@@ -28,7 +28,7 @@ seaborn
 haversine
 pretty_html_table
 kaleido
-numpy>=1.25.0 # test suite will fail if user installed lower than this
+numpy==1.26.4 # ERROR  - ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject
 dask[dataframe]
 pyarrow >= 14.0.1 # 14.0.0 has security vulnerability
 osmium # has dependencies on `cmake` and `boost` which require brew install

diff --git a/src/transport_performance/osm/validate_osm.py b/src/transport_performance/osm/validate_osm.py
@@ -18,6 +18,8 @@
 * Find coordinates for node or way features
 * Plot the coordinates of a given list of node or way IDs
 """
+import os
+import warnings
 from pathlib import Path
 from typing import Union
 
@@ -37,6 +39,12 @@
 # ---------utilities-----------
 
 
+class PerformanceWarning(UserWarning):
+    """Operation may be slow."""
+
+    pass
+
+
 def _compile_tags(osmium_feature):
     """Return tag name value pairs.
 
@@ -541,6 +549,16 @@ def __init__(
         _is_expected_filetype(
             osm_pth, "osm_pth", check_existing=True, exp_ext=".pbf"
         )
+        self.large_file_thresh = 50000  # 50 KB
+        # implement performance warning on large OSM files.
+        osm_size = os.path.getsize(osm_pth)
+        if osm_size > self.large_file_thresh:
+            warnings.warn(
+                f"PBF file is {osm_size} bytes. Tag operations are expensive."
+                " Consider filtering the pbf file smaller than"
+                f" {self.large_file_thresh} bytes",
+                PerformanceWarning,
+            )
         tags = tag_collator()
         classnm = tags.__class__.__name__
         if classnm != "_TagHandler":
@@ -614,6 +632,9 @@ class FindLocations:
         Locations of nodes.
     __way_node_locs : dict
         Locations of nodes that belong to a way.
+    _osm_pth : Union[Path, str]
+        Path to osm file on disk. Used for method plot_ids() when include_tags
+        is True.
 
     """
 
@@ -630,6 +651,7 @@ def __init__(
         self.__node_locs = locs.node_locs
         self.__way_node_locs = locs.way_node_locs
         self.found_locs = dict()
+        self._osm_pth = osm_pth
 
     def _check_is_implemented(self, user_feature: str, param_nm: str) -> None:
         """If the requested feature is not node or way, raise."""
@@ -678,11 +700,123 @@ def check_locs_for_ids(self, ids: list, feature_type: str) -> dict:
         )
         return self.found_locs
 
+    def _merge_dicts_retain_dupe_keys(
+        self, dict1: dict, dict2: dict, prepend_pattern: str = "parent_"
+    ) -> dict:
+        """Squish 2 dictionaries while retaining any duplicated keys.
+
+        Update dict1 with key:value pairs from dict2. If duplicated keys are
+        found in dict2, prepend the key with prepend_pattern.
+
+        Parameters
+        ----------
+        dict1 : dict
+            Dictionary of (child or node) tags.
+        dict2 : dict
+            Dictionary of (parent) tags.
+        prepend_pattern : str
+            A string to prepend any duplicated keys in dict_2 with.
+
+        Returns
+        -------
+        dict
+            A merged dictionary, retaining key:value pairs from both.
+
+        """
+        tags_out = {}
+        for d in [dict1, dict2]:
+            if not isinstance(d, dict):
+                raise TypeError(f"Expected dict but found {type(d)}: {d}")
+        for id_, tags in dict1.items():  # child_tags is nested
+            # find duplicated keys and prepend parent keys
+            if dupes := set(tags.keys()).intersection(dict2.keys()):
+                for key in dupes:
+                    dict2[f"{prepend_pattern}{key}"] = dict2.pop(key)
+            # merge parent and child tag collections
+            tags_out[id_] = tags | dict2
+        return tags_out
+
+    def _add_tag_context_to_coord_gdf(  # noqa: C901
+        self, ids: list, feature_type: str, tooltip_nm: str
+    ) -> gpd.GeoDataFrame:
+        """Add a column of tooltips to the coord_gdf attribute.
+
+        Handles node and way features separately.
+
+        Parameters
+        ----------
+        ids : list
+            A list of IDs.
+        feature_type : str
+            "way" or "node".
+        tooltip_nm : str
+            Name of the column to use for the tooltips.
+
+        Returns
+        -------
+        None
+            Updates `coord_gdf` attribute.
+
+        """
+        mapping = {}
+        parent_tags = self.tagfinder.check_tags_for_ids(ids, feature_type)
+        self.coord_gdf[tooltip_nm] = self.coord_gdf.index.to_list()
+        if feature_type == "way":
+            parent_child_mapping = self.coord_gdf.index
+            # Now we have child IDs, we need to run them through FindTags
+            child_tags = self.tagfinder.check_tags_for_ids(
+                [i[-1] for i in parent_child_mapping], feature_type="node"
+            )
+            # add in the parent tag ID to all child tags
+            for k, v in child_tags.items():
+                for t in parent_child_mapping.to_flat_index():
+                    if k == t[-1]:
+                        v["parent_id"] = t[0]
+            # merge the parent way metadata dictionary with the child
+            # metadata dict
+            all_tags = parent_child_mapping.to_series().to_dict()
+            for k, v in parent_tags.items():
+                # iterate over only the children for each parent node
+                for id_ in [i for i in parent_child_mapping if i[0] == k]:
+                    all_tags[id_] = self._merge_dicts_retain_dupe_keys(
+                        {id_[-1]: child_tags[id_[-1]]}, v
+                    )
+            # add combined tags as custom tooltips to coord_gdf. Use map
+            # method to avoid lexsort performance warning
+            for _, v in all_tags.items():
+                for k, val in v.items():
+                    tooltips = [
+                        f"<b>{tag}:</b> {val_}<br>"
+                        for tag, val_ in val.items()
+                    ]
+                    mapping[(val["parent_id"], k)] = "".join(tooltips)
+
+        elif feature_type == "node":
+            for k, val in self.tagfinder.found_tags.items():
+                tooltips = [
+                    f"<b>{tag}:</b> {val_}<br>" for tag, val_ in val.items()
+                ]
+                mapping[k] = "".join(tooltips)
+
+        self.coord_gdf[tooltip_nm] = self.coord_gdf[tooltip_nm].map(mapping)
+        return None
+
     def plot_ids(
         self,
         ids: list,
         feature_type: str,
         crs: Union[str, int] = "epsg:4326",
+        include_tags: bool = False,
+        tooltip_nm: str = "custom_tooltip",
+        tooltip_kwds: dict = {"labels": False},
+        tiles: str = "CartoDB positron",
+        style_kwds: dict = {
+            "color": "#3f5277",
+            "fill": True,
+            "fillOpacity": 0.3,
+            "fillColor": "#3f5277",
+            "weight": 4,
+        },
     ) -> folium.Map:
         """Plot coordinates for nodes or node members of a way.
 
@@ -698,6 +832,26 @@ def plot_ids(
             Whether the type of OSM feature to plot is node or way.
         crs : Union[str, int], optional
             The projection of the spatial features, by default "epsg:4326"
+        include_tags : bool
+            Should tag metadata be included in the map tooltips, by default
+            False
+        tooltip_nm : str
+            Name to use for tooltip column in coord_gdf attribute, by default
+            "custom_tooltip"
+        tooltip_kwds : dict
+            Additional tooltip styling arguments to pass to gpd explore(), by
+            default {"labels": False}
+        tiles : str
+            Basemap provider tiles to use, by default "CartoDB positron"
+        style_kwds : dict
+            Additional map styling arguments to pass to gpd explore(), by
+            default {
+                "color": "#3f5277",
+                "fill": True,
+                "fillOpacity": 0.3,
+                "fillColor": "#3f5277",
+                "weight": 4,
+                }
 
         Returns
         -------
@@ -719,6 +873,7 @@ def plot_ids(
         _type_defence(ids, "ids", list)
         _type_defence(feature_type, "feature_type", str)
         _type_defence(crs, "crs", (str, int))
+        _type_defence(include_tags, "include_tags", bool)
         self._check_is_implemented(
             user_feature=feature_type, param_nm="feature_type"
         )
@@ -728,4 +883,19 @@ def plot_ids(
             feature_type=feature_type,
             crs=crs,
         )
-        return self.coord_gdf.explore()
+        if not include_tags:
+            imap = self.coord_gdf.explore(tiles=tiles, style_kwds=style_kwds)
+        else:
+            # retrieve tags for IDs and add them to self.coord_gdf
+            self.tagfinder = FindTags(self._osm_pth)
+            self._add_tag_context_to_coord_gdf(
+                ids, feature_type, tooltip_nm=tooltip_nm
+            )
+            imap = self.coord_gdf.explore(
+                tooltip=tooltip_nm,
+                tooltip_kwds=tooltip_kwds,
+                tiles=tiles,
+                style_kwds=style_kwds,
+            )
+
+        return imap
diff --git a/tests/osm/test_validate_osm.py b/tests/osm/test_validate_osm.py
@@ -11,6 +11,7 @@
     FindIds,
     FindLocations,
     FindTags,
+    PerformanceWarning,
     _convert_osm_dict_to_gdf,
     _filter_target_dict_with_list,
 )
@@ -321,6 +322,19 @@ def test_find_locations_init(self, _tiny_osm_locs):
             way_len == 2
         ), f"Expected way with length 2, instead found {way_len}"
 
+    def test__merge_dicts_retain_dupe_keys_raises(self, _tiny_osm_locs):
+        """Test internal raises TypeError."""
+        with pytest.raises(
+            TypeError,
+            match=re.escape(
+                "Expected dict but found <class 'list'>: ['not a key', 2]"
+            ),
+        ):
+            _tiny_osm_locs._merge_dicts_retain_dupe_keys(
+                dict1={"some_key": 1},
+                dict2=["not a key", 2],
+            )
+
     def test_check_locs_for_ids(self, _tiny_osm_locs, _tiny_osm_ids):
         """Assert check_locs_for_ids."""
         ids = _tiny_osm_ids
@@ -368,8 +382,42 @@ def test_plot_ids_on_pass(self, _tiny_osm_locs, _tiny_osm_ids):
             ids=ids._FindIds__node_ids[0:1], feature_type="node"
         )
         assert isinstance(plt, folium.Map)
+        plt = locs.plot_ids(
+            ids=ids._FindIds__node_ids[0:1],
+            feature_type="node",
+            include_tags=True,
+        )
+        assert isinstance(plt, folium.Map)
+        # check the tag column is as expected - for nodes, this example should
+        # be empty, nodes often contain no tags, but not always
+        pd.testing.assert_series_equal(
+            locs.coord_gdf["custom_tooltip"],
+            pd.Series([""], index=[7727955], name="custom_tooltip"),
+        )
+        assert locs.coord_gdf["custom_tooltip"].values == [""]
         plt = locs.plot_ids(ids=ids._FindIds__way_ids[0:1], feature_type="way")
         assert isinstance(plt, folium.Map)
+        plt = locs.plot_ids(
+            ids=ids._FindIds__way_ids[0:1],
+            feature_type="way",
+            include_tags=True,
+        )
+        # check the tag column is as expected - for ways, these should always
+        # include at least the parent_id tag.
+        pd.testing.assert_series_equal(
+            locs.coord_gdf["custom_tooltip"],
+            pd.Series(
+                [
+                    "<b>crossing:</b> marked<br><b>highway:</b> crossing<br><b>tactile_paving:</b> yes<br><b>parent_id:</b> 4811009<br><b>lanes:</b> 2<br><b>name:</b> Kingsway<br><b>oneway:</b> yes<br><b>postal_code:</b> NP20<br><b>ref:</b> A4042<br><b>parent_highway:</b> primary<br>",  # noqa E501
+                    "<b>parent_id:</b> 4811009<br><b>lanes:</b> 2<br><b>name:</b> Kingsway<br><b>oneway:</b> yes<br><b>postal_code:</b> NP20<br><b>ref:</b> A4042<br><b>parent_highway:</b> primary<br>",  # noqa E501
+                ],
+                index=pd.MultiIndex.from_tuples(
+                    [(4811009, 7447008812), (4811009, 443158788)],
+                    names=["parent_id", "member_id"],
+                ),
+                name="custom_tooltip",
+            ),
+        )
 
     def test_plot_ids_not_implemented(self, _tiny_osm_locs):
         """Assert asking for relation or area riases not implemented error."""
@@ -419,6 +467,18 @@ def test_find_tags_init(self, _tiny_osm_tags):
         ]
         _class_atttribute_assertions(tags, expected_attrs, expected_methods)
 
+    @pytest.mark.runexpensive
+    def test_find_tags_init_warning(self):
+        """Test that large OSM files trigger a performance warning.
+
+        execution duration c.80 seconds.
+        """
+        with pytest.warns(
+            PerformanceWarning,
+            match=".*Consider filtering the pbf file smaller than 50000 bytes",
+        ):
+            FindTags(here("tests/data/newport-2023-06-13.osm.pbf"))
+
     def test_find_tags_check_tags_for_ids(self, _tiny_osm_tags, _tiny_osm_ids):
         """Test FindTags.check_tags_for_ids()."""
         ids = _tiny_osm_ids