From 6d0f265136311390fabb41d31a22c61581933c44 Mon Sep 17 00:00:00 2001 From: ethan-moss <106239610+ethan-moss@users.noreply.github.com> Date: Wed, 9 Aug 2023 15:42:57 +0100 Subject: [PATCH] 28 preprocess raster pop (#44) * chore: pulled in pop requirements and package CI * feat: added pop class skeletons * feat: added skeleton of plot method * feat: filepath defences; read crs * feat: added read and clip * feat: aoi bounds crs conversion * feat: added rounding preprocess step * feat: added threshold preprocess step * refactor: swapped argument order in get_pop * feat: added NotImplementedError to VectorPop * feat: added vectorize to geopandas * feat: added pop var_name * feat: added cell id * feat: added centroid location and dataframe * refactor: mangaled round var * feat: added within urban centre classification * feat: added interative plot basis * feat: added aoi to interactive plot * feat: save interactive to file * docs: add folium plot docstring * feat: added kwargs to plot folium * refactor: within urban centre var name * feat: plot method logic control added * docs: added attributes and raises to class docstring * feat: added output to get_pop * feat: added matplotlib plot * fix: dropped boundary column after sjoin * feat: added kwargs to matplotlib plot; added kwargs docs help * feat: skelton of cartopy plot func * feat: added save to cartopy plot * feat: added figsave kwarg to cartopy plot * feat: added kwargs to cartopy plot; location of attr modifiable * refactor: popu classes into respective modules * refactor: var_gdf to pop_gdf * test: started TestRasterPop * test: added fixtures for aoi and uc * test: added interl methods test; tweak aoi and uc polygons * refactor: _read_and_clip sets xds internally * test: fixed floating point rounding * test: added _read_and_clip test * refactor: moved round var from private into _to_geopandas() * test: added geopandas conversion pop check * test: added _within_urban_centre asserts * test: centroid gdf crs * chore: added pytest lazy fixture * test: added round and threshold tests * refactor: removed unnecessary exists defence * test: added raises tests * feat: added uc crs conversion * test: aoi and uc crs conversion * chore: removed utils placeholder * test: updated file docstring * test: added light geometry and centroid tests * docs: reviewed docsrings/comments in RasterPop * docs: reviewed docsrings/comments in RasterPop test * chore: Add rioxarray to reqs * refactor: Move copied helper function to src * refcator: Rm unneeded import statement --------- Co-authored-by: r-leyshon --- requirements.txt | 2 + .../population/__init__.py | 4 + .../population/rasterpop.py | 707 ++++++++++++++++++ .../population/vectorpop.py | 14 + src/transport_performance/utils/test_utils.py | 57 ++ tests/population/test_rasterpop.py | 510 +++++++++++++ tests/utils/test_raster.py | 62 +- 7 files changed, 1299 insertions(+), 57 deletions(-) create mode 100644 src/transport_performance/population/__init__.py create mode 100644 src/transport_performance/population/rasterpop.py create mode 100644 src/transport_performance/population/vectorpop.py create mode 100644 src/transport_performance/utils/test_utils.py create mode 100644 tests/population/test_rasterpop.py diff --git a/requirements.txt b/requirements.txt index f0df1e24..c080881f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,5 +13,7 @@ cartopy folium geocube mapclassify +pytest-lazy-fixture seaborn +rioxarray -e . diff --git a/src/transport_performance/population/__init__.py b/src/transport_performance/population/__init__.py new file mode 100644 index 00000000..df4fd1de --- /dev/null +++ b/src/transport_performance/population/__init__.py @@ -0,0 +1,4 @@ +"""`transport_performance.population`. + +See `README.md` for more details. +""" diff --git a/src/transport_performance/population/rasterpop.py b/src/transport_performance/population/rasterpop.py new file mode 100644 index 00000000..be7de416 --- /dev/null +++ b/src/transport_performance/population/rasterpop.py @@ -0,0 +1,707 @@ +"""Class to handle raster population data.""" + +import geopandas as gpd +import os +import numpy as np +import rasterio as rio +import rioxarray +import folium +import cartopy.crs as ccrs +import cartopy.io.img_tiles as cimgt +import matplotlib.pyplot as plt + +from datetime import datetime +from geocube.vector import vectorize +from typing import Union, Type, Tuple +from shapely.geometry.polygon import Polygon +from matplotlib import colormaps +from cartopy.mpl.geoaxes import GeoAxes + + +class RasterPop: + """Prepare raster population inputs for trasport analysis. + + This class is suited to working with rastered population data (e.g. + gridded population estimates). + + Parameters + ---------- + filepath : Union[str, bytes, os.PathLike] + File path to population data + + Attributes + ---------- + pop_gdf : gpd.GeoDataFrame + A geopandas dataframe of the input data, with gridded geometry. This is + in the same CRS as the input raster data. + centroid_gdf + A geopandas dataframe of grid centroids, converted to EPSG:4326 for + transport analysis. + + Methods + ------- + get_pop + Read and preprocess population estimates into a geopandas dataframe. + plot + Build static and interactive visualisations of population data. Can + only use this method once `get_pop` has been called. + + Raises + ------ + FileNotFoundError + When `filepath` is not a file or can not be found. + + """ + + def __init__(self, filepath: Union[str, bytes, os.PathLike]) -> None: + + # defend against cases where input is not a file and does not exist + if not os.path.isfile(filepath): + raise FileNotFoundError(f"{filepath} is not a file.") + + self.__filepath = filepath + + # record the crs of the data source without reading in data + with rio.open(filepath) as src: + self.__crs = src.crs.to_string() + + # set attributes to None to prevent plotting function calls + self.pop_gdf = None + self._uc_gdf = None + + def get_pop( + self, + aoi_bounds: Type[Polygon], + aoi_crs: str = None, + round: bool = False, + threshold: int = None, + var_name: str = "population", + urban_centre_bounds: Type[Polygon] = None, + urban_centre_crs: str = None, + ) -> Tuple[gpd.GeoDataFrame, gpd.GeoDataFrame]: + """Get population data. + + Parameters + ---------- + aoi_bounds : Type[Polygon] + A shapely polygon defining the boundary of the area of interest. + Assumed to be in the same CRS as the rastered population data. If + it is different, set `aoi_crs` to the CRS of this boundary. + aoi_crs : str, optional + CRS string for `aoi_bounds` (e.g. "EPSG:4326"), by default None + which means it is assumed to have the same CRS as the population + data. + round : bool, optional + Round population estimates to the nearest whole integer. + threshold : int, optional + Threshold population estimates, where values below the set + threshold will be set to nan, by default None which means no + thresholding will occur. + var_name : str, optional + The variable name, by default "population" + urban_centre_bounds : Type[Polygon], optional + Polygon defining an urban centre bounday, by default None meaning + information concerning whether the grid resides within the urban + centre will not be added. + urban_centre_crs : str, optional + The urban centre polygon CRS, by default None meaning this is the + same CRS as the input raster data. Only used when + `urban_centre_bounds` is set. + + Returns + ------- + pop_gdf : gpd.GeoDataFrame + A geopandas dataframe of the input data, with gridded geometry. + This is in the same CRS as the input raster data. + centroid_gdf + A geopandas dataframe of grid centroids, converted to EPSG:4326 for + transport analysis. + + """ + # read and clip population data to area of interest + self._read_and_clip(aoi_bounds, aoi_crs, var_name) + + # round population estimates, if requested + if round: + self._round_population() + + # threshold the population data, if requested + if threshold is not None: + self._threshold_population(threshold) + + # convert to variable and centroid geopandas dataframes + self._to_geopandas(round) + + # add within urban centre details + if urban_centre_bounds is not None: + self._within_urban_centre( + urban_centre_bounds, urban_centre_crs=urban_centre_crs + ) + + return self.pop_gdf, self.centroid_gdf + + def plot( + self, which: str = "folium", save: str = None, **kwargs + ) -> Union[folium.Map, Type[GeoAxes], plt.Axes, None]: + """Plot population data. + + Parameters + ---------- + which : str, optional + Package to use for plotting. Must be one of {"matplotlib", + "cartopy", "folium"}, by default "folium". + save : str, optional + Filepath to save file, with the file extension, by default None + meaning a file will not be saved. + **kwargs + Extra arguments passed to plotting functions to configure the plot + styling. See Notes for more support. + + Returns + ------- + Union[folium.Map, Type[GeoAxes], plt.Axes, None] + A folium map is returned when the `folium` backend is used. A + matplotlib Axes object is returned when the `matplotlib` backend is + used. A matplotlib/cartopy GeoAxes object is return when the + `cartopy` backend is used. None will be returned when saving to + file. + + Raises + ------ + ValueError + Unexpected value of `which`. + NotImplementedError + When plot is called without reading data. + + Notes + ----- + Calling `help` as follows will provide more insights on possible kwarg + arguments for the valid plotting backends: + - Folium backend: `help(RasterPop._plot_folium) + - Matplotlib backend: `help(RasterPop._plot_matplotlib) + - Cartopy backend: `help(RasterPop._plot_cartopy) + + """ + # record of valid which values + WHICH_VALUES = {"matplotlib", "catropy", "folium"} + + # defend against case where `get_pop` hasn't been called + if self.pop_gdf is None: + raise NotImplementedError( + "Unable to call `plot` without calling `get_pop`." + ) + + if which == "folium": + m = self._plot_folium(save, **kwargs) + return m + elif which == "cartopy": + ax = self._plot_cartopy(save, **kwargs) + return ax + elif which == "matplotlib": + ax = self._plot_matplotlib(save, **kwargs) + return ax + else: + raise ValueError( + f"Unrecognised value for `which` {which}. Must be one of " + f"{WHICH_VALUES}." + ) + + def _read_and_clip( + self, + aoi_bounds: Type[Polygon], + aoi_crs: str = None, + var_name: str = "population", + ) -> None: + """Open data and clip to the area of interest boundary. + + Read and clip raster file from disk (more performant). Mask the data + (such that no data values are nan) and read in all grids that touch + the aoi boundary. + + Parameters + ---------- + aoi_bounds : Type[Polygon] + A shapely polygon defining the boundary of the area of interest. + aoi_crs : str, optional + CRS string for `aoi_bounds` (e.g. "EPSG:4326"), by default None + which means it is assumed to have the same CRS as `aoi_bounds`. + var_name : str, optional + The variable name, by default "population". + + """ + # defend against case where aoi_bounds is not a shapely polygon + if not isinstance(aoi_bounds, Polygon): + raise TypeError( + f"Expected type {Polygon.__name__} for `aoi_bounds`, " + f"got {type(aoi_bounds).__name__}." + ) + + # convert aoi bounds CRS if needed + if aoi_crs is not None: + gdf = gpd.GeoDataFrame(geometry=[aoi_bounds], crs=aoi_crs) + gdf = gdf.to_crs(self.__crs) + aoi_bounds = gdf.loc[0, "geometry"] + + # open and clip raster data - using `all_touched` method to include any + # cell touched and `from_disk` for improved reading speeds. + self._xds = rioxarray.open_rasterio( + self.__filepath, masked=True + ).rio.clip([aoi_bounds], from_disk=True, all_touched=True) + + # set the variable name - set internal variable for reuse in class + self._xds.name = var_name + self.__var_name = var_name + + # build aoi bounds dataframe for plotting + self._aoi_gdf = gpd.GeoDataFrame(geometry=[aoi_bounds], crs=self.__crs) + self._aoi_gdf.loc[:, "boundary"] = "AOI Bounds" + + def _round_population(self) -> None: + """Round population data.""" + self._xds = np.rint(self._xds) + + def _threshold_population(self, threshold: int) -> None: + """Threshold population data.""" + # `where()` is working differently to expectation here - it keeps + # values above the threshold and otherwise sets them to nan. + self._xds = self._xds.where(self._xds >= threshold) + + def _to_geopandas(self, round: bool = False) -> None: + """Convert to geopandas dataframe.""" + # vectorise to geopandas dataframe, setting data type to np.float32 - + # dtype is required by vecotrize function. Squeeze needed since shape + # is (1xnxm) (1 band in tiff file) + self.pop_gdf = vectorize(self._xds.squeeze(axis=0).astype(np.float32)) + + # dropna to remove nodata regions and those below threshold (if set) + self.pop_gdf = self.pop_gdf.dropna(subset=self.__var_name).reset_index( + drop=True + ) + + # if rounding, cast to int after removing na values + if round: + self.pop_gdf[self.__var_name] = self.pop_gdf[ + self.__var_name + ].astype("int") + + # add an id for each cell - needed for r5py + self.pop_gdf["id"] = np.arange(0, len(self.pop_gdf.index)) + + # re-order columns for consistency + self.pop_gdf = self.pop_gdf[["id", self.__var_name, "geometry"]] + + # create centroid geodataframe using only necessary columns - save on + # duplicated data, can join gdfs on id if ever needed. + self.centroid_gdf = self.pop_gdf[["id", "geometry"]].copy() + self.centroid_gdf["centroid"] = self.centroid_gdf.geometry.centroid + self.centroid_gdf.set_geometry("centroid", inplace=True) + self.centroid_gdf.drop("geometry", axis=1, inplace=True) + + # convert centroids to EPSG:4326 for use in r5py + if self.__crs != "EPSG:4326": + self.centroid_gdf = self.centroid_gdf.to_crs("EPSG:4326") + + def _within_urban_centre( + self, + urban_centre_bounds: Type[Polygon], + urban_centre_crs: str = None, + ) -> None: + """Categorise grid whether within urban centre. + + Parameters + ---------- + urban_centre_bounds : Type[Polygon] + Polygon defining urban centre bounday + urban_centre_crs : str, optional + The urban centre polygon CRS, by default None meaning this is the + same CRS as the input raster data. + + Raises + ------ + TypeError + When `urban_centre_bounds` is not a shapely Polygon. + + """ + # defend against case where urban_centre_bounds is not a polygon + if not isinstance(urban_centre_bounds, Polygon): + raise TypeError( + f"Expected type {Polygon.__name__} for `urban_centre_bounds`, " + f"got {type(urban_centre_bounds).__name__}." + ) + + # match the crs if is one isn't provided - default assumption if this + # arg is not set, and a CRS is needed when building the gdf below + if urban_centre_crs is None: + urban_centre_crs = self.__crs + + # build urban centre dataframe - set within column to true for sjoin + # such that nan values post join imply no within urban centre. Add an a + # column for the boundary name - only useful for plotting. + self.__UC_COL_NAME = "within_urban_centre" + self._uc_gdf = gpd.GeoDataFrame( + geometry=[urban_centre_bounds], crs=urban_centre_crs + ) + self._uc_gdf.loc[:, self.__UC_COL_NAME] = True + self._uc_gdf.loc[:, "boundary"] = "Urban Centre" + + # if the provided crs does not match the raster crs, then convert the + # crs before the sjoin + if self._uc_gdf.crs != self.__crs: + self._uc_gdf = self._uc_gdf.to_crs(self.__crs) + + # spatial join when cell is within urban centre, filling nan to false. + # drop index_right and boundary columns as they aren't needed. + self.pop_gdf = self.pop_gdf.sjoin( + self._uc_gdf, how="left", predicate="within" + ).drop(["index_right", "boundary"], axis=1) + self.pop_gdf[self.__UC_COL_NAME] = self.pop_gdf[ + self.__UC_COL_NAME + ].fillna(False) + + # add within_urban_centre column to centroid data too - could be useful + # during selecting sources/destinations, so duplication is OK. + self.centroid_gdf = self.centroid_gdf.merge( + self.pop_gdf[["id", self.__UC_COL_NAME]], on="id" + ) + + def _plot_folium( + self, + save: str = None, + tiles: str = ( + "https://{s}.basemaps.cartocdn.com/light_all/{z}/{x}/{y}{r}.png" + ), + attr: str = ( + '© OpenStre' + 'etMap contributors © CARTO' + ), + cmap: str = "viridis", + boundary_color: str = "red", + boundary_weight: int = 2, + ) -> folium.Map: + """Plot data onto a folium map. + + Parameters + ---------- + save : str, optional + Filepath to save location, with ".html" extension, by default None + meaning the map will not be saved to file. + tiles : str, optional + Tile layer for base map, by default Carto Positron tiles. As per + folium, the base map can be generated using a built in key words + [1]_ or a custom tileset url [2]_. + attr : str, optional + Tile layer attribution, by default Carto Positron attribution. + cmap : str, optional + A colormap string recognised by `matplotlib` [3]_, by default + "viridis". + boundary_color : str, optional + Color of the boundary lines, by default "red". Could also be a + hexstring. + boundary_weight : float, optional + Weight (in pixels) of the boudary lines, by default 2. + + Returns + ------- + folium.Map + Folium map obeject, with data and boundaries in layers. Will be + None when writing to file (saves time when displaying map + interactive mode). + + Notes + ----- + 1. When using tile layers, check the tile provider's terms and + conditions and assign an attribution as needed. + + References + ---------- + .. [1] https://python-visualization.github.io/folium/modules.html + .. [2] http://leaflet-extras.github.io/leaflet-providers/preview/ + .. [3] https://matplotlib.org/stable/tutorials/colors/colormaps.html + + """ + # add a base map with no tile - then is it not on a layer control + m = folium.Map(tiles=None, control_scale=True, zoom_control=True) + + # add a tile layer + folium.TileLayer( + tiles=tiles, + attr=attr, + show=False, + control=False, + ).add_to(m) + + # add the variable data to the map + self.pop_gdf.explore( + self.__var_name, cmap=cmap, name=self.__var_name.capitalize(), m=m + ) + + # add the urban centre boundary, if one was provided + if self._uc_gdf is not None: + self._uc_gdf.explore( + tooltip=["boundary"], + name="Urban Centre Boundary", + m=m, + style_kwds={ + "fill": False, + "color": boundary_color, + "weight": boundary_weight, + }, + ) + + # add the area of interest boundary + self._aoi_gdf.explore( + tooltip=["boundary"], + name="Area of Interest Boundary", + m=m, + style_kwds={ + "fill": False, + "color": boundary_color, + "weight": boundary_weight, + "dashArray": 7, + }, + ) + + # add the centroids to a separate layer + self.centroid_gdf.explore( + self.__UC_COL_NAME, + name="Centroids", + m=m, + show=False, + style_kwds={ + "style_function": lambda x: { + "color": "#BC544B" + if x["properties"][self.__UC_COL_NAME] is False + else "#8B0000" + } + }, + legend=False, + ) + + # fit bounds to the plot limits and add a layer control button + m.fit_bounds(m.get_bounds()) + m.add_child(folium.LayerControl()) + + # write to file if filepath is given + if save is not None: + out_df = os.path.dirname(save) + if not os.path.exists(out_df): + os.mkdir(out_df) + m.save(save) + m = None + + return m + + def _plot_cartopy( + self, + save: str = None, + figsize: tuple = (10, 8), + map_tile_zoom: int = 12, + cmap: str = "viridis", + cbar_fraction: float = 0.034, + cbar_pad: float = 0.04, + cbar_label: str = "Population count per cell", + var_attr: str = "GHS-POP 2020 (R2023)", + base_map_attr: str = "(C) OpenSteetMap contributors", + attr_font_size: float = 7.5, + attr_location: str = "bottom_left", + ) -> Union[Type[GeoAxes], None]: + """Plot data via cartopy (static plot with an OpenStreetMap base tile). + + Parameters + ---------- + save : str, optional + Filepath to save location, with ".png" extension, by default None + meaning the map will not be saved to file. + figsize : tuple, optional + The matplotlib figure size width and height in inches, by default + (10, 8). + map_tile_zoom : int, optional + The zoom level for the OpenSteetMap base tile layer, by default 12 + which is suitable for towns/cities. See [1]_ for more details on + the zoom level. + cmap : str, optional + A colormap string recognised by `matplotlib` [2]_, by default + "viridis". + cbar_fraction : float, optional + Fractional size of the colour bar. Use this value to change the + height of the colour bar, by default 0.034. + cbar_pad : float, optional + Amount of padding between the plot and the colourbar, by default + 0.04. + cbar_label : str, optional + Colour label, by default "Population count per cell". + var_attr : str, optional + Attribution to assign for variable/population data, by default + "GHS-POP 2020 (R2023)". + base_map_attr : str, optional + Base map tile attribution, by default "(C) OpenSteetMap + contributors". + attr_font_size : float, optional + Font size of the attribution text, by default 7.5. + attr_location : str, optional + Location of the attribution. Must be one of {"bottom_left", + "bottom_right", "top_right", "top_left"}, by default "bottom_left". + + Returns + ------- + Union[Type[GeoAxes], None] + A matplotlib/cartopy GeoAxes displaying the data ontop of an + OpenStreetMap base tile. Returns None when writing to file. + + Raises + ------ + ValueError + When `attr_location` is assigned an unexpected value. + + References + ---------- + .. [1] https://wiki.openstreetmap.org/wiki/Zoom_levels + .. [2] https://matplotlib.org/stable/tutorials/colors/colormaps.html + + """ + # make attribution location dictionary + ATTR_LOC = { + "bottom_left": { + "x": 0.01, + "y": 0.01, + "ha": "left", + "va": "bottom", + }, + "bottom_right": { + "x": 0.99, + "y": 0.01, + "ha": "right", + "va": "bottom", + }, + "top_right": {"x": 0.99, "y": 0.99, "ha": "right", "va": "top"}, + "top_left": {"x": 0.01, "y": 0.99, "ha": "left", "va": "top"}, + } + + # get OpenStreetMap tile layer object in greyscale + map_tile = cimgt.OSM(desired_tile_form="L") + + # build plot axis and add map tile + ax = plt.axes(projection=map_tile.crs) + ax.figure.set_size_inches(*figsize) + data_crs = ccrs.Mollweide() + ax.add_image(map_tile, map_tile_zoom, cmap="gray") + + # build a colormap and add pcolormesh plot data, setting vmin and vmax + # to match the whole colormap range. transform to data_crs required to + # project onto base map. + plot_cmap = colormaps.get_cmap(cmap) + plot_data = self._xds.squeeze(axis=0).to_numpy() + vmin_data = np.nanmin(plot_data) + vmax_data = np.nanmax(plot_data) + ctf = ax.pcolormesh( + self._xds.squeeze().x.to_numpy(), + self._xds.squeeze().y.to_numpy(), + plot_data, + cmap=plot_cmap, + vmin=vmin_data, + vmax=vmax_data, + transform=data_crs, + ) + + # add a colorbar - converting format of smaller numbers to exponents + # modify the y label and reformat the tick axis to show min and max + cbar = plt.colorbar( + ctf, + ax=ax, + fraction=cbar_fraction, + pad=cbar_pad, + format=lambda x, _: f"{x:.0E}" if x <= 1 else f"{x:.0f}", + ) + cbar.ax.set_ylabel(cbar_label, rotation=270, labelpad=20) + cbar.set_ticks( + np.concatenate( + [ + np.array([vmin_data]), + cbar.get_ticks()[1:-1], + np.array([vmax_data]), + ] + ) + ) + + # defend against case attr_location is invalid + if attr_location not in ATTR_LOC.keys(): + raise ValueError( + f"Unrecognised value for `attr_location` {attr_location}." + f"Expecting one of {list(ATTR_LOC.keys())}" + ) + + # create an attribution string and add it to the axis + grid_res = self._xds.rio.resolution() + attribution = ( + f"Generated on: {datetime.strftime(datetime.now(), '%Y-%m-%d')}\n" + f"Population data: {var_attr}\n" + f"Grid Size: {abs(grid_res[0])}m x {abs(grid_res[1])}m\n" + f"Base map: {base_map_attr}" + ) + ax.text( + ATTR_LOC[attr_location]["x"], + ATTR_LOC[attr_location]["y"], + attribution, + transform=ax.transAxes, + size=attr_font_size, + wrap=True, + fontdict={"name": "Arial", "color": "#000000"}, + va=ATTR_LOC[attr_location]["va"], + ha=ATTR_LOC[attr_location]["ha"], + ) + + # use tight layout to maximise axis size of axis + plt.tight_layout() + + # write to file if filepath is given, since there is no figure, need to + # get the current figure and resize it to match the axis before saving + if save is not None: + out_df = os.path.dirname(save) + if not os.path.exists(out_df): + os.mkdir(out_df) + fig = plt.gcf() + fig.set_size_inches(*figsize) + fig.savefig(save) + ax = None + + return ax + + def _plot_matplotlib( + self, save: str = None, figsize: tuple = (6.4, 4.8) + ) -> Union[plt.Axes, None]: + """Plot raster data using matplotlib. + + A shallow wrapper around rioxarray's plot function, to display the + population as a raster using the matplotlib backend. + + Parameters + ---------- + save : str, optional + Filepath to save location, with ".png" extension, by default None + meaning the plot will not be saved to file. + figsize : tuple, optional + The matplotlib figursize width and height in inches, by default + (6.4, 4.8). + + Returns + ------- + Union[plt.Axes, None] + Returns a QuadMesh axis object when not writing to file. When + writing to file, None is return. + + """ + # handle matplotlib and rioxarry steps + fig, ax = plt.subplots(figsize=figsize) + self._xds.plot(ax=ax) + plt.tight_layout() + + # write to file if filepath is given + if save is not None: + out_df = os.path.dirname(save) + if not os.path.exists(out_df): + os.mkdir(out_df) + fig.savefig(save) + ax = None + + return ax diff --git a/src/transport_performance/population/vectorpop.py b/src/transport_performance/population/vectorpop.py new file mode 100644 index 00000000..187dc42a --- /dev/null +++ b/src/transport_performance/population/vectorpop.py @@ -0,0 +1,14 @@ +"""Classes to handle vector population data.""" + + +class VectorPop: + """Prepare vector population inputs for trasport analysis. + + This class is suited to working with vectored population data (e.g. + population data defined within administitive boundaries). + + TODO: add methods and updated documentation. + """ + + def __init__(self) -> None: + raise NotImplementedError("This class has not yet been implemented.") diff --git a/src/transport_performance/utils/test_utils.py b/src/transport_performance/utils/test_utils.py new file mode 100644 index 00000000..9e2dc031 --- /dev/null +++ b/src/transport_performance/utils/test_utils.py @@ -0,0 +1,57 @@ +"""Internal helpers for testing.""" +import numpy as np +import rasterio as rio +import xarray as xr + + +def _np_to_rioxarray( + arr: np.ndarray, + aff: rio.Affine, + as_type: str = "float32", + no_data: int = -200, + crs: str = "ESRI:54009", +) -> xr.DataArray: + """Convert numpy array to rioxarry. + + This function is only used within pytest, as a convinent way to build + fixtures without duplicating code. + + Parameters + ---------- + arr : np.ndarray + Input numpy array + aff : rio.Affine + Affine transform, for input data + as_type : str, optional + Data type, by default "int16" + no_data : int, optional + Value to use for no data, by default -200 + crs : _type_, optional + Coordinate Reference system for input data, by default "ESRI:54009" + + Returns + ------- + xr.DataArray + Input numpy array as an xarray.DataArray with the correct + + """ + # get geometry of input arr and generate col, row indcies + height = arr.shape[0] + width = arr.shape[1] + cols = np.arange(width) + rows = np.arange(height) + + # transform indcies to x, y coordinates + xs, ys = rio.transform.xy(aff, rows, cols) + + # build x_array object + x_array = ( + xr.DataArray(arr.T, dims=["x", "y"], coords=dict(x=xs, y=ys)) + .transpose("y", "x") + .astype(as_type) + .rio.write_nodata(no_data) + .rio.write_transform(aff) + .rio.set_crs(crs) + ) + + return x_array diff --git a/tests/population/test_rasterpop.py b/tests/population/test_rasterpop.py new file mode 100644 index 00000000..5ae249d2 --- /dev/null +++ b/tests/population/test_rasterpop.py @@ -0,0 +1,510 @@ +"""Unit tests for transport_performance/population/rasterpop.py. + +Fixtures used in this file geographically represent a small area over ONS site +in Newport. The data vales do not represent any real or meaningfull measurments +, they are purely for use as a unit test and mimic input data used in this repo +. + +Note: this suite of tests does not cover plotting methods. TODO add plotting +unit tests. See issue #43. +""" + +import os +import pytest +import numpy as np +import rasterio as rio +import xarray as xr +import geopandas as gpd + +from typing import Type, Tuple +from shapely.geometry import Polygon, Point +from numpy.dtypes import Float64DType +from pytest_lazyfixture import lazy_fixture +from _pytest.python_api import RaisesContext + +from transport_performance.population.rasterpop import RasterPop +from transport_performance.utils.test_utils import _np_to_rioxarray + + +# value to test thresholding (removal of populations below this value) +THRESHOLD_TEST = 5 + + +@pytest.fixture +def xarr_1() -> xr.DataArray: + """Create a dummay xarray.DataFrame for RasterPop methods testing.""" + array_1 = np.array( + [ + [1.25, 2.0, 3.75, 4.0], + [5.5, 6.25, 7.5, 8.75], + [9.25, 10.5, 11.0, 12.0], + [13.75, 14.75, 15.25, 16.5], + ] + ) + transform_1 = rio.Affine(100, 0, -225800, 0, -100, 6036800) + xarray_1 = _np_to_rioxarray(array_1, transform_1) + + return xarray_1 + + +@pytest.fixture +def xarr_1_aoi(xarr_1: xr.DataArray) -> Tuple[Type[Polygon], dict]: + """Create dummy aoi polygon for xarr1. + + This is a cross shape pattern, that excludes the 4 corners of the xarr1. + + Parameters + ---------- + xarr_1 : xr.DataArray + Input dummy array, output from `xarr_1` fixture. + + Returns + ------- + Type[Polygon] + A polygon representing a dummy area of interest for xarr_1 + dict + A dictionary of expected results when applying the area of interest + polygon. Keys include: + - "post_clip": state of values after reading and clipping xarr_1. + - "geopandas": expected values of population column in geopandas df. + - "round": expected values following rounding + - "threshold": expected values following thresholding + - "grid": expected grid geometry for test cell + - "centroid": expected centroid coordinate for test cell. + + """ + coords = ( + (-225650, 6036750), + (-225650, 6036650), + (-225750, 6036650), + (-225750, 6036550), + (-225650, 6036550), + (-225650, 6036450), + (-225550, 6036450), + (-225550, 6036500), + (-225500, 6036500), + (-225500, 6036550), + (-225450, 6036550), + (-225450, 6036650), + (-225500, 6036650), + (-225500, 6036700), + (-225550, 6036700), + (-225550, 6036750), + (-225650, 6036750), + ) + + # build a dictionary to store expected results (to use during asserts) + expected = {} + + # expected result after reading and clipping the array + # set nan values in corners to match post aoi clipping expectations + # add extra dimension to match reading of a band in rioxarray + exp_post_clip = np.copy(xarr_1.to_numpy()) + exp_post_clip[0, 0] = np.nan + exp_post_clip[0, -1] = np.nan + exp_post_clip[-1, 0] = np.nan + exp_post_clip[-1, -1] = np.nan + exp_post_clip = np.expand_dims(exp_post_clip, axis=0) + + # expected results after converting to geopandas dataframe + # flatten to get 1-d array, and remove nans as per _to_geopandas() + exp_gpd = exp_post_clip.flatten() + exp_gpd = exp_gpd[~np.isnan(exp_gpd)] + + # extended results following rounding. Note: xarray/numpy round functions + # use round half to even method, so 10.5 gets rounded down to 10. Also + # 6 and 15 aren't repeated since geocube combines neighbouring cells with + # identical values into a larger polygon + exp_round = np.array([2, 4, 6, 8, 9, 9, 10, 11, 12, 15]) + + # expected result are setting a threshold of THRESHOLD_TEST + exp_threshold = exp_gpd[exp_gpd >= THRESHOLD_TEST] + + # test coords of one cell. light unit test since `geocube` unit tests the + # `vectorize` function here: https://github.com/corteva/geocube/blob/master + # /test/integration/test_vector.py#L10 + test_grid_idx = 1 + test_grid_coords = ( + (-225600, 6036800), + (-225600, 6036700), + (-225500, 6036700), + (-225500, 6036800), + (-225600, 6036800), + ) + test_grid_polygon = Polygon(test_grid_coords) + exp_test_grid = {"idx": test_grid_idx, "polygon": test_grid_polygon} + + # test a single centroid. light unit test since `geopandas` unit tests the + # `centroid` function here: https://github.com/geopandas/geopandas/blob/b4b + # 10313ab57bf2c55592a28fb99687c9a538fc2/geopandas/tests/test_geom_methods.p + # y#L778 + test_centoid_idx = 7 + test_centoid_coord = (-3.0300622961483463, 51.566809602591896) + test_centroid_point = Point(test_centoid_coord) + exp_test_centroid = {"idx": test_centoid_idx, "point": test_centroid_point} + + # updated the dictionary for returning + expected["post_clip"] = exp_post_clip + expected["geopandas"] = exp_gpd + expected["round"] = exp_round + expected["threshold"] = exp_threshold + expected["grid"] = exp_test_grid + expected["centroid"] = exp_test_centroid + + return Polygon(coords), expected + + +@pytest.fixture +def xarr_1_uc() -> Tuple[Type[Polygon], dict]: + """Create dummy urban centre polygon for xarr1. + + This is a rectangular pattern encompassing the 2 right hand central grids + of xarr1. + + Returns + ------- + Type[Polygon] + A polygon representing a dummy urban centre for xarr_1 + dict + A dictionary of expected results when applying the area of interest + polygon. Keys include: + - "within_uc": expected statuses (booleans) showing which grids are + within the dummy urban centre. + + """ + coords = ( + (-225600, 6036700), + (-225600, 6036500), + (-225500, 6036500), + (-225500, 6036700), + (-225600, 6036700), + ) + + # build a dictionary to store expected results (to use during asserts) + expected = {} + + # build an array of booleans indicating which cells are within the UC + exp_within_uc = np.array( + [ + False, + False, + False, + False, + True, + False, + False, + False, + True, + False, + False, + False, + ] + ) + + # update expected dictionary for returning + expected["within_uc"] = exp_within_uc + + return Polygon(coords), expected + + +@pytest.fixture +def xarr_1_fpath(xarr_1: xr.DataArray, tmp_path: str) -> str: + """Build temp directory to store dummy data to test read methods. + + Parameters + ---------- + xarr_1 : xr.DataArray + Dummy input data from `xarr_1` pytest fixture. + tmp_path : str + Temporary directory to use for pytest run. + + Returns + ------- + out_filepath : str + Filepath to dummy GeoTIFF data. + + """ + # create dir to write dummy data within the temp dir + write_dir = os.path.join(tmp_path, "test_file") + os.mkdir(write_dir) + + # write dummy data to the dir within the temp dir + out_filepath = os.path.join(write_dir, "input.tif") + xarr_1.rio.to_raster(out_filepath) + + return out_filepath + + +@pytest.fixture +def xarr_1_4326( + xarr_1_aoi: tuple, xarr_1_uc: tuple +) -> Tuple[Polygon, Polygon]: + """Build AOI and urban centre polygons in EPSG:4326. + + Parameters + ---------- + xarr_1_aoi : tuple + Output from `xarr_1_aoi`. + xarr_1_uc : tuple + Output from `xarr_1_uc`. + + Returns + ------- + Tuple[Polygon, Polygon] + A tuple of polygons in EPSG:4326. At the 0th index is the AOI and at + the 1st index is the urban centre. + + Note + ---- + The dummy urban centre boundary used in these unit tests is set to the + extremites of 2 cells (representive of real example). Conversion to + EPSG:4325 introduces floating point/rounding errors. This results in + the reconversion back to mollweide (the step being unit tested) is no + longer the precise extremeties. This means the grids are no longer with the + bounds of the re-converted urban centre geometry. Therefore, only for this + unit test, a buffer is applied to the urban centre before conversion (just + 1m). This is enough to negate the floating point/rounding errors and bring + the grids back within the converted urban centre boundary. This step is + not needed for the area of interest since floating point/round errors can + not lead to a conflicting scenario since there is always sufficent overlap + into the cells. + + """ + # convert aoi to EPSG:4326 + aoi_4326_gdf = gpd.GeoDataFrame( + geometry=[xarr_1_aoi[0]], crs="ESRI:54009" + ).to_crs("EPSG:4326") + + # convert uc to EPSG:4326 - see note for more details. + uc_4326_gdf = gpd.GeoDataFrame(geometry=[xarr_1_uc[0]], crs="ESRI:54009") + uc_4326_gdf["geometry"] = uc_4326_gdf.geometry.buffer(1, join_style=2) + uc_4326_gdf = uc_4326_gdf.to_crs("EPSG:4326") + + return aoi_4326_gdf.loc[0, "geometry"], uc_4326_gdf.loc[0, "geometry"] + + +class TestRasterPop: + """A class to test population.RasterPop methods.""" + + def test__raster_pop_internal_methods( + self, + xarr_1_fpath: str, + xarr_1_aoi: tuple, + xarr_1_uc: tuple, + ) -> None: + """Test all the internal methods of the RasterPop call. + + Test the performance of the base internal methods. This test is written + in this way since the internal methods are dependent on one another, + and this permits unit testing of each internal stage. + + Parameters + ---------- + xarr_1_fpath : str + Filepath to dummy data GeoTIFF file. Output from `xarr_1_fpath` + fixture. + xarr_1_aoi : tuple + A tuple containing the area of interest polygon for xarr_1 and a + dictionary of expected method outputs after applying the area of + interest polygon. For more information on the valid keys of this + dictionary see the docstring of the `xarr_1_aoi` fixture. + xarr_1_uc : tuple + A tuple containing the urban centre polygon for xarr_1 and a + dictionary of expected method outputs after applying the urban + centre polygon. For more information on the valid keys of this + dictionary see the docstring of the `xarr_1_uc` fixture. + + """ + # print fpath where input data resides in tmp folder + # useful when using -rP flag in pytest to see directory + print(f"Temp file path for tif input: {xarr_1_fpath}") + + rp = RasterPop(xarr_1_fpath) + + # call and test read and clip method asserting post clip expectation + rp._read_and_clip(aoi_bounds=xarr_1_aoi[0]) + assert np.array_equal( + rp._xds.to_numpy(), xarr_1_aoi[1]["post_clip"], equal_nan=True + ) + + # call and test _to_geopandas and assert to geopandas expectations + rp._to_geopandas() + assert np.array_equal( + rp.pop_gdf.population, xarr_1_aoi[1]["geopandas"] + ) + assert isinstance(rp.pop_gdf.population.dtype, Float64DType) + assert ( + rp.pop_gdf.geometry.iloc[xarr_1_aoi[1]["grid"]["idx"]] + == xarr_1_aoi[1]["grid"]["polygon"] + ) + assert rp.centroid_gdf.crs == "EPSG:4326" + assert ( + rp.centroid_gdf.geometry.iloc[xarr_1_aoi[1]["centroid"]["idx"]] + == xarr_1_aoi[1]["centroid"]["point"] + ) + + # call and test _within_urban_centre + rp._within_urban_centre(xarr_1_uc[0]) + assert "within_urban_centre" in rp.pop_gdf.columns + assert "within_urban_centre" in rp.centroid_gdf.columns + assert np.array_equal( + rp.pop_gdf.within_urban_centre, xarr_1_uc[1]["within_uc"] + ) + assert np.array_equal( + rp.centroid_gdf.within_urban_centre, xarr_1_uc[1]["within_uc"] + ) + + @pytest.mark.parametrize( + "round, threshold, expected, key", + [ + (True, None, lazy_fixture("xarr_1_aoi"), "round"), + (False, THRESHOLD_TEST, lazy_fixture("xarr_1_aoi"), "threshold"), + ], + ) + def test_rasterpop( + self, + round: bool, + threshold: int, + expected: tuple, + key: str, + xarr_1_fpath: str, + xarr_1_aoi: tuple, + xarr_1_uc: tuple, + var_name: str = "test_var_name", + ) -> None: + """Test RasterPop core functionalities. + + Parameters + ---------- + round : bool + Flag to set rounding pre-processing stage. Round population when + then flag is true. + threshold : int + Threshold to set for removing population grids below a minimum + level. None implies no thresholding will occur. + expected : tuple + Expected result to assert against. Defined and build inside the + `xarr_1_aoi` fixture. + key : str + Key to use in `xarr_1_aoi` dict output, to assert against. + xarr_1_fpath : str + Filepath to dummy data + xarr_1_aoi : tuple + Dummy area of interest. Output from `xarr_1_aoi` fixture. + xarr_1_uc : tuple + Dummy urban centre fixture. Output from `xarr_1_uc` fixture. + var_name : str, optional + Name of variable in raster file, by default "test_var_name" to + test this functionality of renaming the variable of interest. + + """ + rp = RasterPop(xarr_1_fpath) + + # get the population data for the dummy aoi and uc + pop_gdf, _ = rp.get_pop( + aoi_bounds=xarr_1_aoi[0], + round=round, + threshold=threshold, + var_name=var_name, + urban_centre_bounds=xarr_1_uc[0], + ) + + # assert the population values are as expected + assert np.array_equal(pop_gdf[var_name], expected[1][key]) + + @pytest.mark.parametrize( + "fpath, aoi_bounds, urban_centre_bounds, expected", + [ + ("test.tif", None, None, pytest.raises(FileNotFoundError)), + ( + lazy_fixture("xarr_1_fpath"), + ("test"), + None, + pytest.raises(TypeError), + ), + ( + lazy_fixture("xarr_1_fpath"), + lazy_fixture("xarr_1_aoi"), + "test", + pytest.raises(TypeError), + ), + ], + ) + def test_rasterpop_raises( + self, + fpath: str, + aoi_bounds: Tuple[str], + urban_centre_bounds: str, + expected: Type[RaisesContext], + ) -> None: + """Test raises statements in RasterPop. + + Parameters + ---------- + fpath : str + Filepath to dummy data. + aoi_bounds : Tuple[str] + Area of interest bounds, as a tuple where the 0th index is used ( + for consistency with the `xarr1_aoi` fixture). + urban_centre_bounds : str + Urban centre bounds test string. + expected : Type[RaisesContext] + Expected raise result. + + Note + ---- + The use of strings to compare with expected Polygon types is not + completely thourough, but is a basic unit test to ensure the raises + is thrown when there is a type mismatch. + + """ + # trigger the raise and ensure it matches the expected type. + with expected: + rp = RasterPop(fpath) + rp.get_pop( + aoi_bounds=aoi_bounds[0], + urban_centre_bounds=urban_centre_bounds, + ) + + def test_rasterpop_crs_conversion( + self, + xarr_1_fpath: str, + xarr_1_4326: tuple, + xarr_1_aoi: tuple, + xarr_1_uc: tuple, + ): + """Test the AOI and Urban Centre CRS conversion steps. + + Check population values and within urban centre categorisation + remains consistent dispite providing inputs in EPSG:4326. + + Parameters + ---------- + xarr_1_fpath : str + Filepath to dummy data + xarr_1_4326 : tuple + Area of interest and urban centre polygons in "EPSG:4326". Output + from the `xarr_1_4326` fixture. + xarr_1_aoi : tuple + Output from the `xarr_1_aoi` fixture. Contains expected results + to use for assertion (checking pop values remain consistent). + xarr_1_uc : tuple + _Output from the `xarr_1_uc` fixture. Contains expected results to + use for assertion (checking within urban centre bools are + consistent). + + """ + # instantiate RasterPop with AOI and UC in EPSG:4326 + rp = RasterPop(xarr_1_fpath) + pop_gdf, _ = rp.get_pop( + aoi_bounds=xarr_1_4326[0], + aoi_crs="EPSG:4326", + urban_centre_bounds=xarr_1_4326[1], + urban_centre_crs="EPSG:4326", + ) + + # check results remain consistent + assert np.array_equal(pop_gdf.population, xarr_1_aoi[1]["geopandas"]) + assert np.array_equal( + pop_gdf.within_urban_centre, xarr_1_uc[1]["within_uc"] + ) diff --git a/tests/utils/test_raster.py b/tests/utils/test_raster.py index fdd950db..249ede0d 100644 --- a/tests/utils/test_raster.py +++ b/tests/utils/test_raster.py @@ -19,59 +19,7 @@ merge_raster_files, sum_resample_file, ) - - -def np_to_rioxarry( - arr: np.ndarray, - aff: rio.Affine, - as_type: str = "float32", - no_data: int = -200, - crs: str = "ESRI:54009", -) -> xr.DataArray: - """Convert numpy array to rioxarry. - - This function is only used within pytest, as a convinent way to build - fixtures without duplicating code. - - Parameters - ---------- - arr : np.ndarray - Input numpy array - aff : rio.Affine - Affine transform, for input data - as_type : str, optional - Data type, by default "int16" - no_data : int, optional - Value to use for no data, by default -200 - crs : _type_, optional - Coordinate Reference system for input data, by default "ESRI:54009" - - Returns - ------- - xr.DataArray - Input numpy array as an xarray.DataArray with the correct - - """ - # get geometry of input arr and generate col, row indcies - height = arr.shape[0] - width = arr.shape[1] - cols = np.arange(width) - rows = np.arange(height) - - # transform indcies to x, y coordinates - xs, ys = rio.transform.xy(aff, rows, cols) - - # build x_array object - x_array = ( - xr.DataArray(arr.T, dims=["x", "y"], coords=dict(x=xs, y=ys)) - .transpose("y", "x") - .astype(as_type) - .rio.write_nodata(no_data) - .rio.write_transform(aff) - .rio.set_crs(crs) - ) - - return x_array +from transport_performance.utils.test_utils import _np_to_rioxarray @pytest.fixture @@ -84,7 +32,7 @@ def merge_xarr_1(): [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]] ) transform_1 = rio.Affine(100, 0, -225800, 0, -100, 6036800) - xarray_1 = np_to_rioxarry(array_1, transform_1) + xarray_1 = _np_to_rioxarray(array_1, transform_1) return xarray_1 @@ -97,7 +45,7 @@ def merge_xarr_2(): """ array_2 = np.random.randn(4, 4) transform_2 = rio.Affine(100, 0, -225400, 0, -100, 6036800) - xarray_2 = np_to_rioxarry(array_2, transform_2, as_type="float32") + xarray_2 = _np_to_rioxarray(array_2, transform_2, as_type="float32") return xarray_2 @@ -110,7 +58,7 @@ def merge_xarr_3(): """ array_3 = np.random.randn(4, 4) transform_3 = rio.Affine(100, 0, -225800, 0, -100, 6036400) - xarray_3 = np_to_rioxarry(array_3, transform_3, as_type="float32") + xarray_3 = _np_to_rioxarray(array_3, transform_3, as_type="float32") return xarray_3 @@ -123,7 +71,7 @@ def merge_xarr_4(): """ array_4 = np.random.randn(4, 4) transform_4 = rio.Affine(100, 0, -225400, 0, -100, 6036400) - xarray_4 = np_to_rioxarry(array_4, transform_4, as_type="float32") + xarray_4 = _np_to_rioxarray(array_4, transform_4, as_type="float32") return xarray_4