Skip to content

Commit

Permalink
feat: added descriptive stats function
Browse files Browse the repository at this point in the history
  • Loading branch information
ethan-moss committed Nov 14, 2023
1 parent 0375f80 commit c1c421c
Show file tree
Hide file tree
Showing 2 changed files with 141 additions and 5 deletions.
98 changes: 98 additions & 0 deletions src/transport_performance/_metrics/tp_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import geopandas as gpd
import pandas as pd
import pathlib
import warnings

from haversine import haversine_vector
from typing import Union
Expand Down Expand Up @@ -130,3 +131,100 @@ def _transport_performance_pandas(
).drop([destinations_col], axis=1)

return perf_gdf


def _transport_performance_stats(
    tp_df: pd.DataFrame,
    urban_centre_name: str = None,
    urban_centre_country: str = None,
    urban_centre_gdf: gpd.GeoDataFrame = None,
) -> pd.DataFrame:
    """Calculate transport performance descriptive statistics.

    Parameters
    ----------
    tp_df : pd.DataFrame
        Transport performance dataframe, output from
        `_transport_performance_pandas()`, or similar. Must contain the
        `transport_performance` and `population` columns.
    urban_centre_name : str, optional
        The urban centre name, by default None meaning the name will not be
        set.
    urban_centre_country : str, optional
        The country in which the urban centre resides, by default None meaning
        the country will not be set.
    urban_centre_gdf : gpd.GeoDataFrame, optional
        Output from `UrbanCentre`, containing the urban centre geometry
        information. Must have a `label` column with a "vectorized_uc" row.
        By default None meaning the urban centre area will not be calculated.

    Returns
    -------
    pd.DataFrame
        Single row dataframe of transport performance descriptive statistics.
        Columns are ordered as: urban centre name, country and area (each only
        when requested), urban centre population, then the min, 25%, 50%, 75%
        and max of the `transport_performance` column.

    Raises
    ------
    ValueError
        When `urban_centre_gdf` has no CRS set, such that its units can not be
        determined for the area calculation.

    Warns
    -----
    UserWarning
        When the CRS unit of `urban_centre_gdf` is not in metres, and
        reprojection is required in order to calculate the urban centre area.

    """
    # describe columns to include
    DESCRIBE_COLS = ["min", "25%", "50%", "75%", "max"]
    UC_AREA_COL = "urban centre area"
    UC_COUNTRY_COL = "urban centre country"
    UC_NAME_COL = "urban centre name"
    UC_POP_COL = "urban centre population"
    UC_LABEL = "vectorized_uc"
    EQUAL_AREA_CRS = "ESRI:54009"

    # instantiate an output columns list - optional columns get inserted at
    # the front, so the final order is name, country, area, population
    select_cols = [UC_POP_COL]

    # get results dataframe and transpose - reset index to drop column name
    tp_results = (
        tp_df["transport_performance"]
        .describe()
        .to_frame()
        .T[DESCRIBE_COLS]
        .reset_index(drop=True)
    )

    # calculate the urban centre area
    if urban_centre_gdf is not None:
        # work on a re-indexed copy (input is not modified) keyed by label to
        # simplify selecting only the urban centre, not other geometries
        uc = urban_centre_gdf.set_index("label")

        if uc.crs is None:
            raise ValueError(
                "`urban_centre_gdf` has no CRS set. Unable to determine the "
                "units needed to calculate the urban centre area."
            )

        # handle case where CRS units are not metres. NOTE: only the axis
        # unit is checked here, a metre based CRS is used as-is.
        crs_units = uc.crs.axis_info[0].unit_name
        uc_crs = uc.crs.to_string()
        if crs_units != "metre":
            warnings.warn(
                f"Unable to calculate the ubran centre area in CRS {uc_crs} "
                f"with units {crs_units}. Reprojecting `urban_centre` onto an "
                "equal area projection (ESRI:54009, mollweide) for the area "
                "calculation step.",
                UserWarning,
            )
            uc = uc.to_crs(EQUAL_AREA_CRS)

        # area of the urban centre geometry, scaled by 1e-6 (square metres to
        # square kilometres, given the metre units ensured above)
        tp_results[UC_AREA_COL] = uc.loc[UC_LABEL].geometry.area * 1e-6
        select_cols.insert(0, UC_AREA_COL)

    # add the urban centre country
    if urban_centre_country is not None:
        tp_results.loc[0, UC_COUNTRY_COL] = urban_centre_country
        select_cols.insert(0, UC_COUNTRY_COL)

    # add in a name column - do last, such that it is the first column
    if urban_centre_name is not None:
        tp_results.loc[0, UC_NAME_COL] = urban_centre_name
        select_cols.insert(0, UC_NAME_COL)

    # calculate the total population, rounded to the nearest whole person.
    # Use builtin conversions rather than numpy scalar methods, since `sum()`
    # returns a plain python number for some dtypes (e.g., object columns)
    total_population = tp_df["population"].sum()
    tp_results[UC_POP_COL] = int(round(float(total_population)))

    # reorder columns to improve readability
    return tp_results[select_cols + DESCRIBE_COLS]
48 changes: 43 additions & 5 deletions src/transport_performance/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,17 @@

import os
import pandas as pd
import geopandas as gpd
import pathlib

from glob import glob
from typing import Type, Union
from typing import Optional, Tuple, Type, Union

from transport_performance.population.rasterpop import RasterPop
from transport_performance._metrics.metrics_utils import _retrieve_rasterpop
from transport_performance._metrics.tp_utils import (
_transport_performance_pandas,
_transport_performance_stats,
)
from transport_performance.utils.defence import (
_type_defence,
Expand All @@ -26,7 +28,11 @@ def transport_performance(
sources_col: str = "from_id",
destinations_col: str = "to_id",
backend: str = "pandas",
) -> pd.DataFrame:
descriptive_stats: bool = True,
urban_centre_name: str = None,
urban_centre_country: str = None,
urban_centre_gdf: gpd.GeoDataFrame = None,
) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
"""Calculate the transport performance.
Parameters
Expand All @@ -52,11 +58,29 @@ def transport_performance(
backend : str, optional
The 'backend' to use to calculate transport performance, by default
"pandas". Must be one of: {"pandas"}.
descriptive_stats : bool, optional
Calculate transport performance descriptive statistics and return them
in a seperate dataframe. By default True, means descriptive statistics
will be calculated and returned.
urban_centre_name : str, optional
The urban centre name, by default None meaning the name will not be
set. Only considered when `descriptive_stats` is True.
urban_centre_country: str, optional
The country in which the urban centre resides, by default None meaning
the country will not be set. Only considered when `descriptive_stats`
is True.
urban_centre_gdf : gpd.GeoDataFrame, optional
Output from `UrbanCentre`, containg the urban centre geometry
information. By default None meaning the urban centre area will not be
calcuated. Only considered when `descriptive_stats` is True.
Returns
-------
pd.DataFrame
Transport performance metrics, grouped by destination column IDs.
Tuple[pd.DataFrame, Optional[pd.DataFrame]]
The first element of the tuple is the Transport performance metrics
dataframe, grouped by destination column IDs. When `descriptive_stats`
is `True` the second element will be the descriptive statistics output,
otherwise this is `None`.
Raises
------
Expand All @@ -79,6 +103,10 @@ def transport_performance(
"sources_col": [sources_col, str],
"destinations_col": [destinations_col, str],
"backend": [backend, str],
"descriptive_stats": [descriptive_stats, bool],
"urban_centre_name": [urban_centre_name, (type(None), str)],
"urban_centre_country": [urban_centre_country, (type(None), str)],
"urban_centre_gdf": [urban_centre_gdf, (type(None), gpd.GeoDataFrame)],
}
for k, v in type_dict.items():
_type_defence(v[0], k, v[-1])
Expand Down Expand Up @@ -119,4 +147,14 @@ def transport_performance(
f"Got `backend`={backend}. Expected one of: {VALID_TP_BACKENDS}"
)

return tp_df
# handle stats generation, if requested
if descriptive_stats:
stats_df = _transport_performance_stats(
tp_df,
urban_centre_name=urban_centre_name,
urban_centre_country=urban_centre_country,
urban_centre_gdf=urban_centre_gdf,
)
return tp_df, stats_df

return tp_df, None

0 comments on commit c1c421c

Please sign in to comment.