Skip to content

Commit

Permalink
add _function_pipeline(); update clean_feed to use; update tests; impl…
Browse files Browse the repository at this point in the history
…ement core cleaner
  • Loading branch information
CBROWN-ONS committed Oct 19, 2023
1 parent 5f97b53 commit ad38cac
Show file tree
Hide file tree
Showing 5 changed files with 113 additions and 103 deletions.
10 changes: 5 additions & 5 deletions src/transport_performance/gtfs/cleaners.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,19 +216,19 @@ def core_cleaners(
_type_defence(drop_zombies, "drop_zombies", bool)
# cleaning
if clean_ids:
clean_ids_gk(gtfs)
clean_ids_gk(gtfs.feed)
if clean_times:
clean_times_gk(gtfs)
clean_times_gk(gtfs.feed)
if clean_route_short_names:
clean_route_short_names_gk(gtfs)
clean_route_short_names_gk(gtfs.feed)
if drop_zombies:
try:
drop_zombies_gk(gtfs)
drop_zombies_gk(gtfs.feed)
except KeyError:
warnings.warn(
UserWarning(
"The drop_zombies cleaner was unable to operate on "
"clean_feed as the trips table ahs no sape_id column"
"clean_feed as the trips table has no shape_id column"
)
)
return None
33 changes: 33 additions & 0 deletions src/transport_performance/gtfs/gtfs_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,3 +426,36 @@ def _remove_validation_row(
list(set(gtfs.validity_df.index) - set(index))
]
return None


def _function_pipeline(
    gtfs, func_map: dict, operations: Union[dict, type(None)]
) -> None:
    """Iterate through and act on a functional pipeline.

    Parameters
    ----------
    gtfs : GtfsInstance
        The GtfsInstance to pass to each function in the pipeline.
    func_map : dict
        A mapping of operation names to callables. Each callable must
        accept the instance via a keyword argument named ``gtfs``.
    operations : Union[dict, type(None)]
        A mapping of operation names (keys of ``func_map``) to kwarg
        dicts, or ``None`` in place of a dict for no kwargs. If
        ``operations`` itself is ``None``, every function in
        ``func_map`` is run with default arguments.

    Returns
    -------
    None

    Raises
    ------
    TypeError
        If ``operations`` (or any of its values) is not a dict or None.
    KeyError
        If a key in ``operations`` is not a key of ``func_map``.

    """
    _gtfs_defence(gtfs, "gtfs")
    _type_defence(func_map, "func_map", dict)
    # fix: param name was misspelt 'oeprations' in the raised message
    _type_defence(operations, "operations", (dict, type(None)))
    if operations:
        # validate all requested operation names before running any of them
        for key in operations:
            if key not in func_map:
                raise KeyError(
                    f"'{key}' function passed to 'operations' is not a "
                    "known operation. Known operations include: "
                    f"{func_map.keys()}"
                )
        for operation in operations:
            # each value must be a kwargs dict or None (meaning no kwargs)
            _type_defence(
                operations[operation],
                f"operations[{operation}]",
                (dict, type(None)),
            )
            # normalise None -> {} locally; avoid mutating the caller's dict
            kwargs = (
                {} if operations[operation] is None else operations[operation]
            )
            func_map[operation](gtfs=gtfs, **kwargs)
    # if no operations passed, carry out all operations with defaults
    else:
        for operation in func_map:
            func_map[operation](gtfs=gtfs)
    return None
83 changes: 24 additions & 59 deletions src/transport_performance/gtfs/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,15 @@
from typing import Union, Callable
from plotly.graph_objects import Figure as PlotlyFigure

from transport_performance.gtfs.cleaners import (
clean_consecutive_stop_fast_travel_warnings,
clean_multiple_stop_fast_travel_warnings,
)
import transport_performance.gtfs.cleaners as cleaners
import transport_performance.gtfs.validators as gtfs_validators

from transport_performance.gtfs.routes import (
scrape_route_type_lookup,
get_saved_route_type_lookup,
)

from transport_performance.gtfs.gtfs_utils import _function_pipeline
from transport_performance.utils.defence import (
_is_expected_filetype,
_check_namespace_export,
Expand Down Expand Up @@ -382,35 +380,9 @@ def is_valid(self, validators: dict = None) -> pd.DataFrame:
self.validity_df = pd.DataFrame(
columns=["type", "message", "table", "rows"]
)
# carry out additional validators
if validators is not None:
# check all keys are known validators
for key in validators.keys():
if key not in VALIDATE_FEED_FUNC_MAP.keys():
raise KeyError(
f"'{key}' function passed to 'validators' is not a "
"known validator. Known validators include: "
f"{VALIDATE_FEED_FUNC_MAP.keys()}"
)
for validator in validators:
# check value is dict or none (for kwargs)
_type_defence(
validators[validator],
f"validators[{validator}]",
(dict, type(None)),
)
validators[validator] = (
{}
if validators[validator] is None
else validators[validator]
)
VALIDATE_FEED_FUNC_MAP[validator](
gtfs=self, **validators[validator]
)
# if no validators passed, carry out all validators
else:
for validator in VALIDATE_FEED_FUNC_MAP:
VALIDATE_FEED_FUNC_MAP[validator](gtfs=self)
_function_pipeline(
gtfs=self, func_map=VALIDATE_FEED_FUNC_MAP, operations=validators
)
return self.validity_df

def print_alerts(self, alert_type: str = "error") -> None:
Expand Down Expand Up @@ -461,35 +433,27 @@ def print_alerts(self, alert_type: str = "error") -> None:

return None

def clean_feed(
self, validate: bool = False, fast_travel: bool = True
) -> None:
"""Attempt to clean feed using `gtfs_kit`.
def clean_feed(self, cleansers: dict = None) -> None:
"""Clean the gtfs feed.
Parameters
----------
validate: bool, optional
Whether or not to validate the dataframe before cleaning
fast_travel: bool, optional
Whether or not to clean warnings related to fast travel.
cleansers : dict, optional
A mapping of cleansing functions and kwargs, by default None
Returns
-------
None
"""
_type_defence(fast_travel, "fast_travel", bool)
_type_defence(validate, "valiidate", bool)
# TODO: refactor function to be like is_valid()
if validate:
self.is_valid()
try:
# In cases where shape_id is missing, keyerror is raised.
# https://developers.google.com/transit/gtfs/reference#shapestxt
# shows that shapes.txt is optional file.
self.feed = self.feed.clean()
if fast_travel:
clean_consecutive_stop_fast_travel_warnings(self)
clean_multiple_stop_fast_travel_warnings(self)
except KeyError:
# TODO: Issue 74 - Improve this to clean feed when KeyError raised
print("KeyError. Feed was not cleaned.")
# DEV NOTE: Opting not to allow for validation in clean_feed().
# .is_valid() should be used before hand.
# DEV NOTE 2: Use of parm name 'cleansers' is to avoid conflicts
_type_defence(cleansers, "cleansers", (dict, type(None)))
_function_pipeline(
gtfs=self, func_map=CLEAN_FEED_FUNCTION_MAP, operations=cleansers
)
return None

def _produce_stops_map(
self, what_geoms: str, is_filtered: bool, crs: Union[int, str]
Expand Down Expand Up @@ -1514,8 +1478,9 @@ def html_report(
date = datetime.datetime.strftime(datetime.datetime.now(), "%d-%m-%Y")

# feed evaluation
# TODO: make this optional
self.clean_feed(validate=True, fast_travel=True)
# TODO: make this optional (and allow params)
self.is_valid()
self.clean_feed()
# re-validate to clean any newly raised errors/warnings
validation_dataframe = self.is_valid()

Expand Down
45 changes: 44 additions & 1 deletion tests/gtfs/test_gtfs_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,18 @@
from plotly.graph_objects import Figure as PlotlyFigure
import numpy as np

from transport_performance.gtfs.validation import GtfsInstance
from transport_performance.gtfs.validation import (
GtfsInstance,
VALIDATE_FEED_FUNC_MAP,
)
from transport_performance.gtfs.gtfs_utils import (
bbox_filter_gtfs,
_add_validation_row,
filter_gtfs_around_trip,
convert_pandas_to_plotly,
_get_validation_warnings,
_remove_validation_row,
_function_pipeline,
)

# location of GTFS test fixture
Expand Down Expand Up @@ -332,3 +336,42 @@ def test__remove_validation_row_on_pass(self):
assert (
len(found_cols) == 0
), "Invalid errors/warnings still in validity_df"


class TestFunctionPipeline(object):
    """Tests for _function_pipeline.

    Notes
    -----
    Not testing on pass here as better cases can be found in the tests for
    GtfsInstance's is_valid() and clean_feed() methods.
    """

    @pytest.mark.parametrize(
        "operations, raises, match",
        [
            # 'operations' must be a dict or None, never a bool
            (True, TypeError, ".*expected .*dict.*. Got .*bool.*"),
            # an operation key absent from the function map raises KeyError
            (
                {"not_a_valid_validator": None},
                KeyError,
                (
                    r"'not_a_valid_validator' function passed to 'operations'"
                    r" is not a known operation.*"
                ),
            ),
            # kwargs for a valid operation must be a dict or None
            (
                {"core_validation": pd.DataFrame()},
                TypeError,
                ".* expected .*dict.*NoneType.*",
            ),
        ],
    )
    def test_function_pipeline_defence(self, operations, raises, match):
        """Defensive test for _function_pipeline."""
        # any valid GtfsInstance works here: the defensive checks fire
        # before any pipeline operation is actually executed
        gtfs = GtfsInstance(GTFS_FIX_PTH)
        with pytest.raises(raises, match=match):
            _function_pipeline(gtfs, VALIDATE_FEED_FUNC_MAP, operations)
45 changes: 7 additions & 38 deletions tests/gtfs/test_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,35 +142,6 @@ def test_get_gtfs_files(self, newp_gtfs_fixture):
), f"GTFS files not as expected. Expected {expected_files},"
"found: {foundf}"

@pytest.mark.parametrize(
"validators, raises, match",
[
# invalid type for 'validators'
(True, TypeError, ".*expected .*dict.*. Got .*bool.*"),
# invalid validator
(
{"not_a_valid_validator": None},
KeyError,
(
r"'not_a_valid_validator' function passed to 'validators'"
r" is not a known validator.*"
),
),
# invalid type for kwargs for validator
(
{"core_validation": pd.DataFrame()},
TypeError,
".* expected .*dict.*NoneType.*",
),
],
)
def test_is_valid_defence(
self, newp_gtfs_fixture, validators, raises, match
):
"""Defensive tests for GtfsInstance.is_valid()."""
with pytest.raises(raises, match=match):
newp_gtfs_fixture.is_valid(validators=validators)

@pytest.mark.parametrize(
"which, validators, shape",
[
Expand Down Expand Up @@ -742,16 +713,14 @@ def dummy_func():
):
newp_gtfs_fixture.summarise_routes(return_summary="true")

@patch("builtins.print")
def test_clean_feed_defence(self, mock_print, newp_gtfs_fixture):
def test_clean_feed_defence(self, newp_gtfs_fixture):
"""Check defensive behaviours of clean_feed()."""
# Simulate condition where shapes.txt has no shape_id
newp_gtfs_fixture.feed.shapes.drop("shape_id", axis=1, inplace=True)
newp_gtfs_fixture.clean_feed()
fun_out = mock_print.mock_calls
assert fun_out == [
call("KeyError. Feed was not cleaned.")
], f"Expected print statement about KeyError. Found: {fun_out}."
with pytest.raises(
TypeError, match=r".*expected .*dict.* Got .*int.*"
):
fixt = newp_gtfs_fixture
fixt.is_valid(validators={"core_validation": None})
fixt.clean_feed(cleansers=1)

def test_summarise_trips_on_pass(self, newp_gtfs_fixture):
"""Assertions about the outputs from summarise_trips()."""
Expand Down

0 comments on commit ad38cac

Please sign in to comment.