From 4fe7c033f6b1144995119aa41f8da90c129bd59c Mon Sep 17 00:00:00 2001 From: Charlie Browning <121952297+CBROWN-ONS@users.noreply.github.com> Date: Wed, 20 Sep 2023 14:10:41 +0100 Subject: [PATCH] feat: Tests for invalid data in GTFS dataset (#115) * feat: Tests for invalid data in GTFS dataset * chore: Add small additional comments * chore: Add pytest flasgs to tests (runinteg) * fix: upated testing to better reflect gtfs-kit's behaviour * chore: Add new flag for gtfs_kit tests -> sanitycheck * docs: Tweak docstring regarding duplicate service ids --------- Co-authored-by: r-leyshon --- conftest.py | 20 +++ notebooks/gtfs/check_unmatched_id_warnings.py | 9 +- tests/gtfs/test_validation.py | 142 ++++++++++++++++++ 3 files changed, 167 insertions(+), 4 deletions(-) diff --git a/conftest.py b/conftest.py index 6f644831..191d886e 100644 --- a/conftest.py +++ b/conftest.py @@ -28,6 +28,12 @@ def pytest_addoption(parser): default=False, help="run expensive tests", ) + parser.addoption( + "--sanitycheck", + action="store_true", + default=False, + help="run sanity checks", + ) def pytest_configure(config): @@ -39,6 +45,10 @@ def pytest_configure(config): config.addinivalue_line( "markers", "runexpensive: mark test to run expensive tests" ) + config.addinivalue_line( + "markers", + "sanitycheck: mark test to run checks in dependencies' code.", + ) def pytest_collection_modifyitems(config, items): # noqa:C901 @@ -47,6 +57,7 @@ def pytest_collection_modifyitems(config, items): # noqa:C901 config.getoption("--runsetup") & config.getoption("--runinteg") & config.getoption("--runexpensive") + & config.getoption("--sanitycheck") ): # do full test suite when all flags are given return @@ -75,3 +86,12 @@ def pytest_collection_modifyitems(config, items): # noqa:C901 for item in items: if "runexpensive" in item.keywords: item.add_marker(skip_runexpensive) + + # do not add sanitycheck marks when the sanitycheck flag is given + if not config.getoption("--sanitycheck"): + skip_sanitycheck = pytest.mark.skip( + reason="need --sanitycheck option to run" + ) + for item in items: + if "sanitycheck" in item.keywords: + item.add_marker(skip_sanitycheck) diff --git a/notebooks/gtfs/check_unmatched_id_warnings.py b/notebooks/gtfs/check_unmatched_id_warnings.py index 21c064c4..6e0c4f1d 100644 --- a/notebooks/gtfs/check_unmatched_id_warnings.py +++ b/notebooks/gtfs/check_unmatched_id_warnings.py @@ -46,8 +46,8 @@ feed.trips, pd.DataFrame( { - "service_id": [101], - "route_id": [20304], + "service_id": ["101023"], + "route_id": ["2030445"], "trip_id": ["VJbedb4cfd0673348e017d42435abbdff3ddacbf89"], "trip_headsign": ["Newport"], "block_id": [np.nan], @@ -70,8 +70,7 @@ feed.routes, pd.DataFrame( { - "service_id": [101], - "route_id": [20304], + "route_id": ["20304"], "agency_id": ["OL5060"], "route_short_name": ["X145"], "route_long_name": [np.nan], @@ -91,3 +90,5 @@ # the pipeline and therefore the user will be made aware. It is also flagged # as an error which means that 'the GTFS is violated' # (https://mrcagney.github.io/gtfs_kit_docs/). + +# %% diff --git a/tests/gtfs/test_validation.py b/tests/gtfs/test_validation.py index 62ea81aa..9c7e65ac 100644 --- a/tests/gtfs/test_validation.py +++ b/tests/gtfs/test_validation.py @@ -117,6 +117,148 @@ def test_is_valid(self, gtfs_fixture): found_cols == exp_cols ).all(), f"Expected columns {exp_cols}. Found: {found_cols}" + @pytest.mark.sanitycheck + def test_trips_unmatched_ids(self, gtfs_fixture): + """Tests to evaluate gtfs-klt's reaction to invalid IDs in trips. + + Parameters + ---------- + gtfs_fixture : GtfsInstance + a GtfsInstance test fixure + + """ + feed = gtfs_fixture.feed + + # add row to tripas table with invald trip_id, route_id, service_id + feed.trips = pd.concat( + [ + feed.trips, + pd.DataFrame( + { + "service_id": ["101023"], + "route_id": ["2030445"], + "trip_id": [ + "VJbedb4cfd0673348e017d42435abbdff3ddacbf89" + ], + "trip_headsign": ["Newport"], + "block_id": [np.nan], + "shape_id": [ + "RPSPc4c99ac6aff7e4648cbbef785f88427a48efa80f" + ], + "wheelchair_accessible": [0], + "trip_direction_name": [np.nan], + "vehicle_journey_code": ["VJ109"], + } + ), + ], + axis=0, + ) + + # assert different errors/warnings haave been raised + new_valid = feed.validate() + assert ( + len(new_valid[new_valid.message == "Undefined route_id"]) == 1 + ), "gtfs-kit failed to recognise invalid route_id" + assert ( + len(new_valid[new_valid.message == "Undefined service_id"]) == 1 + ), "gtfs-kit failed to recognise invalid service_id" + assert ( + len(new_valid[new_valid.message == "Trip has no stop times"]) == 1 + ), "gtfs-kit failed to recognise invalid service_id" + assert len(new_valid) == 10, "Validation table not expected size" + + @pytest.mark.sanitycheck + def test_routes_unmatched_ids(self, gtfs_fixture): + """Tests to evaluate gtfs-klt's reaction to invalid IDs in routes. + + Parameters + ---------- + gtfs_fixture : GtfsInstance + a GtfsInstance test fixure + + """ + feed = gtfs_fixture.feed + + # add row to tripas table with invald trip_id, route_id, service_id + feed.routes = pd.concat( + [ + feed.routes, + pd.DataFrame( + { + "route_id": ["20304"], + "agency_id": ["OL5060"], + "route_short_name": ["X145"], + "route_long_name": [np.nan], + "route_type": [200], + } + ), + ], + axis=0, + ) + + # assert different errors/warnings haave been raised + new_valid = feed.validate() + assert ( + len(new_valid[new_valid.message == "Undefined agency_id"]) == 1 + ), "gtfs-kit failed to recognise invalid agency_id" + assert ( + len(new_valid[new_valid.message == "Route has no trips"]) == 1 + ), "gtfs-kit failed to recognise that there are routes with no trips" + assert len(new_valid) == 9, "Validation table not expected size" + + @pytest.mark.sanitycheck + def test_unmatched_service_id_behaviour(self, gtfs_fixture): + """Tests to evaluate gtfs-klt's reaction to invalid IDs in calendar. + + Parameters + ---------- + gtfs_fixture : GtfsInstance + a GtfsInstance test fixure + + Notes + ----- + 'gtfs-kit' does not care about unmatched service IDs in the calendar + table. The Calendar table can have data with any service_id as long as + the datatypes are string. However, gtfs_kit will state an error if the + calendar table contains duplicate service_ids. + + """ + feed = gtfs_fixture.feed + original_error_count = len(feed.validate()) + + # introduce a dummy row with a non matching service_id + feed.calendar = pd.concat( + [ + feed.calendar, + pd.DataFrame( + { + "service_id": ["1018872"], + "monday": [0], + "tuesday": [0], + "wednesday": [0], + "thursday": [0], + "friday": [0], + "saturday": [0], + "sunday": [0], + "start_date": ["20200104"], + "end_date": ["20230301"], + } + ), + ], + axis=0, + ) + new_error_count = len(feed.validate()) + assert ( + new_error_count == original_error_count + ), "Unrecognised error in validaation table" + + # drop a row from the calendar table + feed.calendar.drop(3, inplace=True) + new_valid = feed.validate() + assert ( + len(new_valid[new_valid.message == "Undefined service_id"]) == 1 + ), "gtfs-kit failed to identify missing service_id" + def test_print_alerts_defence(self, gtfs_fixture): """Check defensive behaviour of print_alerts().""" with pytest.raises(