Skip to content

Commit

Permalink
feat: Tests for invalid data in GTFS dataset (#115)
Browse files Browse the repository at this point in the history
* feat: Tests for invalid data in GTFS dataset

* chore: Add small additional comments

* chore: Add pytest flasgs to tests (runinteg)

* fix: upated testing to better reflect gtfs-kit's behaviour

* chore: Add new flag for gtfs_kit tests -> sanitycheck

* docs: Tweak docstring regarding duplicate service ids

---------

Co-authored-by: r-leyshon <[email protected]>
  • Loading branch information
CBROWN-ONS and r-leyshon authored Sep 20, 2023
1 parent 93bd867 commit 4fe7c03
Show file tree
Hide file tree
Showing 3 changed files with 167 additions and 4 deletions.
20 changes: 20 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,12 @@ def pytest_addoption(parser):
default=False,
help="run expensive tests",
)
parser.addoption(
"--sanitycheck",
action="store_true",
default=False,
help="run sanity checks",
)


def pytest_configure(config):
Expand All @@ -39,6 +45,10 @@ def pytest_configure(config):
config.addinivalue_line(
"markers", "runexpensive: mark test to run expensive tests"
)
config.addinivalue_line(
"markers",
"sanitycheck: mark test to run checks in dependencies' code.",
)


def pytest_collection_modifyitems(config, items): # noqa:C901
Expand All @@ -47,6 +57,7 @@ def pytest_collection_modifyitems(config, items): # noqa:C901
config.getoption("--runsetup")
& config.getoption("--runinteg")
& config.getoption("--runexpensive")
& config.getoption("--sanitycheck")
):
# do full test suite when all flags are given
return
Expand Down Expand Up @@ -75,3 +86,12 @@ def pytest_collection_modifyitems(config, items): # noqa:C901
for item in items:
if "runexpensive" in item.keywords:
item.add_marker(skip_runexpensive)

# do not add sanitycheck marks when the sanitycheck flag is given
if not config.getoption("--sanitycheck"):
skip_sanitycheck = pytest.mark.skip(
reason="need --sanitycheck option to run"
)
for item in items:
if "sanitycheck" in item.keywords:
item.add_marker(skip_sanitycheck)
9 changes: 5 additions & 4 deletions notebooks/gtfs/check_unmatched_id_warnings.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@
feed.trips,
pd.DataFrame(
{
"service_id": [101],
"route_id": [20304],
"service_id": ["101023"],
"route_id": ["2030445"],
"trip_id": ["VJbedb4cfd0673348e017d42435abbdff3ddacbf89"],
"trip_headsign": ["Newport"],
"block_id": [np.nan],
Expand All @@ -70,8 +70,7 @@
feed.routes,
pd.DataFrame(
{
"service_id": [101],
"route_id": [20304],
"route_id": ["20304"],
"agency_id": ["OL5060"],
"route_short_name": ["X145"],
"route_long_name": [np.nan],
Expand All @@ -91,3 +90,5 @@
# the pipeline and therefore the user will be made aware. It is also flagged
# as an error which means that 'the GTFS is violated'
# (https://mrcagney.github.io/gtfs_kit_docs/).

# %%
142 changes: 142 additions & 0 deletions tests/gtfs/test_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,148 @@ def test_is_valid(self, gtfs_fixture):
found_cols == exp_cols
).all(), f"Expected columns {exp_cols}. Found: {found_cols}"

@pytest.mark.sanitycheck
def test_trips_unmatched_ids(self, gtfs_fixture):
"""Tests to evaluate gtfs-klt's reaction to invalid IDs in trips.
Parameters
----------
gtfs_fixture : GtfsInstance
a GtfsInstance test fixure
"""
feed = gtfs_fixture.feed

# add row to tripas table with invald trip_id, route_id, service_id
feed.trips = pd.concat(
[
feed.trips,
pd.DataFrame(
{
"service_id": ["101023"],
"route_id": ["2030445"],
"trip_id": [
"VJbedb4cfd0673348e017d42435abbdff3ddacbf89"
],
"trip_headsign": ["Newport"],
"block_id": [np.nan],
"shape_id": [
"RPSPc4c99ac6aff7e4648cbbef785f88427a48efa80f"
],
"wheelchair_accessible": [0],
"trip_direction_name": [np.nan],
"vehicle_journey_code": ["VJ109"],
}
),
],
axis=0,
)

# assert different errors/warnings haave been raised
new_valid = feed.validate()
assert (
len(new_valid[new_valid.message == "Undefined route_id"]) == 1
), "gtfs-kit failed to recognise invalid route_id"
assert (
len(new_valid[new_valid.message == "Undefined service_id"]) == 1
), "gtfs-kit failed to recognise invalid service_id"
assert (
len(new_valid[new_valid.message == "Trip has no stop times"]) == 1
), "gtfs-kit failed to recognise invalid service_id"
assert len(new_valid) == 10, "Validation table not expected size"

@pytest.mark.sanitycheck
def test_routes_unmatched_ids(self, gtfs_fixture):
"""Tests to evaluate gtfs-klt's reaction to invalid IDs in routes.
Parameters
----------
gtfs_fixture : GtfsInstance
a GtfsInstance test fixure
"""
feed = gtfs_fixture.feed

# add row to tripas table with invald trip_id, route_id, service_id
feed.routes = pd.concat(
[
feed.routes,
pd.DataFrame(
{
"route_id": ["20304"],
"agency_id": ["OL5060"],
"route_short_name": ["X145"],
"route_long_name": [np.nan],
"route_type": [200],
}
),
],
axis=0,
)

# assert different errors/warnings haave been raised
new_valid = feed.validate()
assert (
len(new_valid[new_valid.message == "Undefined agency_id"]) == 1
), "gtfs-kit failed to recognise invalid agency_id"
assert (
len(new_valid[new_valid.message == "Route has no trips"]) == 1
), "gtfs-kit failed to recognise that there are routes with no trips"
assert len(new_valid) == 9, "Validation table not expected size"

@pytest.mark.sanitycheck
def test_unmatched_service_id_behaviour(self, gtfs_fixture):
"""Tests to evaluate gtfs-klt's reaction to invalid IDs in calendar.
Parameters
----------
gtfs_fixture : GtfsInstance
a GtfsInstance test fixure
Notes
-----
'gtfs-kit' does not care about unmatched service IDs in the calendar
table. The Calendar table can have data with any service_id as long as
the datatypes are string. However, gtfs_kit will state an error if the
calendar table contains duplicate service_ids.
"""
feed = gtfs_fixture.feed
original_error_count = len(feed.validate())

# introduce a dummy row with a non matching service_id
feed.calendar = pd.concat(
[
feed.calendar,
pd.DataFrame(
{
"service_id": ["1018872"],
"monday": [0],
"tuesday": [0],
"wednesday": [0],
"thursday": [0],
"friday": [0],
"saturday": [0],
"sunday": [0],
"start_date": ["20200104"],
"end_date": ["20230301"],
}
),
],
axis=0,
)
new_error_count = len(feed.validate())
assert (
new_error_count == original_error_count
), "Unrecognised error in validaation table"

# drop a row from the calendar table
feed.calendar.drop(3, inplace=True)
new_valid = feed.validate()
assert (
len(new_valid[new_valid.message == "Undefined service_id"]) == 1
), "gtfs-kit failed to identify missing service_id"

def test_print_alerts_defence(self, gtfs_fixture):
"""Check defensive behaviour of print_alerts()."""
with pytest.raises(
Expand Down

0 comments on commit 4fe7c03

Please sign in to comment.