datasciencecampus · ethan-moss · Aug 16, 2023 · Jun 21, 2023 · Jun 28, 2023 · Jun 28, 2023
diff --git a/.gitignore b/.gitignore
@@ -2,11 +2,13 @@
 **/*.pbf
 **/*.mapdb
 **/*.mapdb.p
-# moved zip blanket rule above specific exception for test fixture below
+# moved blanket rules above specific exceptions for test fixtures
 *.zip
+*.pkl
 # except test fixtures
 !tests/data/newport-2023-06-13.osm.pbf
 !tests/data/newport-20230613_gtfs.zip
+!tests/data/gtfs/route_lookup.pkl
 
 ### Project structure ###
 data/*
@@ -36,7 +38,6 @@ outputs/*
 *.html
 *.pdf
 *.csv
-*.pkl
 *.rds
 *.rda
 *.parquet

diff --git a/conftest.py b/conftest.py
@@ -16,14 +16,32 @@ def pytest_addoption(parser):
         default=False,
         help="run set-up tests",
     )
+    parser.addoption(
+        "--runinteg",
+        action="store_true",
+        default=False,
+        help="run integration tests",
+    )
+    parser.addoption(
+        "--runexpensive",
+        action="store_true",
+        default=False,
+        help="run expensive tests",
+    )
 
 
 def pytest_configure(config):
     """Add ini value line."""
     config.addinivalue_line("markers", "setup: mark test to run during setup")
+    config.addinivalue_line(
+        "markers", "runinteg: mark test to run for integration tests"
+    )
+    config.addinivalue_line(
+        "markers", "runexpensive: mark test to run expensive tests"
+    )
 
 
-def pytest_collection_modifyitems(config, items):
+def pytest_collection_modifyitems(config, items):  # noqa: C901
     """Handle switching based on cli args."""
     if config.getoption("--runsetup"):
         # --runsetup given in cli: do not skip slow tests
@@ -32,3 +50,19 @@ def pytest_collection_modifyitems(config, items):
     for item in items:
         if "setup" in item.keywords:
             item.add_marker(skip_setup)
+
+    if config.getoption("--runinteg"):
+        return
+    skip_runinteg = pytest.mark.skip(reason="need --runinteg option to run")
+    for item in items:
+        if "runinteg" in item.keywords:
+            item.add_marker(skip_runinteg)
+
+    if config.getoption("--runexpensive"):
+        return
+    skip_runexpensive = pytest.mark.skip(
+        reason="need --runexpensive option to run"
+    )
+    for item in items:
+        if "runexpensive" in item.keywords:
+            item.add_marker(skip_runexpensive)
diff --git a/notebooks/gtfs/check_unmatched_id_warnings.py b/notebooks/gtfs/check_unmatched_id_warnings.py
@@ -0,0 +1,93 @@
+"""Validation of invalid IDs whilst joining GTFS sub-tables."""
+
+# %%
+# imports
+import gtfs_kit as gk
+from pyprojroot import here
+import pandas as pd
+import numpy as np
+
+# %%
+# initialise my feed from GTFS test data
+feed = gk.read_feed(
+    here("tests/data/newport-20230613_gtfs.zip"), dist_units="m"
+)
+feed.validate()
+
+# %%
+# calendar test
+feed.calendar = pd.concat(
+    [
+        feed.calendar,
+        pd.DataFrame(
+            {
+                "service_id": [101],
+                "monday": [0],
+                "tuesday": [0],
+                "wednesday": [0],
+                "thursday": [0],
+                "friday": [0],
+                "saturday": [0],
+                "sunday": [0],
+                "start_date": ["20200104"],
+                "end_date": ["20230301"],
+            }
+        ),
+    ],
+    axis=0,
+)
+
+feed.validate()
+
+# %%
+# trips test
+feed.trips = pd.concat(
+    [
+        feed.trips,
+        pd.DataFrame(
+            {
+                "service_id": [101],
+                "route_id": [20304],
+                "trip_id": ["VJbedb4cfd0673348e017d42435abbdff3ddacbf89"],
+                "trip_headsign": ["Newport"],
+                "block_id": [np.nan],
+                "shape_id": ["RPSPc4c99ac6aff7e4648cbbef785f88427a48efa80f"],
+                "wheelchair_accessible": [0],
+                "trip_direction_name": [np.nan],
+                "vehicle_journey_code": ["VJ109"],
+            }
+        ),
+    ],
+    axis=0,
+)
+
+feed.validate()
+
+# %%
+# routes test
+feed.routes = pd.concat(
+    [
+        feed.routes,
+        pd.DataFrame(
+            {
+                "service_id": [101],
+                "route_id": [20304],
+                "agency_id": ["OL5060"],
+                "route_short_name": ["X145"],
+                "route_long_name": [np.nan],
+                "route_type": [200],
+            }
+        ),
+    ],
+    axis=0,
+)
+
+feed.validate()
+
+# OUTCOME
+# It appears that 'errors' are reported when the modified feed is validated
+# using the pre-built gtfs_kit functions.
+# This suggests that if the GTFS data is flawed, it will be identified within
+# the pipeline and the user will therefore be made aware. The alert is also
+# flagged as an error, which means that 'the GTFS is violated'
+# (https://mrcagney.github.io/gtfs_kit_docs/).
diff --git a/pipeline/gtfs/01-validate-gtfs.py b/pipeline/gtfs/01-validate-gtfs.py
@@ -0,0 +1,112 @@
+"""Run the GTFS validation checks for the toml-specified GTFS file.
+
+1. read feed
+2. describe feed
+3. validate feed
+4. clean feed
+5. new - print errors / warnings in full
+6. new - visualise convex hull of stops and area
+7. visualise stop locations
+8. new - modalities available (including extended spec)
+9. new - feed stats by is-weekend
+"""
+import toml
+from pyprojroot import here
+import time
+import subprocess
+
+from transport_performance.gtfs.validation import GtfsInstance
+from transport_performance.utils.defence import _is_gtfs_pth
+
+CONFIG = toml.load(here("pipeline/gtfs/config/01-validate-gtfs.toml"))
+GTFS_PTH = here(CONFIG["GTFS"]["PATH"])
+UNITS = CONFIG["GTFS"]["UNITS"]
+GEOM_CRS = CONFIG["GTFS"]["GEOMETRIC_CRS"]
+POINT_MAP_PTH = CONFIG["MAPS"]["STOP_COORD_PTH"]
+HULL_MAP_PATH = CONFIG["MAPS"]["STOP_HULL_PTH"]
+PROFILING = CONFIG["UTILS"]["PROFILING"]
+# check GTFS Path exists
+_is_gtfs_pth(pth=GTFS_PTH, param_nm="GTFS_PTH", check_existing=True)
+# Get the disk usage of the GTFS file.
+gtfs_du = (
+    subprocess.check_output(["du", "-sh", GTFS_PTH]).split()[0].decode("utf-8")
+)
+if PROFILING:
+    print(f"GTFS at {GTFS_PTH} disk usage: {gtfs_du}")
+
+pre_init = time.perf_counter()
+feed = GtfsInstance(gtfs_pth=GTFS_PTH, units=UNITS)
+post_init = time.perf_counter()
+if PROFILING:
+    print(f"Init in {post_init - pre_init:0.4f} seconds")
+
+available_dates = feed.feed.get_dates()
+post_dates = time.perf_counter()
+if PROFILING:
+    print(f"get_dates in {post_dates - post_init:0.4f} seconds")
+s = available_dates[0]
+f = available_dates[-1]
+print(f"{len(available_dates)} dates available between {s} & {f}.")
+
+try:
+    # If agency_id is missing, an AttributeError is raised. The GTFS spec says
+    # it is conditionally required, depending on whether more than one agency
+    # operates within the feed. https://gtfs.org/schedule/reference/#agencytxt
+    # Cleaning the feed doesn't resolve this. Raise an issue to investigate.
+    print(feed.is_valid())
+    post_isvalid = time.perf_counter()
+    if PROFILING:
+        print(f"is_valid in {post_isvalid - post_dates:0.4f} seconds")
+    print(feed.validity_df["type"].value_counts())
+    feed.print_alerts()
+    post_errors = time.perf_counter()
+    feed.print_alerts(alert_type="warning")
+    post_warn = time.perf_counter()
+    if PROFILING:
+        print(f"print_alerts errors: {post_errors - post_isvalid:0.4f} secs")
+        print(f"print_alerts warn: {post_warn - post_errors:0.4f} secs")
+except AttributeError:
+    print("AttributeError. Unable to validate feed.")
+
+pre_clean = time.perf_counter()
+feed.clean_feed()
+post_clean = time.perf_counter()
+if PROFILING:
+    print(f"clean_feed in {post_clean - pre_clean:0.4f} seconds")
+
+try:
+    print(feed.is_valid())
+    print(feed.validity_df["type"].value_counts())
+    feed.print_alerts()
+    feed.print_alerts(alert_type="warning")
+except AttributeError:
+    print("AttributeError. Unable to validate feed.")
+
+# visualise gtfs
+pre_viz_points = time.perf_counter()
+feed.viz_stops(out_pth=POINT_MAP_PTH)
+post_viz_points = time.perf_counter()
+if PROFILING:
+    print(f"viz_points in {post_viz_points - pre_viz_points:0.4f} seconds")
+print(f"Map written to {POINT_MAP_PTH}")
+
+pre_viz_hull = time.perf_counter()
+feed.viz_stops(out_pth=HULL_MAP_PATH, geoms="hull", geom_crs=GEOM_CRS)
+post_viz_hull = time.perf_counter()
+if PROFILING:
+    print(f"viz_hull in {post_viz_hull - pre_viz_hull:0.4f} seconds")
+print(f"Map written to {HULL_MAP_PATH}")
+
+pre_route_modes = time.perf_counter()
+print(feed.get_route_modes())
+post_route_modes = time.perf_counter()
+if PROFILING:
+    print(f"route_modes in {post_route_modes - pre_route_modes:0.4f} seconds")
+
+pre_summ_weekday = time.perf_counter()
+print(feed.summarise_trips())
+print(feed.summarise_routes())
+post_summ_weekday = time.perf_counter()
+if PROFILING:
+    print(f"summ_weekday in {post_summ_weekday - pre_summ_weekday:0.4f} secs")
+    print(f"Pipeline execution in {post_summ_weekday - pre_init:0.4f}")
diff --git a/pipeline/gtfs/config/01-validate-gtfs.toml b/pipeline/gtfs/config/01-validate-gtfs.toml
@@ -0,0 +1,13 @@
+title = "Config for GTFS Validation Pipeline"
+
+[GTFS]
+PATH = "data/external/croppednewport-bus-07-07-2022_gtfs.zip"
+UNITS = "m"
+GEOMETRIC_CRS = 27700 # used for area calculations only
+
+[MAPS]
+STOP_COORD_PTH = "outputs/gtfs/validation/gtfs-stops-locations.html"
+STOP_HULL_PTH = "outputs/gtfs/validation/gtfs-stops-convex-hull.html"
+
+[UTILS]
+PROFILING = true
diff --git a/requirements.txt b/requirements.txt
@@ -3,6 +3,12 @@ r5py>=0.0.4
 gtfs_kit==5.2.7
 pytest
 coverage
+ipykernel==6.23.1
+pandas
+beautifulsoup4
+requests
+pytest-mock
+toml
 rasterio
 pyprojroot
 matplotlib
@@ -15,5 +21,6 @@ geocube
 mapclassify
 pytest-lazy-fixture
 seaborn
+numpy>=1.25.0  # test suite will fail with numpy versions lower than this
 rioxarray
 -e .
diff --git a/src/transport_performance/gtfs/__init__.py b/src/transport_performance/gtfs/__init__.py
@@ -0,0 +1 @@
+"""Helpers for working with & validating GTFS."""
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		"""Helpers for working with & validating GTFS."""