From e6a700b7c52276cb492935c92ec3c4fbcf746e76 Mon Sep 17 00:00:00 2001 From: Richard Leyshon <49126943+r-leyshon@users.noreply.github.com> Date: Mon, 6 Nov 2023 12:56:17 +0000 Subject: [PATCH] 186 filter to date method in gtfsinstance (#190) * feat: Optionally filter gtfs to date list * refactor: Toml has no nonetype, so empty list used to specify no date filter * test: Invalid datestring causes raise * test: Raises if filter_dates values do not exist within gtfs calendar * refactor: All in_pth values use gtfs fixture path constant * test: Assert filtered net can build r5py network * fix: Runinteg invocations in workflows needed fixing * fix: Runner doesn't install osmosis so deselect all tests with osmosis deps * feat: Changes on bristol run * refactor: Defense checks & type hinting * docs: Add docstring to internal * chore: Move _validate_datestring to defence.py * docs: Update docstring --------- Co-authored-by: Richard Leyshon --- .github/workflows/all-os-tests.yml | 7 +- .github/workflows/python-package.yml | 2 +- notebooks/e2e/config/e2e.toml | 37 ++++----- notebooks/e2e/e2e.py | 3 +- src/transport_performance/gtfs/gtfs_utils.py | 20 +++++ src/transport_performance/utils/defence.py | 37 +++++++++ tests/gtfs/test_gtfs_utils.py | 83 ++++++++++++++++++-- 7 files changed, 156 insertions(+), 33 deletions(-) diff --git a/.github/workflows/all-os-tests.yml b/.github/workflows/all-os-tests.yml index 7c44adc0..1a06aad2 100644 --- a/.github/workflows/all-os-tests.yml +++ b/.github/workflows/all-os-tests.yml @@ -1,7 +1,7 @@ # This workflow will install OS dependencies and run a 'base' set of unit tests with Python 3.9 # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python -name: Unit tests on macOS/Linux/Windows +name: Integration tests no osmosis on: push: @@ -28,7 +28,4 @@ jobs: pip install -r requirements.txt - name: Run Integration Tests # run only those tests marked runinteg & with no osmosis deps run: | - pytest -m runinteg --runinteg --deselect tests/osm/ - - name: Test with pytest # run only tests with no osmosis deps - run: | - pytest --deselect tests/osm/ + pytest --runinteg --deselect tests/osm/ --deselect tests/gtfs/test_gtfs_utils.py::TestBboxFilterGtfs::test_bbox_filter_gtfs_to_date_builds_network diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 9955734d..e1deb064 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -49,7 +49,7 @@ jobs: shell: sh - name: Run Integration Tests run: | - pytest -m runinteg --runinteg # run only those tests marked runinteg + pytest --runinteg # run only those tests marked runinteg - name: pre-commit run: | pre-commit install diff --git a/notebooks/e2e/config/e2e.toml b/notebooks/e2e/config/e2e.toml index fdbcb43f..80c86044 100644 --- a/notebooks/e2e/config/e2e.toml +++ b/notebooks/e2e/config/e2e.toml @@ -13,15 +13,12 @@ override = true input_dir = "data/external/urban_centre/" merged_path = "data/processed/urban_centre/uc_merged.tif" bbox = [ - -415194.5798256779, - 6014542.675452075, - -178899.95729310974, - 6239314.71054581 +-415194.5798256779, 6014542.675452075, -178899.95729310974, 6239314.71054581 ] # must in in 'ESRI:54009', this represents a BBOX around Wales -centre = [51.5773597, -2.9660008] +centre = [51.46737027594145, -2.5988505326255895] buffer_size = 12000 write_outputs = true -output_map_path = "outputs/e2e/urban_centre/urban_centre_map.html" +output_map_path = "outputs/e2e/bristol/urban_centre/urban_centre_map.html" [population] # configuration section for population override = true @@ -30,35 +27,35 @@ merged_path = "data/interim/population/pop_merged.tif" merged_resampled_path = "data/processed/population/pop_merged_resampled.tif" threshold = 1 # set small and positive, to remove 0 pop cells write_outputs = true -output_map_path = "outputs/e2e/population/population_map.html" +output_map_path = "outputs/e2e/bristol/population/population_map.html" [gtfs] # configuration section for gtfs override = true -input_path = "data/external/gtfs/itm_wales_gtfs.zip" -filtered_path = "data/interim/gtfs/itm_wales_filtered_gtfs.zip" +input_path = "data/external/gtfs/itm_england_gtfs.zip" +filtered_path = "data/interim/gtfs/itm_england_filtered_gtfs.zip" units = "km" write_outputs = true -cleaned_path = "data/processed/gtfs/itm_wales_filtered_cleaned_gtfs.zip" -stops_map_path = "outputs/e2e/gtfs/stops_map.html" -hull_map_path = "outputs/e2e/gtfs/hull_map.html" -used_stops_map_path = "outputs/e2e/gtfs/used_stops_map.html" +cleaned_path = "data/processed/gtfs/itm_england_filtered_cleaned_gtfs.zip" +stops_map_path = "outputs/e2e/bristol/gtfs/stops_map.html" +hull_map_path = "outputs/e2e/bristol/gtfs/hull_map.html" +used_stops_map_path = "outputs/e2e/bristol/gtfs/used_stops_map.html" [osm] # configuration section for osm clipping override = true -input_path = "data/external/osm/wales-latest.osm.pbf" -filtered_path = "data/processed/osm/wales_latest_filtered.osm.pbf" -tag_filter = false +input_path = "data/external/osm/england-latest.osm.pbf" +filtered_path = "data/processed/osm/england_latest_filtered.osm.pbf" +tag_filter = true [analyse_network] # configuration for the analyse_network stage departure_year = 2023 -departure_month = 8 -departure_day = 8 # this is the date, not the day of the week +departure_month = 10 +departure_day = 27 # this is the date, not the day of the week departure_hour = 8 departure_minute = 0 departure_time_window = 1 # this is in hours max_time = 45 # this is in minutes write_outputs = true -outputs_dir = "outputs/e2e/analyse_network/" +outputs_dir = "outputs/e2e/bristol/analyse_network/" qa_travel_times = false # set to true to qa results with `qa_path` qa_path = "outputs/e2e/analyse_network/travel_times.pkl" save_travel_times_for_qa = false # set to true to write to `save_qa_path` @@ -68,4 +65,4 @@ save_qa_path = "outputs/e2e/analyse_network/travel_times.pkl" cut_off_time = 45 # this is in minutes cut_off_distance = 11250 # this is in meters write_outputs = true -outputs_dir = "outputs/e2e/metrics/" +outputs_dir = "outputs/e2e/bristol/metrics/" diff --git a/notebooks/e2e/e2e.py b/notebooks/e2e/e2e.py index 0fcd9c6a..46ac4139 100644 --- a/notebooks/e2e/e2e.py +++ b/notebooks/e2e/e2e.py @@ -254,6 +254,7 @@ geoms="hull", create_out_parent=True, ) + gtfs.feed = gk.miscellany.restrict_to_dates(gtfs.feed, ["20231027"]) gtfs.feed.write(here(gtfs_config["cleaned_path"])) # %% @@ -543,7 +544,7 @@ def plot( # %% # visualise for an ID within the UC -UC_ID = 4110 +UC_ID = 6456 assert UC_ID in travel_times.to_id.unique() snippet_id = travel_times[travel_times.to_id == UC_ID] snippet_id = pop_gdf.merge(snippet_id, left_on="id", right_on="from_id") diff --git a/src/transport_performance/gtfs/gtfs_utils.py b/src/transport_performance/gtfs/gtfs_utils.py index 76aec2a6..5c5cc5bf 100644 --- a/src/transport_performance/gtfs/gtfs_utils.py +++ b/src/transport_performance/gtfs/gtfs_utils.py @@ -15,6 +15,7 @@ _is_expected_filetype, _check_iterable, _type_defence, + _validate_datestring, _enforce_file_extension, ) from transport_performance.utils.constants import PKG_PATH @@ -35,9 +36,12 @@ def bbox_filter_gtfs( ], units: str = "km", crs: str = "epsg:4326", + filter_dates: list = [], ) -> None: """Filter a GTFS feed to any routes intersecting with a bounding box. + Optionally filter to a list of given dates. + Parameters ---------- in_pth : Union[pathlib.Path, str], optional @@ -54,6 +58,9 @@ def bbox_filter_gtfs( crs : str, optional What projection should the `bbox_list` be interpreted as. Defaults to "epsg:4326" for lat long. + filter_dates: list, optional + A list of dates to restrict the feed to. Not providing filter_dates + means that date filtering will not be applied. Defaults to []. Returns ------- @@ -79,6 +86,7 @@ def bbox_filter_gtfs( "crs": [crs, str], "out_pth": [out_pth, (str, pathlib.Path)], "in_pth": [in_pth, (str, pathlib.Path)], + "filter_dates": [filter_dates, list], } for k, v in typing_dict.items(): _type_defence(v[0], k, v[-1]) @@ -101,6 +109,18 @@ def bbox_filter_gtfs( feed = gk.read_feed(in_pth, dist_units=units) restricted_feed = gk.miscellany.restrict_to_area(feed=feed, area=bbox) + # optionally retrict to a date + if len(filter_dates) > 0: + _check_iterable(filter_dates, "filter_dates", list, exp_type=str) + # check date format is acceptable + [_validate_datestring(x) for x in filter_dates] + feed_dates = restricted_feed.get_dates() + diff = set(filter_dates).difference(feed_dates) + if diff: + raise ValueError(f"{diff} not present in feed dates.") + restricted_feed = gk.miscellany.restrict_to_dates( + restricted_feed, filter_dates + ) restricted_feed.write(out_pth) print(f"Filtered feed written to {out_pth}.") diff --git a/src/transport_performance/utils/defence.py b/src/transport_performance/utils/defence.py index c7956a6f..2d7a9382 100644 --- a/src/transport_performance/utils/defence.py +++ b/src/transport_performance/utils/defence.py @@ -7,6 +7,7 @@ import os import pandas as pd import warnings +from datetime import datetime def _handle_path_like( @@ -494,3 +495,39 @@ def _enforce_file_extension( warnings.warn(msg, UserWarning) path = os.path.normpath(root + f".{default_ext}") return pathlib.Path(path) + + +def _validate_datestring(date_text: str, form: str = "%Y%m%d") -> None: + """Ensure a date string conforms to the expected form. + + Parameters + ---------- + date_text : str + The datestring to be checked. + form : str, optional + The expected date format. Must be a valid C-standard date code format + compatible with datetime. See https://docs.python.org/3/library/datetim + e.html#strftime-and-strptime-format-codes for examples. Defaults to + "%Y%m%d". + + Returns + ------- + None + + Raises + ------ + ValueError + `date_text` is not in the expected `form`. + TypeError + `date_text` or `form` are not string. + + """ + _type_defence(date_text, "date_text", str) + _type_defence(form, "form", str) + try: + datetime.strptime(date_text, form) + except ValueError: + raise ValueError( + f"Incorrect date format, {date_text} should be {form}" + ) + return None diff --git a/tests/gtfs/test_gtfs_utils.py b/tests/gtfs/test_gtfs_utils.py index 8958625d..d036d846 100644 --- a/tests/gtfs/test_gtfs_utils.py +++ b/tests/gtfs/test_gtfs_utils.py @@ -15,6 +15,7 @@ _add_validation_row, filter_gtfs_around_trip, convert_pandas_to_plotly, + _validate_datestring, ) # location of GTFS test fixture @@ -45,9 +46,7 @@ def test_bbox_filter_gtfs_writes_with_bbox_list(self, bbox_list, tmpdir): tmpdir, "newport-train-station-bboxlist_gtfs.zip" ) bbox_filter_gtfs( - in_pth=os.path.join( - "tests", "data", "gtfs", "newport-20230613_gtfs.zip" - ), + GTFS_FIX_PTH, out_pth=pathlib.Path(tmp_out), bbox=bbox_list, ) @@ -71,9 +70,7 @@ def test_bbox_filter_gtfs_writes_with_bbox_gdf(self, bbox_list, tmpdir): ) bbox_filter_gtfs( - in_pth=os.path.join( - "tests", "data", "gtfs", "newport-20230613_gtfs.zip" - ), + in_pth=GTFS_FIX_PTH, out_pth=pathlib.Path(tmp_out), bbox=bbox_gdf, ) @@ -87,6 +84,63 @@ def test_bbox_filter_gtfs_writes_with_bbox_gdf(self, bbox_list, tmpdir): feed, GtfsInstance ), f"Expected class `Gtfs_Instance but found: {type(feed)}`" + def test_bbox_filter_gtfs_raises_date_not_in_gtfs(self, bbox_list, tmpdir): + """Test raises if filter date is not found within the GTFS calendar.""" + with pytest.raises( + ValueError, match="{'30000101'} not present in feed dates." + ): + bbox_filter_gtfs( + in_pth=GTFS_FIX_PTH, + out_pth=os.path.join(tmpdir, "foobar.zip"), + bbox=bbox_list, + filter_dates=["30000101"], + ) + + def test_bbox_filter_gtfs_filters_to_date(self, bbox_list, tmpdir): + """Test filtered GTFS behaves as expected.""" + out_pth = os.path.join(tmpdir, "out.zip") + # filter to date of fixture ingest + bbox_filter_gtfs( + in_pth=GTFS_FIX_PTH, + out_pth=out_pth, + bbox=bbox_list, + filter_dates=["20230613"], + ) + assert os.path.exists( + out_pth + ), f"Expected filtered GTFS was not found at {out_pth}" + # compare dates + fix = GtfsInstance(GTFS_FIX_PTH) + fix_stops_count = len(fix.feed.stops) + filtered = GtfsInstance(out_pth) + filtered_stops_count = len(filtered.feed.stops) + assert ( + fix_stops_count > filtered_stops_count + ), f"Expected fewer than {fix_stops_count} in filtered GTFS but" + " found {filtered_stops_count}" + + @pytest.mark.runinteg + def test_bbox_filter_gtfs_to_date_builds_network(self, bbox_list, tmpdir): + """Having this flagged as integration test as Java dependency.""" + # import goes here to avoid Java warnings as in test_setup.py + import r5py + + out_pth = os.path.join(tmpdir, "out.zip") + # filter to date of fixture ingest + bbox_filter_gtfs( + in_pth=GTFS_FIX_PTH, + out_pth=out_pth, + bbox=bbox_list, + filter_dates=["20230613"], + ) + net = r5py.TransportNetwork( + osm_pbf=os.path.join( + "tests", "data", "newport-2023-06-13.osm.pbf" + ), + gtfs=[out_pth], + ) + assert isinstance(net, r5py.TransportNetwork) + class Test_AddValidationRow(object): """Tests for _add_validation_row().""" @@ -203,3 +257,20 @@ def test_convert_pandas_to_plotly_on_pass(self, test_df): "Expected type plotly.graph_objects.Figure but " f"{type(fig_return)} found" ) + + +class Test_ValidateDatestring(object): + """Tests for _validate_datestring.""" + + def test_validate_datestring_raises(self): + """Check incompatible datestrings raise.""" + with pytest.raises( + ValueError, + match="Incorrect date format, 2023-10-23 should be %Y%m%d", + ): + _validate_datestring("2023-10-23") + + def test_validate_datestring_on_pass(self): + """Test that func passes if datestring matches specified form.""" + out = _validate_datestring("2023-10-23", form="%Y-%m-%d") + assert isinstance(out, type(None))