Skip to content

Commit

Permalink
add _function_pipeline(); update clean_feed to use; update tests; impl…
Browse files Browse the repository at this point in the history
…ement core cleaner
  • Loading branch information
CBROWN-ONS committed Oct 19, 2023
1 parent 5f97b53 commit ad38cac
Show file tree
Hide file tree
Showing 5 changed files with 113 additions and 103 deletions.
10 changes: 5 additions & 5 deletions src/transport_performance/gtfs/cleaners.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,19 +216,19 @@ def core_cleaners(
_type_defence(drop_zombies, "drop_zombies", bool)
# cleaning
if clean_ids:
clean_ids_gk(gtfs)
clean_ids_gk(gtfs.feed)
if clean_times:
clean_times_gk(gtfs)
clean_times_gk(gtfs.feed)
if clean_route_short_names:
clean_route_short_names_gk(gtfs)
clean_route_short_names_gk(gtfs.feed)
if drop_zombies:
try:
drop_zombies_gk(gtfs)
drop_zombies_gk(gtfs.feed)
except KeyError:
warnings.warn(
UserWarning(
"The drop_zombies cleaner was unable to operate on "
"clean_feed as the trips table ahs no sape_id column"
"clean_feed as the trips table has no shape_id column"
)
)
return None
33 changes: 33 additions & 0 deletions src/transport_performance/gtfs/gtfs_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,3 +426,36 @@ def _remove_validation_row(
list(set(gtfs.validity_df.index) - set(index))
]
return None


def _function_pipeline(
    gtfs, func_map: dict, operations: Union[dict, type(None)]
) -> None:
    """Iterate through and act on a functional pipeline.

    Parameters
    ----------
    gtfs : GtfsInstance
        The GtfsInstance to pass to each function in the pipeline.
    func_map : dict
        A mapping of operation names to callables. Each callable must
        accept the instance via a keyword argument named ``gtfs``.
    operations : Union[dict, type(None)]
        A mapping of operation names (keys of ``func_map``) to kwarg
        dicts, or ``None`` in place of a dict for no kwargs. If
        ``operations`` itself is ``None``, every function in
        ``func_map`` is run with default arguments.

    Returns
    -------
    None

    Raises
    ------
    TypeError
        If ``operations`` (or any of its values) is not a dict or None.
    KeyError
        If a key in ``operations`` is not a key of ``func_map``.

    """
    _gtfs_defence(gtfs, "gtfs")
    _type_defence(func_map, "func_map", dict)
    # fix: param name was misspelt 'oeprations' in the raised message
    _type_defence(operations, "operations", (dict, type(None)))
    if operations:
        # validate all requested operation names before running any of them
        for key in operations:
            if key not in func_map:
                raise KeyError(
                    f"'{key}' function passed to 'operations' is not a "
                    "known operation. Known operations include: "
                    f"{func_map.keys()}"
                )
        for operation in operations:
            # each value must be a kwargs dict or None (meaning no kwargs)
            _type_defence(
                operations[operation],
                f"operations[{operation}]",
                (dict, type(None)),
            )
            # normalise None -> {} locally; avoid mutating the caller's dict
            kwargs = (
                {} if operations[operation] is None else operations[operation]
            )
            func_map[operation](gtfs=gtfs, **kwargs)
    # if no operations passed, carry out all operations with defaults
    else:
        for operation in func_map:
            func_map[operation](gtfs=gtfs)
    return None
83 changes: 24 additions & 59 deletions src/transport_performance/gtfs/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,15 @@
from typing import Union, Callable
from plotly.graph_objects import Figure as PlotlyFigure

from transport_performance.gtfs.cleaners import (
clean_consecutive_stop_fast_travel_warnings,
clean_multiple_stop_fast_travel_warnings,
)
import transport_performance.gtfs.cleaners as cleaners
import transport_performance.gtfs.validators as gtfs_validators

from transport_performance.gtfs.routes import (
scrape_route_type_lookup,
get_saved_route_type_lookup,
)

from transport_performance.gtfs.gtfs_utils import _function_pipeline
from transport_performance.utils.defence import (
_is_expected_filetype,
_check_namespace_export,
Expand Down Expand Up @@ -382,35 +380,9 @@ def is_valid(self, validators: dict = None) -> pd.DataFrame:
self.validity_df = pd.DataFrame(
columns=["type", "message", "table", "rows"]
)
# carry out additional validators
if validators is not None:
# check all keys are known validators
for key in validators.keys():
if key not in VALIDATE_FEED_FUNC_MAP.keys():
raise KeyError(
f"'{key}' function passed to 'validators' is not a "
"known validator. Known validators include: "
f"{VALIDATE_FEED_FUNC_MAP.keys()}"
)
for validator in validators:
# check value is dict or none (for kwargs)
_type_defence(
validators[validator],
f"validators[{validator}]",
(dict, type(None)),
)
validators[validator] = (
{}
if validators[validator] is None
else validators[validator]
)
VALIDATE_FEED_FUNC_MAP[validator](
gtfs=self, **validators[validator]
)
# if no validators passed, carry out all validators
else:
for validator in VALIDATE_FEED_FUNC_MAP:
VALIDATE_FEED_FUNC_MAP[validator](gtfs=self)
_function_pipeline(
gtfs=self, func_map=VALIDATE_FEED_FUNC_MAP, operations=validators
)
return self.validity_df

def print_alerts(self, alert_type: str = "error") -> None:
Expand Down Expand Up @@ -461,35 +433,27 @@ def print_alerts(self, alert_type: str = "error") -> None:

return None

def clean_feed(
self, validate: bool = False, fast_travel: bool = True
) -> None:
"""Attempt to clean feed using `gtfs_kit`.
def clean_feed(self, cleansers: dict = None) -> None:
"""Clean the gtfs feed.
Parameters
----------
validate: bool, optional
Whether or not to validate the dataframe before cleaning
fast_travel: bool, optional
Whether or not to clean warnings related to fast travel.
cleansers : dict, optional
A mapping of cleansing functions and kwargs, by default None
Returns
-------
None
"""
_type_defence(fast_travel, "fast_travel", bool)
_type_defence(validate, "valiidate", bool)
# TODO: refactor function to be like is_valid()
if validate:
self.is_valid()
try:
# In cases where shape_id is missing, keyerror is raised.
# https://developers.google.com/transit/gtfs/reference#shapestxt
# shows that shapes.txt is optional file.
self.feed = self.feed.clean()
if fast_travel:
clean_consecutive_stop_fast_travel_warnings(self)
clean_multiple_stop_fast_travel_warnings(self)
except KeyError:
# TODO: Issue 74 - Improve this to clean feed when KeyError raised
print("KeyError. Feed was not cleaned.")
# DEV NOTE: Opting not to allow for validation in clean_feed().
# .is_valid() should be used before hand.
# DEV NOTE 2: Use of parm name 'cleansers' is to avoid conflicts
_type_defence(cleansers, "cleansers", (dict, type(None)))
_function_pipeline(
gtfs=self, func_map=CLEAN_FEED_FUNCTION_MAP, operations=cleansers
)
return None

def _produce_stops_map(
self, what_geoms: str, is_filtered: bool, crs: Union[int, str]
Expand Down Expand Up @@ -1514,8 +1478,9 @@ def html_report(
date = datetime.datetime.strftime(datetime.datetime.now(), "%d-%m-%Y")

# feed evaluation
# TODO: make this optional
self.clean_feed(validate=True, fast_travel=True)
# TODO: make this optional (and allow params)
self.is_valid()
self.clean_feed()
# re-validate to clean any newly raised errors/warnings
validation_dataframe = self.is_valid()

Expand Down
45 changes: 44 additions & 1 deletion tests/gtfs/test_gtfs_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,18 @@
from plotly.graph_objects import Figure as PlotlyFigure
import numpy as np

from transport_performance.gtfs.validation import GtfsInstance
from transport_performance.gtfs.validation import (
GtfsInstance,
VALIDATE_FEED_FUNC_MAP,
)
from transport_performance.gtfs.gtfs_utils import (
bbox_filter_gtfs,
_add_validation_row,
filter_gtfs_around_trip,
convert_pandas_to_plotly,
_get_validation_warnings,
_remove_validation_row,
_function_pipeline,
)

# location of GTFS test fixture
Expand Down Expand Up @@ -332,3 +336,42 @@ def test__remove_validation_row_on_pass(self):
assert (
len(found_cols) == 0
), "Invalid errors/warnings still in validity_df"


class TestFunctionPipeline(object):
    """Tests for _function_pipeline.

    Notes
    -----
    Not testing on pass here as better cases can be found in the tests for
    GtfsInstance's is_valid() and clean_feed() methods.
    """

    @pytest.mark.parametrize(
        "operations, raises, match",
        [
            # 'operations' must be a dict or None, never a bool
            (True, TypeError, ".*expected .*dict.*. Got .*bool.*"),
            # an operation key absent from the function map raises KeyError
            (
                {"not_a_valid_validator": None},
                KeyError,
                (
                    r"'not_a_valid_validator' function passed to 'operations'"
                    r" is not a known operation.*"
                ),
            ),
            # kwargs for a valid operation must be a dict or None
            (
                {"core_validation": pd.DataFrame()},
                TypeError,
                ".* expected .*dict.*NoneType.*",
            ),
        ],
    )
    def test_function_pipeline_defence(self, operations, raises, match):
        """Defensive test for _function_pipeline."""
        # any valid GtfsInstance works here: the defensive checks fire
        # before any pipeline operation is actually executed
        gtfs = GtfsInstance(GTFS_FIX_PTH)
        with pytest.raises(raises, match=match):
            _function_pipeline(gtfs, VALIDATE_FEED_FUNC_MAP, operations)
45 changes: 7 additions & 38 deletions tests/gtfs/test_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,35 +142,6 @@ def test_get_gtfs_files(self, newp_gtfs_fixture):
), f"GTFS files not as expected. Expected {expected_files},"
"found: {foundf}"

@pytest.mark.parametrize(
"validators, raises, match",
[
# invalid type for 'validators'
(True, TypeError, ".*expected .*dict.*. Got .*bool.*"),
# invalid validator
(
{"not_a_valid_validator": None},
KeyError,
(
r"'not_a_valid_validator' function passed to 'validators'"
r" is not a known validator.*"
),
),
# invalid type for kwargs for validator
(
{"core_validation": pd.DataFrame()},
TypeError,
".* expected .*dict.*NoneType.*",
),
],
)
def test_is_valid_defence(
self, newp_gtfs_fixture, validators, raises, match
):
"""Defensive tests for GtfsInstance.is_valid()."""
with pytest.raises(raises, match=match):
newp_gtfs_fixture.is_valid(validators=validators)

@pytest.mark.parametrize(
"which, validators, shape",
[
Expand Down Expand Up @@ -742,16 +713,14 @@ def dummy_func():
):
newp_gtfs_fixture.summarise_routes(return_summary="true")

@patch("builtins.print")
def test_clean_feed_defence(self, mock_print, newp_gtfs_fixture):
def test_clean_feed_defence(self, newp_gtfs_fixture):
"""Check defensive behaviours of clean_feed()."""
# Simulate condition where shapes.txt has no shape_id
newp_gtfs_fixture.feed.shapes.drop("shape_id", axis=1, inplace=True)
newp_gtfs_fixture.clean_feed()
fun_out = mock_print.mock_calls
assert fun_out == [
call("KeyError. Feed was not cleaned.")
], f"Expected print statement about KeyError. Found: {fun_out}."
with pytest.raises(
TypeError, match=r".*expected .*dict.* Got .*int.*"
):
fixt = newp_gtfs_fixture
fixt.is_valid(validators={"core_validation": None})
fixt.clean_feed(cleansers=1)

def test_summarise_trips_on_pass(self, newp_gtfs_fixture):
"""Assertions about the outputs from summarise_trips()."""
Expand Down

0 comments on commit ad38cac

Please sign in to comment.