Skip to content

Commit

Permalink
chore: merge conflicts
Browse files Browse the repository at this point in the history
  • Loading branch information
SergioRec committed Oct 16, 2023
2 parents 65f8c97 + 21a58ac commit e559908
Show file tree
Hide file tree
Showing 14 changed files with 1,181 additions and 13 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

# exception for test fixtures/package data
!tests/data/newport-2023-06-13.osm.pbf
!tests/data/chester-20230816-small_gtfs.zip
!tests/data/gtfs/newport-20230613_gtfs.zip
!src/transport_performance/data/gtfs/route_lookup.pkl
!tests/data/gtfs/report/html_template.html
Expand Down
78 changes: 78 additions & 0 deletions notebooks/gtfs/stop_id_check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
"""A notebook containing viable code for additional stop_id validation."""
# %%
# IMPORTS
from transport_performance.gtfs.validation import GtfsInstance
from transport_performance.gtfs.gtfs_utils import _add_validation_row


# %%
# SAMPLE FUNCTION
def validate_stops(gtfs: "GtfsInstance") -> None:
    """Validate stop_ids across the stops and stop_times tables.

    The unique stop_ids of both tables are compared and a warning is added
    to the validation results (through `_add_validation_row`) for each
    direction in which unmatched stop_ids are found:

    * stop_ids used in stop_times that are missing from stops
    * stop_ids defined in stops that are never used in stop_times

    stop_ids that appear in the `parent_station` column of stops are
    removed from the stops side before the comparison.

    Parameters
    ----------
    gtfs : GtfsInstance
        The GTFS instance to validate

    Returns
    -------
    None

    """
    stops = gtfs.feed.stops.copy()
    stop_times = gtfs.feed.stop_times.copy()

    # determine which stops are parent stops and remove them; the guard
    # avoids an AttributeError when the table has no parent_station column
    if "parent_station" in stops.columns:
        parents = stops["parent_station"].unique()
        stops = stops[~stops["stop_id"].isin(parents)]

    # get unique stop_ids from both tables as dataframes
    stops_ids = stops[["stop_id"]].drop_duplicates()
    stop_t_ids = stop_times[["stop_id"]].drop_duplicates()

    # the merge indicator records which side(s) each stop_id came from:
    # "left_only" -> only in stops, "right_only" -> only in stop_times
    merged_ids = stops_ids.merge(
        stop_t_ids, how="outer", on="stop_id", indicator=True
    )
    only_in_stops = merged_ids.loc[
        merged_ids["_merge"] == "left_only", "stop_id"
    ].unique()
    only_in_stop_times = merged_ids.loc[
        merged_ids["_merge"] == "right_only", "stop_id"
    ].unique()

    # warn only when unmatched ids actually exist (previously a row count
    # was used here, which flagged every non-empty feed)
    if len(only_in_stop_times) > 0:
        impacted_rows = list(
            gtfs.feed.stop_times[
                gtfs.feed.stop_times.stop_id.isin(only_in_stop_times)
            ].index
        )
        _add_validation_row(
            gtfs=gtfs,
            _type="warning",
            message="stop_id's exist in stop_times but not in stops",
            table="stop_times",
            rows=impacted_rows,
        )

    if len(only_in_stops) > 0:
        impacted_rows = list(
            gtfs.feed.stops[
                gtfs.feed.stops.stop_id.isin(only_in_stops)
            ].index
        )
        _add_validation_row(
            gtfs=gtfs,
            _type="warning",
            message="stop_id's exist in stops but not in stop_times",
            table="stops",
            rows=impacted_rows,
        )
    return None
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ cartopy
folium
mapclassify
seaborn
haversine
pretty_html_table
kaleido
numpy>=1.25.0 # test suite will fail if user installed lower than this
Expand Down
166 changes: 166 additions & 0 deletions src/transport_performance/gtfs/cleaners.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
"""A set of functions that clean the gtfs data."""
from typing import Union

import numpy as np

from transport_performance.utils.defence import _gtfs_defence, _check_list


def drop_trips(gtfs, trip_id: Union[str, list, np.ndarray]) -> None:
    """Remove one or more trips from a GtfsInstance object.

    Rows matching the given trip ID(s) are removed from the `trips` and
    `stop_times` tables of the feed, `drop_zombies()` is then called on the
    feed and `gtfs.pre_processed_trips` is rebuilt.

    Parameters
    ----------
    gtfs : GtfsInstance
        The GtfsInstance object to drop the trip(s) from
    trip_id : Union[str, list, np.ndarray]
        The trip ID(s) of the trip(s) to be dropped from the gtfs data.

    Returns
    -------
    None

    Raises
    ------
    TypeError
        If `trip_id` is not a str, list or np.ndarray.

    """
    # defences
    _gtfs_defence(gtfs, "gtfs")
    accepted_types = (str, list, np.ndarray)
    if not isinstance(trip_id, accepted_types):
        raise TypeError(
            f"'trip_id' received type: {type(trip_id)}. "
            "Expected types: [str, list, np.ndarray]"
        )

    # normalise the input to a plain list, since _check_list is only
    # given lists here (single strings are wrapped, arrays are converted)
    if isinstance(trip_id, str):
        ids_to_drop = [trip_id]
    elif isinstance(trip_id, np.ndarray):
        ids_to_drop = list(trip_id)
    else:
        ids_to_drop = trip_id

    # every trip id must be a string
    _check_list(
        ls=ids_to_drop,
        param_nm="trip_id",
        check_elements=True,
        exp_type=str,
    )

    # keep only the records that do not belong to the dropped trips
    trips_to_keep = ~gtfs.feed.trips["trip_id"].isin(ids_to_drop)
    gtfs.feed.trips = gtfs.feed.trips[trips_to_keep]

    stop_times_to_keep = ~gtfs.feed.stop_times["trip_id"].isin(ids_to_drop)
    gtfs.feed.stop_times = gtfs.feed.stop_times[stop_times_to_keep]

    # finish cleaning up deleted trips
    gtfs.feed = gtfs.feed.drop_zombies()

    # re-run so that summaries can be updated
    gtfs.pre_processed_trips = gtfs._preprocess_trips_and_routes()
    return None


def clean_consecutive_stop_fast_travel_warnings(
    gtfs, validate: bool = False
) -> None:
    """Drop trips flagged by 'Fast Travel Between Consecutive Stops'.

    Looks up the 'Fast Travel Between Consecutive Stops' warning in
    `gtfs.validity_df`, finds the trips it refers to in
    `gtfs.full_stop_schedule`, drops them from the feed with `drop_trips`
    and removes them from `gtfs.full_stop_schedule`. If no such warning is
    present nothing is changed. `validity_df` itself is not modified here.

    Parameters
    ----------
    gtfs : GtfsInstance
        The GtfsInstance to clean warnings within
    validate : bool, optional
        Whether or not to validate the gtfs before carrying out this cleaning
        operation

    Returns
    -------
    None

    Raises
    ------
    AttributeError
        If the gtfs has no `validity_df` attribute and `validate` is False.

    """
    # defences
    _gtfs_defence(gtfs, "gtfs")
    if "validity_df" not in vars(gtfs) and not validate:
        raise AttributeError(
            "The gtfs has not been validated, therefore no "
            "warnings can be identified. You can pass "
            "validate=True to this function to validate the "
            "gtfs."
        )

    if validate:
        gtfs.is_valid()

    matching_warnings = gtfs.validity_df[
        gtfs.validity_df["message"]
        == "Fast Travel Between Consecutive Stops"
    ]

    if matching_warnings.empty:
        return None

    # Only the first matching warning is used. Its fourth column is passed
    # to .loc below, so it is expected to hold row labels of
    # full_stop_schedule (presumably the 'rows' column -- TODO confirm).
    flagged_rows = matching_warnings.iloc[0, 3]
    trip_ids = gtfs.full_stop_schedule.loc[flagged_rows].trip_id.unique()

    # drop trips from tables
    drop_trips(gtfs=gtfs, trip_id=trip_ids)
    gtfs.full_stop_schedule = gtfs.full_stop_schedule[
        ~gtfs.full_stop_schedule["trip_id"].isin(trip_ids)
    ]
    return None


def clean_multiple_stop_fast_travel_warnings(
    gtfs, validate: bool = False
) -> None:
    """Drop trips flagged by 'Fast Travel Over Multiple Stops'.

    Looks up the 'Fast Travel Over Multiple Stops' warning in
    `gtfs.validity_df`, finds the trips it refers to in
    `gtfs.multiple_stops_invalid`, drops them from the feed with
    `drop_trips` and removes them from `gtfs.multiple_stops_invalid`. If no
    such warning is present nothing is changed. `validity_df` itself is not
    modified here.

    Parameters
    ----------
    gtfs : GtfsInstance
        The GtfsInstance to clean warnings within
    validate : bool, optional
        Whether or not to validate the gtfs before carrying out this cleaning
        operation

    Returns
    -------
    None

    Raises
    ------
    AttributeError
        If the gtfs has no `validity_df` attribute and `validate` is False.

    """
    # defences
    _gtfs_defence(gtfs, "gtfs")
    if "validity_df" not in vars(gtfs) and not validate:
        raise AttributeError(
            "The gtfs has not been validated, therefore no "
            "warnings can be identified. You can pass "
            "validate=True to this function to validate the "
            "gtfs."
        )

    if validate:
        gtfs.is_valid()

    matching_warnings = gtfs.validity_df[
        gtfs.validity_df["message"] == "Fast Travel Over Multiple Stops"
    ]

    if matching_warnings.empty:
        return None

    # Only the first matching warning is used. Its fourth column is passed
    # to .loc below, so it is expected to hold row labels of
    # multiple_stops_invalid (presumably the 'rows' column -- TODO confirm).
    flagged_rows = matching_warnings.iloc[0, 3]
    trip_ids = gtfs.multiple_stops_invalid.loc[flagged_rows].trip_id.unique()

    # drop trips from tables
    drop_trips(gtfs=gtfs, trip_id=trip_ids)
    gtfs.multiple_stops_invalid = gtfs.multiple_stops_invalid[
        ~gtfs.multiple_stops_invalid["trip_id"].isin(trip_ids)
    ]
    return None
Loading

0 comments on commit e559908

Please sign in to comment.