diff --git a/jobs/gtfs-schedule-validator/Dockerfile b/jobs/gtfs-schedule-validator/Dockerfile index aee07bd441..9f1ac5762b 100644 --- a/jobs/gtfs-schedule-validator/Dockerfile +++ b/jobs/gtfs-schedule-validator/Dockerfile @@ -28,6 +28,10 @@ COPY ./gtfs-validator-4.1.0-cli.jar ${V4_1_VALIDATOR_JAR} ENV V4_2_VALIDATOR_JAR=/gtfs-validator-4.2.0-cli.jar COPY ./gtfs-validator-4.2.0-cli.jar ${V4_2_VALIDATOR_JAR} +# v5 from https://github.com/MobilityData/gtfs-validator/releases/download/v5.0.0/gtfs-validator-5.0.0-cli.jar +ENV V5_VALIDATOR_JAR=/gtfs-validator-5.0.0-cli.jar +COPY ./gtfs-validator-5.0.0-cli.jar ${V5_VALIDATOR_JAR} + WORKDIR /app COPY ./pyproject.toml /app/pyproject.toml diff --git a/jobs/gtfs-schedule-validator/README.md b/jobs/gtfs-schedule-validator/README.md index c4c0c7213c..612bb7f36e 100644 --- a/jobs/gtfs-schedule-validator/README.md +++ b/jobs/gtfs-schedule-validator/README.md @@ -22,3 +22,36 @@ available version of the validator. Instead, we use extract dates to determine w version of the validator was correct to use at the time the data was created. That way, we don't "punish" older data for not conforming to expectations that changed in the time since data creation. + +## Upgrading the Schedule Validator Version tips +If you run into trouble when adding the new validator jar, it's because the default limit for check-added-large-files in our pre-commit config is a relatively low 500 KB. That limit is meant more as an alarm for local development than as an enforcement mechanism. +You can make one commit that adds the jar and temporarily adds a higher file size threshold to the pre-commit config [like this one](https://github.com/cal-itp/data-infra/pull/2893/commits/7d40c81f2f5a2622123d4ac5dbbb064eb35565c6) and then a second commit that removes the threshold modification [like this one](https://github.com/cal-itp/data-infra/pull/2893/commits/1ec4e4a1f30ac95b9c0edffcf1f2b12e53e40733). That'll get the file through. 
+ +Remember that you need to rebuild and push the latest Docker image to `ghcr.io` before changes will be reflected in Airflow runs. + +You will need to parse the `rules.json` from the MobilityData validator release. Here is a code example for the upgrade to v5: +``` +# https://github.com/MobilityData/gtfs-validator/releases/tag/v5.0.0 +import json +import pandas as pd + +# Replace with your JSON data +with open('rules.json') as f: + data = json.load(f) +result = [] +for key in data.keys(): + # print(key) + result.append({ + 'code': data[key]['code'], + 'human_readable_description': data[key]['shortSummary'], + 'version': 'v5.0.0', + 'severity': data[key]['severityLevel'] + }) +# Create CSV +df = pd.DataFrame(result) +df.to_csv('gtfs_schedule_validator_rule_details_v5_0_0.csv', index=False) +``` + +Here is a command to test once you have appropriate GTFS zip files in the test bucket: + +`docker-compose run airflow tasks test unzip_and_validate_gtfs_schedule_hourly validate_gtfs_schedule YYYY-MM-DDTHH:MM:SS` diff --git a/jobs/gtfs-schedule-validator/gtfs-validator-5.0.0-cli.jar b/jobs/gtfs-schedule-validator/gtfs-validator-5.0.0-cli.jar new file mode 100644 index 0000000000..3a51259ea8 Binary files /dev/null and b/jobs/gtfs-schedule-validator/gtfs-validator-5.0.0-cli.jar differ diff --git a/jobs/gtfs-schedule-validator/gtfs_schedule_validator_hourly.py b/jobs/gtfs-schedule-validator/gtfs_schedule_validator_hourly.py index 0b457659c7..fbcdf1e144 100644 --- a/jobs/gtfs-schedule-validator/gtfs_schedule_validator_hourly.py +++ b/jobs/gtfs-schedule-validator/gtfs_schedule_validator_hourly.py @@ -41,6 +41,7 @@ V4_VALIDATOR_JAR = os.getenv("V4_VALIDATOR_JAR") V4_1_VALIDATOR_JAR = os.getenv("V4_1_VALIDATOR_JAR") V4_2_VALIDATOR_JAR = os.getenv("V4_2_VALIDATOR_JAR") +V5_VALIDATOR_JAR = os.getenv("V5_VALIDATOR_JAR") JAR_DEFAULT = typer.Option( default=os.environ.get(SCHEDULE_VALIDATOR_JAR_LOCATION_ENV_KEY), @@ -161,9 +162,12 @@ def execute_schedule_validator( elif extract_ts.date() < 
pendulum.Date(2024, 1, 20): versioned_jar_path = V4_1_VALIDATOR_JAR validator_version = "v4.1.0" - else: + elif extract_ts.date() < pendulum.Date(2024, 3, 27): versioned_jar_path = V4_2_VALIDATOR_JAR validator_version = "v4.2.0" + else: + versioned_jar_path = V5_VALIDATOR_JAR + validator_version = "v5.0.0" assert versioned_jar_path @@ -181,7 +185,6 @@ def execute_schedule_validator( report_path = Path(output_dir) / "report.json" system_errors_path = Path(output_dir) / "system_errors.json" - log(f"executing schedule validator: {' '.join(args)}", pbar=pbar) subprocess.run( args, diff --git a/warehouse/models/intermediate/gtfs_quality/int_gtfs_quality__schedule_validator_rule_details_unioned.sql b/warehouse/models/intermediate/gtfs_quality/int_gtfs_quality__schedule_validator_rule_details_unioned.sql index 249d21f826..28f493e522 100644 --- a/warehouse/models/intermediate/gtfs_quality/int_gtfs_quality__schedule_validator_rule_details_unioned.sql +++ b/warehouse/models/intermediate/gtfs_quality/int_gtfs_quality__schedule_validator_rule_details_unioned.sql @@ -6,6 +6,7 @@ WITH unioned AS ( ref('gtfs_schedule_validator_rule_details_v4_0_0'), ref('gtfs_schedule_validator_rule_details_v4_1_0'), ref('gtfs_schedule_validator_rule_details_v4_2_0'), + ref('gtfs_schedule_validator_rule_details_v5_0_0'), ], ) }} ), diff --git a/warehouse/models/mart/gtfs_quality/_mart_gtfs_quality.yml b/warehouse/models/mart/gtfs_quality/_mart_gtfs_quality.yml index 5d238b5f1a..082fe8d53f 100644 --- a/warehouse/models/mart/gtfs_quality/_mart_gtfs_quality.yml +++ b/warehouse/models/mart/gtfs_quality/_mart_gtfs_quality.yml @@ -90,7 +90,11 @@ models: where: validation_validator_version = 'v4.1.0' - dbt_utils.accepted_range: min_value: "DATE'2024-01-20'" + max_value: "DATE'2024-03-26'" where: validation_validator_version = 'v4.2.0' + - dbt_utils.accepted_range: + min_value: "DATE'2024-03-27'" + where: validation_validator_version = 'v5.0.0' - &schedule_feed_key name: feed_key tests: @@ -115,7 +119,7 
@@ models: tests: - not_null - accepted_values: - values: ['v2.0.0', 'v3.1.1', 'v4.0.0', 'v4.1.0', 'v4.2.0'] + values: ['v2.0.0', 'v3.1.1', 'v4.0.0', 'v4.1.0', 'v4.2.0', 'v5.0.0'] - &schedule_validator_code name: code description: | diff --git a/warehouse/seeds/_seeds.yml b/warehouse/seeds/_seeds.yml index 1f4df1d455..0f8fd7c866 100644 --- a/warehouse/seeds/_seeds.yml +++ b/warehouse/seeds/_seeds.yml @@ -151,6 +151,26 @@ seeds: tests: - not_null + - name: gtfs_schedule_validator_rule_details_v5_0_0 + description: | + A list of validation codes output by the GTFS Schedule validator, and their severities and descriptions. + This data was parsed from the contents of the rules.json file in the v5.0.0 release of the validator, + sourced from: https://github.com/MobilityData/gtfs-validator/releases/tag/v5.0.0 + columns: + - name: code + tests: + - not_null + - unique + - name: human_readable_description + tests: + - not_null + - name: version + tests: + - not_null + - name: severity + tests: + - not_null + - name: _deprecated__ntd_agency_to_organization description: | *Deprecated May 2023 in favor of the `organizations.raw_ntd_id` column entered directly in Airtable.* diff --git a/warehouse/seeds/gtfs_schedule_validator_rule_details_v5_0_0.csv b/warehouse/seeds/gtfs_schedule_validator_rule_details_v5_0_0.csv new file mode 100644 index 0000000000..963051427b --- /dev/null +++ b/warehouse/seeds/gtfs_schedule_validator_rule_details_v5_0_0.csv @@ -0,0 +1,125 @@ +code,human_readable_description,version,severity +attribution_without_role,Attribution with no role.,v5.0.0,WARNING +block_trips_with_overlapping_stop_times,Trips with the same block id have overlapping stop times.,v5.0.0,ERROR +csv_parsing_failed,Parsing of a CSV file failed.,v5.0.0,ERROR +decreasing_or_equal_stop_time_distance,Decreasing or equal `shape_dist_traveled` in `stop_times.txt`.,v5.0.0,ERROR +decreasing_shape_distance,Decreasing `shape_dist_traveled` in `shapes.txt`.,v5.0.0,ERROR 
+duplicate_fare_media,Two distinct fare media have the same fare media name and type.,v5.0.0,WARNING +duplicate_key,Duplicated entity.,v5.0.0,ERROR +duplicate_route_name,"Two distinct routes have either the same `route_short_name`, the same `route_long_name`, or the same combination of `route_short_name` and `route_long_name`.",v5.0.0,WARNING +duplicated_column,Duplicated column in CSV.,v5.0.0,ERROR +empty_column_name,A column name is empty.,v5.0.0,ERROR +empty_file,A CSV file is empty.,v5.0.0,ERROR +empty_row,A row in the input file has only spaces.,v5.0.0,WARNING +equal_shape_distance_diff_coordinates,Two consecutive points have equal `shape_dist_traveled` and different lat/lon coordinates in `shapes.txt` and the distance between the two points is greater than the 1.11m.,v5.0.0,ERROR +equal_shape_distance_diff_coordinates_distance_below_threshold,Two consecutive points have equal `shape_dist_traveled` and different lat/lon coordinates in `shapes.txt` and the distance between the two points is less than 1.11m.,v5.0.0,WARNING +equal_shape_distance_same_coordinates,Two consecutive points have equal `shape_dist_traveled` and the same lat/lon coordinates in `shapes.txt`.,v5.0.0,WARNING +expired_calendar,Dataset should not contain date ranges for services that have already expired.,v5.0.0,WARNING +fare_transfer_rule_duration_limit_type_without_duration_limit,A row from GTFS file `fare_transfer_rules.txt` has a defined `duration_limit_type` field but no `duration_limit` specified.,v5.0.0,ERROR +fare_transfer_rule_duration_limit_without_type,A row from GTFS file `fare_transfer_rules.txt` has a defined `duration_limit` field but no `duration_limit_type` specified.,v5.0.0,ERROR +fare_transfer_rule_invalid_transfer_count,A row from GTFS file `fare_transfer_rules.txt` has a defined `transfer_count` with an invalid value.,v5.0.0,ERROR +fare_transfer_rule_missing_transfer_count,"A row from `fare_transfer_rules.txt` has `from_leg_group_id` equal to `to_leg_group_id`, but has no 
`transfer_count` specified.",v5.0.0,ERROR +fare_transfer_rule_with_forbidden_transfer_count,"A row from `fare_transfer_rules.txt` has `from_leg_group_id` not equal to `to_leg_group_id`, but has `transfer_count` specified.",v5.0.0,ERROR +fast_travel_between_consecutive_stops,A transit vehicle moves too fast between two consecutive stops.,v5.0.0,WARNING +fast_travel_between_far_stops,A transit vehicle moves too fast between two far stops.,v5.0.0,WARNING +feed_expiration_date30_days,Dataset should cover at least the next 30 days of service.,v5.0.0,WARNING +feed_expiration_date7_days,Dataset should be valid for at least the next 7 days.,v5.0.0,WARNING +feed_info_lang_and_agency_lang_mismatch,Mismatching feed and agency language fields.,v5.0.0,WARNING +foreign_key_violation,Wrong foreign key.,v5.0.0,ERROR +i_o_error,Error in IO operation.,v5.0.0,ERROR +inconsistent_agency_lang,Inconsistent language among agencies.,v5.0.0,WARNING +inconsistent_agency_timezone,Inconsistent Timezone among agencies.,v5.0.0,ERROR +invalid_color,A field contains an invalid color value.,v5.0.0,ERROR +invalid_currency,A field contains a wrong currency code.,v5.0.0,ERROR +invalid_currency_amount,A currency amount field has a value that does not match the format of its corresponding currency code field.,v5.0.0,ERROR +invalid_date,A field cannot be parsed as date.,v5.0.0,ERROR +invalid_email,A field contains a malformed email address.,v5.0.0,ERROR +invalid_float,A field cannot be parsed as a floating point number.,v5.0.0,ERROR +invalid_input_files_in_subfolder,At least 1 GTFS file is in a subfolder.,v5.0.0,ERROR +invalid_integer,A field cannot be parsed as an integer.,v5.0.0,ERROR +invalid_language_code,A field contains a wrong language code.,v5.0.0,ERROR +invalid_phone_number,A field contains a malformed phone number.,v5.0.0,ERROR +invalid_row_length,Invalid csv row length.,v5.0.0,ERROR +invalid_time,A field cannot be parsed as time.,v5.0.0,ERROR +invalid_timezone,A field cannot be parsed as a 
timezone.,v5.0.0,ERROR +invalid_url,A field contains a malformed URL.,v5.0.0,ERROR +leading_or_trailing_whitespaces,The value in CSV file has leading or trailing whitespaces.,v5.0.0,WARNING +location_with_unexpected_stop_time,A location in `stops.txt` that is not a stop is referenced by some `stop_times.stop_id`.,v5.0.0,ERROR +location_without_parent_station,A location that must have `parent_station` field does not have it.,v5.0.0,ERROR +missing_bike_allowance,Ferry trips should include bike allowance information.,v5.0.0,WARNING +missing_calendar_and_calendar_date_files,Missing GTFS files `calendar.txt` and `calendar_dates.txt`.,v5.0.0,ERROR +missing_feed_contact_email_and_url,Best Practices for `feed_info.txt` suggest providing at least one of `feed_contact_email` and `feed_contact_url`.,v5.0.0,WARNING +missing_feed_info_date,"One of `feed_start_date` or `feed_end_date` is specified, but not both.",v5.0.0,WARNING +missing_level_id,`stops.level_id` is conditionally required.,v5.0.0,ERROR +missing_recommended_column,A recommended column is missing in the input file.,v5.0.0,WARNING +missing_recommended_field,A recommended field is missing.,v5.0.0,WARNING +missing_recommended_file,A recommended file is missing.,v5.0.0,WARNING +missing_required_column,A required column is missing in the input file.,v5.0.0,ERROR +missing_required_field,A required field is missing.,v5.0.0,ERROR +missing_required_file,A required file is missing.,v5.0.0,ERROR +missing_stop_name,"`stops.stop_name` is required for `location_type` equal to `0`, `1`, or `2`.",v5.0.0,ERROR +missing_timepoint_value,`stop_times.timepoint` value is missing for a record.,v5.0.0,WARNING +missing_trip_edge,Missing trip edge `arrival_time` or `departure_time`.,v5.0.0,ERROR +mixed_case_recommended_field,This field has customer-facing text and should use Mixed Case (should contain upper and lower case letters).,v5.0.0,WARNING +more_than_one_entity,More than one row in CSV.,v5.0.0,WARNING +new_line_in_value,New line or 
carriage return in a value in CSV file.,v5.0.0,ERROR +non_ascii_or_non_printable_char,Non ascii or non printable char in ID field.,v5.0.0,WARNING +number_out_of_range,Out of range value.,v5.0.0,ERROR +overlapping_frequency,Trip frequencies overlap.,v5.0.0,ERROR +pathway_dangling_generic_node,A generic node has only one incident location in a pathway graph.,v5.0.0,WARNING +pathway_loop,A pathway starts and ends at the same location.,v5.0.0,WARNING +pathway_to_platform_with_boarding_areas,A pathway has an endpoint that is a platform which has boarding areas.,v5.0.0,ERROR +pathway_to_wrong_location_type,A pathway has an endpoint that is a station.,v5.0.0,ERROR +pathway_unreachable_location,A location is not reachable at least in one direction: from the entrances or to the exits.,v5.0.0,ERROR +platform_without_parent_station,A platform has no `parent_station` field set.,v5.0.0,INFO +point_near_origin,"A point is too close to origin `(0, 0)`.",v5.0.0,ERROR +point_near_pole,A point is too close to the North or South Pole.,v5.0.0,ERROR +route_both_short_and_long_name_missing,Both `route_short_name` and `route_long_name` are missing for a route.,v5.0.0,ERROR +route_color_contrast,Insufficient route color contrast.,v5.0.0,WARNING +route_long_name_contains_short_name,Long name should not contain short name for a single route.,v5.0.0,WARNING +route_networks_specified_in_more_than_one_file,Indicates that route network identifiers are specified across multiple files.,v5.0.0,ERROR +route_short_name_too_long,Short name of a route is too long (more than 12 characters).,v5.0.0,WARNING +runtime_exception_in_loader_error,RuntimeException while loading GTFS dataset in memory.,v5.0.0,ERROR +runtime_exception_in_validator_error,RuntimeException while validating GTFS archive.,v5.0.0,ERROR +same_name_and_description_for_route,Same name and description for route.,v5.0.0,WARNING +same_name_and_description_for_stop,Same name and description for stop.,v5.0.0,WARNING 
+same_route_and_agency_url,Same `routes.route_url` and `agency.agency_url`.,v5.0.0,WARNING +same_stop_and_agency_url,Same `stops.stop_url` and `agency.agency_url`.,v5.0.0,WARNING +same_stop_and_route_url,Same `stops.stop_url` and `routes.route_url`.,v5.0.0,WARNING +start_and_end_range_equal,Two date or time fields are equal.,v5.0.0,ERROR +start_and_end_range_out_of_order,Two date or time fields are out of order.,v5.0.0,ERROR +station_with_parent_station,A station has `parent_station` field set.,v5.0.0,ERROR +stop_has_too_many_matches_for_shape,"Stop entry that has many potential matches to the trip's path of travel, as defined by the shape entry in `shapes.txt`.",v5.0.0,WARNING +stop_time_timepoint_without_times,`arrival_time` or `departure_time` not specified for timepoint.,v5.0.0,ERROR +stop_time_with_arrival_before_previous_departure_time,Backwards time travel between stops in `stop_times.txt`,v5.0.0,ERROR +stop_time_with_only_arrival_or_departure_time,Missing `stop_times.arrival_time` or `stop_times.departure_time`.,v5.0.0,ERROR +stop_too_far_from_shape,Stop too far from trip shape.,v5.0.0,WARNING +stop_too_far_from_shape_using_user_distance,Stop time too far from shape.,v5.0.0,WARNING +stop_without_location,"`stop_lat` and/or `stop_lon` is missing for stop with `location_type` equal to`0`, `1`, or `2`",v5.0.0,ERROR +stop_without_stop_time,A stop in `stops.txt` is not referenced by any `stop_times.stop_id`.,v5.0.0,WARNING +stop_without_zone_id,Stop without value for `stops.zone_id` contained in a route with a zone-dependent fare rule.,v5.0.0,INFO +stops_match_shape_out_of_order,Two stop entries are different than their arrival-departure order defined by `shapes.txt`.,v5.0.0,WARNING +thread_execution_error,ExecutionException during multithreaded validation,v5.0.0,ERROR +timeframe_only_start_or_end_time_specified,A row from `timeframes.txt` was found with only one of `start_time` and `end_time` specified.,v5.0.0,ERROR +timeframe_overlap,Two entries in 
`timeframes.txt` with the same `timeframe_group_id` and `service_id` have overlapping time intervals.,v5.0.0,ERROR +timeframe_start_or_end_time_greater_than_twenty_four_hours,A time in `timeframes.txt` is greater than `24:00:00`.,v5.0.0,ERROR +too_many_rows,A CSV file has too many rows.,v5.0.0,ERROR +transfer_with_invalid_stop_location_type,A stop id field from GTFS file `transfers.txt` references a stop that has a `location_type` other than 0 or 1 (aka Stop/Platform or Station).,v5.0.0,ERROR +transfer_with_invalid_trip_and_route,A trip id field from GTFS file `transfers.txt` references a route that does not match its `trips.txt` `route_id`.,v5.0.0,ERROR +transfer_with_invalid_trip_and_stop,A trip id field from GTFS file `transfers.txt` references a stop that is not included in the referenced trip's stop-times.,v5.0.0,ERROR +transfer_with_suspicious_mid_trip_in_seat,A trip id field from GTFS file `transfers.txt` with an in-seat transfer type references a stop that is not in the expected position in the trip's stop-times.,v5.0.0,WARNING +translation_foreign_key_violation,An entity with the given `record_id` and `record_sub_id` cannot be found in the referenced table.,v5.0.0,ERROR +translation_unexpected_value,A field in a translations row has value but must be empty.,v5.0.0,ERROR +translation_unknown_table_name,A translation references an unknown or missing GTFS table.,v5.0.0,WARNING +trip_coverage_not_active_for_next7_days,Trips data should be valid for at least the next seven days.,v5.0.0,WARNING +trip_distance_exceeds_shape_distance,The distance between the last shape point and last stop point is greater than or equal to the 11.1m threshold.,v5.0.0,ERROR +trip_distance_exceeds_shape_distance_below_threshold,The distance between the last shape point and last stop point is less than the 11.1m threshold.,v5.0.0,WARNING +u_r_i_syntax_error,A string could not be parsed as a URI reference.,v5.0.0,ERROR +unexpected_enum_value,An enum has an unexpected 
value.,v5.0.0,WARNING +unknown_column,A column name is unknown.,v5.0.0,INFO +unknown_file,A file is unknown.,v5.0.0,INFO +unusable_trip,Trips must have more than one stop to be usable.,v5.0.0,WARNING +unused_parent_station,Unused parent station.,v5.0.0,INFO +unused_shape,Shape is not used in GTFS file `trips.txt`.,v5.0.0,WARNING +unused_trip,Trip is not used in `stop_times.txt`,v5.0.0,WARNING +wrong_parent_location_type,Incorrect type of the parent location.,v5.0.0,ERROR