Skip to content

Commit

Permalink
Merge pull request #3342 from cal-itp/ntd_additions
Browse files Browse the repository at this point in the history
Updates the sqlfluff version, makes minor linting fixes, and updates GitHub CI caching
  • Loading branch information
vevetron authored May 16, 2024
2 parents 2b05e85 + 9cd7828 commit 7c546ca
Show file tree
Hide file tree
Showing 13 changed files with 50 additions and 17 deletions.
6 changes: 6 additions & 0 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@ jobs:
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
- uses: actions/cache@v3
with:
path: ~/.cache/pre-commit
key: ${{ runner.os }}-pre-commit-${{ hashFiles('.pre-commit-config.yaml') }}
restore-keys: |
${{ runner.os }}-pre-commit-
- uses: pre-commit/[email protected]
- uses: crate-ci/[email protected] # Set back to `master` after #967 on the typos repo is fixed
with:
Expand Down
8 changes: 4 additions & 4 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,10 @@ repos:
args: ["-ll", "--skip=B108,B608,B310,B303,B324,B113"]
files: .py$
- repo: https://github.com/sqlfluff/sqlfluff
rev: 2.1.1
rev: 3.0.6
hooks:
- id: sqlfluff-lint
additional_dependencies: ['dbt-bigquery', 'sqlfluff-templater-dbt']
additional_dependencies: ['dbt-bigquery==1.8.1', 'sqlfluff-templater-dbt']
# skip: L003 indentation stuff -- TODO
# skip: L010 uppercase keywords -- TODO
# skip: L011 implicit/explicit aliasing of tables? -- TODO
Expand All @@ -55,8 +55,8 @@ repos:
# skip: joins should not include subqueries -- TODO
# skip: use left join instead of right join -- TODO
# skip: use single quotes instead of double -- TODO
args: [--dialect, "bigquery", --ignore, "parsing,templating",--exclude-rules, "L003,L010,L011,L022,L036,L038,L039,L059,L016,L029,L027,L032,L034,L014,L042,L055,L064"]
files: "warehouse/models" # TODO: should also lint tests etc. but we want to skip packages at least
# args: [-vvv, --dialect, "bigquery", --ignore, "parsing,templating",--exclude-rules, "L003,L010,L011,L022,L036,L038,L039,L059,L016,L029,L027,L032,L034,L014,L042,L055,L064"]
# files: "warehouse/models" # TODO: should also lint tests etc. but we want to skip packages at least
- repo: https://github.com/kynan/nbstripout
rev: 0.6.1
hooks:
Expand Down
10 changes: 10 additions & 0 deletions .sqlfluff
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
[sqlfluff]
dialect = bigquery
exclude_rules = AL09,L003,L010,L011,L022,L036,L038,L039,L059,L016,L029,L027,L032,L034,L014,L042,L055,L064,AM04,ST09
ignore = templating, parsing

# Optional: If you have a specific dbt profile you want to use:
profiles_dir = ./warehouse/

[sqlfluff:templater:dbt]
project_dir = ./warehouse/models/
1 change: 1 addition & 0 deletions .sqlfluffignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
warehouse/models/mart/ntd_validation/fct_ntd_rr20_service_checks.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
operator: operators.ExternalTable
bucket: gs://calitp-ntd-data-products
prefix_bucket: true
post_hook: |
SELECT *
FROM `{{ get_project_id() }}`.external_ntd_data_products.annual_database_service
LIMIT 1;
source_objects:
- "annual_database_service/*.jsonl.gz"
destination_project_dataset_table: "external_ntd_data_products.annual_database_service"
source_format: NEWLINE_DELIMITED_JSON
use_bq_client: true
hive_options:
mode: CUSTOM
require_partition_filter: false
source_uri_prefix: "annual-database-service/{dt:DATE}/{ts:TIMESTAMP}/{year:INTEGER}/"
2 changes: 1 addition & 1 deletion script/scrape_ntd.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
then
poetry run python scrape_ntd.py annual-database-agency-information 2021 https://www.transit.dot.gov/sites/fta.dot.gov/files/2022-10/2021%20Agency%20Information.xlsx
poetry run python scrape_ntd.py monthly-ridership-with-adjustments 2024 https://www.transit.dot.gov/sites/fta.dot.gov/files/2024-04/February%202024%20Complete%20Monthly%20Ridership%20%28with%20adjustments%20and%20estimates%29_240402_0.xlsx
poetry run python scrape_ntd.py annual-database-service 2022 https://www.transit.dot.gov/sites/fta.dot.gov/files/2024-04/2022%20Service.xlsx
"""

import gzip
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,31 +40,31 @@ int_gtfs_quality__feed_aggregator AS (
AND check IN ({{ schedule_feed_on_transitland() }}, {{ schedule_feed_on_mobility_database() }})
THEN
CASE
WHEN daily_scraped_urls.aggregator IS NOT null THEN {{ guidelines_pass_status() }}
WHEN daily_scraped_urls.aggregator IS NOT NULL THEN {{ guidelines_pass_status() }}
WHEN guideline_index.date < first_check_date THEN {{ guidelines_na_too_early_status() }}
WHEN daily_scraped_urls.aggregator IS NULL THEN {{ guidelines_fail_status() }}
END
WHEN guideline_index.has_rt_url_tu
AND check IN ({{ trip_updates_feed_on_transitland() }}, {{ trip_updates_feed_on_mobility_database() }})
THEN
CASE
WHEN daily_scraped_urls.aggregator IS NOT null THEN {{ guidelines_pass_status() }}
WHEN daily_scraped_urls.aggregator IS NOT NULL THEN {{ guidelines_pass_status() }}
WHEN guideline_index.date < first_check_date THEN {{ guidelines_na_too_early_status() }}
WHEN daily_scraped_urls.aggregator IS NULL THEN {{ guidelines_fail_status() }}
END
WHEN guideline_index.has_rt_url_vp
AND check IN ({{ vehicle_positions_feed_on_transitland() }}, {{ vehicle_positions_feed_on_mobility_database() }})
THEN
CASE
WHEN daily_scraped_urls.aggregator IS NOT null THEN {{ guidelines_pass_status() }}
WHEN daily_scraped_urls.aggregator IS NOT NULL THEN {{ guidelines_pass_status() }}
WHEN guideline_index.date < first_check_date THEN {{ guidelines_na_too_early_status() }}
WHEN daily_scraped_urls.aggregator IS NULL THEN {{ guidelines_fail_status() }}
END
WHEN guideline_index.has_rt_url_sa
AND check IN ({{ service_alerts_feed_on_transitland() }}, {{ service_alerts_feed_on_mobility_database() }})
THEN
CASE
WHEN daily_scraped_urls.aggregator IS NOT null THEN {{ guidelines_pass_status() }}
WHEN daily_scraped_urls.aggregator IS NOT NULL THEN {{ guidelines_pass_status() }}
WHEN guideline_index.date < first_check_date THEN {{ guidelines_na_too_early_status() }}
WHEN daily_scraped_urls.aggregator IS NULL THEN {{ guidelines_fail_status() }}
END
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ tts_issue_feeds AS (
SELECT
feed_key,
-- When there is no tts_stop_name field for a given stop_name, or tts_stop_name is identical to stop_name, we proceed to run a few tests
COUNTIF((tts_stop_name IS null OR tts_stop_name = stop_name)
COUNTIF((tts_stop_name IS NULL OR tts_stop_name = stop_name)
AND (
-- Test 1: check for abbreviations that need to be spelled out, including directions (n, sb) and ROW types (st, rd)
---- EXISTS function returns true if the stop_name contains any of the listed "no-no words", and false if not
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ scheduled_trips_version_history AS (
INNER JOIN feed_version_history t3
ON t3.feed_key = t1.feed_key
-- Since we are comparing feeds with their previous version, omit the initial version of every feed - no comparison is possible
WHERE t3.prev_feed_key IS NOT null
WHERE t3.prev_feed_key IS NOT NULL
),

-- The self-outer-join, with all of the coalescing, allows us to see:
Expand Down Expand Up @@ -83,9 +83,9 @@ improper_trips_updates AS (
SELECT base64_url,
feed_key,
-- A new trip is being added
COUNT(CASE WHEN prev_trip_id IS null THEN 1 END) AS trip_added,
COUNT(CASE WHEN prev_trip_id IS NULL THEN 1 END) AS trip_added,
-- An existing trip is being removed
COUNT(CASE WHEN trip_id IS null THEN 1 END) AS trip_removed,
COUNT(CASE WHEN trip_id IS NULL THEN 1 END) AS trip_removed,
-- A trip's stop times are being changed
COUNT(CASE WHEN trip_stop_times_hash != prev_trip_stop_times_hash THEN 1 END) AS stop_times_changed,
-- A trip's stop location is being changed
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ daily_earliest_service_expiration AS (
feed_key,
MIN(service_end_date) AS earliest_service_end_date
FROM feed_service_expiration
WHERE service_id IS NOT null
WHERE service_id IS NOT NULL
GROUP BY 1
),

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ feed_has_fares AS (
)) AS has_fares
FROM files
WHERE parse_success
AND feed_key IS NOT null
AND feed_key IS NOT NULL
GROUP BY feed_key
),

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ final_update_with_status AS (
join_with_flag AS (
SELECT
final_update_with_status._payments_key,
COALESCE(final_update_without_status.aggregation_id IS NOT NULL, False) AS final_authorisation_has_null_status
COALESCE(final_update_without_status.aggregation_id IS NOT NULL, FALSE) AS final_authorisation_has_null_status
FROM final_update_with_status
LEFT OUTER JOIN final_update_without_status
ON final_update_with_status.aggregation_id = final_update_without_status.aggregation_id
Expand Down
2 changes: 1 addition & 1 deletion warehouse/tests/mart/gtfs/validate_fct_daily_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ check_cts AS (
FROM trip_check_cts
LEFT JOIN stop_check_cts
ON trip_check_cts.service_date = stop_check_cts.service_date
ORDER BY service_date DESC
ORDER BY service_date DESC --noqa: AM06
)

SELECT *
Expand Down

0 comments on commit 7c546ca

Please sign in to comment.