Skip to content

Commit

Permalink
Merge pull request #3342 from cal-itp/ntd_additions
Browse files Browse the repository at this point in the history
Updates the sqlfluff version, makes minor linting fixes, and updates GitHub CI caching
  • Loading branch information
vevetron authored May 16, 2024
2 parents 2b05e85 + 9cd7828 commit 7c546ca
Show file tree
Hide file tree
Showing 13 changed files with 50 additions and 17 deletions.
6 changes: 6 additions & 0 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@ jobs:
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
- uses: actions/cache@v3
with:
path: ~/.cache/pre-commit
key: ${{ runner.os }}-pre-commit-${{ hashFiles('.pre-commit-config.yaml') }}
restore-keys: |
${{ runner.os }}-pre-commit-
- uses: pre-commit/[email protected]
- uses: crate-ci/[email protected] # Set back to `master` after #967 on the typos repo is fixed
with:
Expand Down
8 changes: 4 additions & 4 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,10 @@ repos:
args: ["-ll", "--skip=B108,B608,B310,B303,B324,B113"]
files: .py$
- repo: https://github.com/sqlfluff/sqlfluff
rev: 2.1.1
rev: 3.0.6
hooks:
- id: sqlfluff-lint
additional_dependencies: ['dbt-bigquery', 'sqlfluff-templater-dbt']
additional_dependencies: ['dbt-bigquery==1.8.1', 'sqlfluff-templater-dbt']
# skip: L003 indentation stuff -- TODO
# skip: L010 uppercase keywords -- TODO
# skip: L011 implicit/explicit aliasing of tables? -- TODO
Expand All @@ -55,8 +55,8 @@ repos:
# skip: joins should not include subqueries -- TODO
# skip: use left join instead of right join -- TODO
# skip: use single quotes instead of double -- TODO
args: [--dialect, "bigquery", --ignore, "parsing,templating",--exclude-rules, "L003,L010,L011,L022,L036,L038,L039,L059,L016,L029,L027,L032,L034,L014,L042,L055,L064"]
files: "warehouse/models" # TODO: should also lint tests etc. but we want to skip packages at least
# args: [-vvv, --dialect, "bigquery", --ignore, "parsing,templating",--exclude-rules, "L003,L010,L011,L022,L036,L038,L039,L059,L016,L029,L027,L032,L034,L014,L042,L055,L064"]
# files: "warehouse/models" # TODO: should also lint tests etc. but we want to skip packages at least
- repo: https://github.com/kynan/nbstripout
rev: 0.6.1
hooks:
Expand Down
10 changes: 10 additions & 0 deletions .sqlfluff
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
[sqlfluff]
dialect = bigquery
exclude_rules = AL09,L003,L010,L011,L022,L036,L038,L039,L059,L016,L029,L027,L032,L034,L014,L042,L055,L064,AM04,ST09
ignore = templating, parsing

# Optional: If you have a specific dbt profile you want to use:
profiles_dir = ./warehouse/

[sqlfluff:templater:dbt]
project_dir = ./warehouse/models/
1 change: 1 addition & 0 deletions .sqlfluffignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
warehouse/models/mart/ntd_validation/fct_ntd_rr20_service_checks.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
operator: operators.ExternalTable
bucket: gs://calitp-ntd-data-products
prefix_bucket: true
post_hook: |
SELECT *
FROM `{{ get_project_id() }}`.external_ntd_data_products.annual_database_service
LIMIT 1;
source_objects:
- "annual_database_service/*.jsonl.gz"
destination_project_dataset_table: "external_ntd_data_products.annual_database_service"
source_format: NEWLINE_DELIMITED_JSON
use_bq_client: true
hive_options:
mode: CUSTOM
require_partition_filter: false
source_uri_prefix: "annual-database-service/{dt:DATE}/{ts:TIMESTAMP}/{year:INTEGER}/"
2 changes: 1 addition & 1 deletion script/scrape_ntd.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
then
poetry run python scrape_ntd.py annual-database-agency-information 2021 https://www.transit.dot.gov/sites/fta.dot.gov/files/2022-10/2021%20Agency%20Information.xlsx
poetry run python scrape_ntd.py monthly-ridership-with-adjustments 2024 https://www.transit.dot.gov/sites/fta.dot.gov/files/2024-04/February%202024%20Complete%20Monthly%20Ridership%20%28with%20adjustments%20and%20estimates%29_240402_0.xlsx
poetry run python scrape_ntd.py annual-database-service 2022 https://www.transit.dot.gov/sites/fta.dot.gov/files/2024-04/2022%20Service.xlsx
"""

import gzip
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,31 +40,31 @@ int_gtfs_quality__feed_aggregator AS (
AND check IN ({{ schedule_feed_on_transitland() }}, {{ schedule_feed_on_mobility_database() }})
THEN
CASE
WHEN daily_scraped_urls.aggregator IS NOT null THEN {{ guidelines_pass_status() }}
WHEN daily_scraped_urls.aggregator IS NOT NULL THEN {{ guidelines_pass_status() }}
WHEN guideline_index.date < first_check_date THEN {{ guidelines_na_too_early_status() }}
WHEN daily_scraped_urls.aggregator IS NULL THEN {{ guidelines_fail_status() }}
END
WHEN guideline_index.has_rt_url_tu
AND check IN ({{ trip_updates_feed_on_transitland() }}, {{ trip_updates_feed_on_mobility_database() }})
THEN
CASE
WHEN daily_scraped_urls.aggregator IS NOT null THEN {{ guidelines_pass_status() }}
WHEN daily_scraped_urls.aggregator IS NOT NULL THEN {{ guidelines_pass_status() }}
WHEN guideline_index.date < first_check_date THEN {{ guidelines_na_too_early_status() }}
WHEN daily_scraped_urls.aggregator IS NULL THEN {{ guidelines_fail_status() }}
END
WHEN guideline_index.has_rt_url_vp
AND check IN ({{ vehicle_positions_feed_on_transitland() }}, {{ vehicle_positions_feed_on_mobility_database() }})
THEN
CASE
WHEN daily_scraped_urls.aggregator IS NOT null THEN {{ guidelines_pass_status() }}
WHEN daily_scraped_urls.aggregator IS NOT NULL THEN {{ guidelines_pass_status() }}
WHEN guideline_index.date < first_check_date THEN {{ guidelines_na_too_early_status() }}
WHEN daily_scraped_urls.aggregator IS NULL THEN {{ guidelines_fail_status() }}
END
WHEN guideline_index.has_rt_url_sa
AND check IN ({{ service_alerts_feed_on_transitland() }}, {{ service_alerts_feed_on_mobility_database() }})
THEN
CASE
WHEN daily_scraped_urls.aggregator IS NOT null THEN {{ guidelines_pass_status() }}
WHEN daily_scraped_urls.aggregator IS NOT NULL THEN {{ guidelines_pass_status() }}
WHEN guideline_index.date < first_check_date THEN {{ guidelines_na_too_early_status() }}
WHEN daily_scraped_urls.aggregator IS NULL THEN {{ guidelines_fail_status() }}
END
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ tts_issue_feeds AS (
SELECT
feed_key,
-- When there is no tts_stop_name field for a given stop_name, or tts_stop_name is identical to stop_name, we proceed to run a few tests
COUNTIF((tts_stop_name IS null OR tts_stop_name = stop_name)
COUNTIF((tts_stop_name IS NULL OR tts_stop_name = stop_name)
AND (
-- Test 1: check for abbreviations that need to be spelled out, including directions (n, sb) and ROW types (st, rd)
---- EXISTS function returns true if the stop_name contains any of the listed "no-no words", and false if not
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ scheduled_trips_version_history AS (
INNER JOIN feed_version_history t3
ON t3.feed_key = t1.feed_key
-- Since we are comparing feeds with their previous version, omit the initial version of every feed - no comparison is possible
WHERE t3.prev_feed_key IS NOT null
WHERE t3.prev_feed_key IS NOT NULL
),

-- The self-outer-join, with all of the coalescing, allows us to see:
Expand Down Expand Up @@ -83,9 +83,9 @@ improper_trips_updates AS (
SELECT base64_url,
feed_key,
-- A new trip is being added
COUNT(CASE WHEN prev_trip_id IS null THEN 1 END) AS trip_added,
COUNT(CASE WHEN prev_trip_id IS NULL THEN 1 END) AS trip_added,
-- An existing trip is being removed
COUNT(CASE WHEN trip_id IS null THEN 1 END) AS trip_removed,
COUNT(CASE WHEN trip_id IS NULL THEN 1 END) AS trip_removed,
-- A trip's stop times are being changed
COUNT(CASE WHEN trip_stop_times_hash != prev_trip_stop_times_hash THEN 1 END) AS stop_times_changed,
-- A trip's stop location is being changed
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ daily_earliest_service_expiration AS (
feed_key,
MIN(service_end_date) AS earliest_service_end_date
FROM feed_service_expiration
WHERE service_id IS NOT null
WHERE service_id IS NOT NULL
GROUP BY 1
),

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ feed_has_fares AS (
)) AS has_fares
FROM files
WHERE parse_success
AND feed_key IS NOT null
AND feed_key IS NOT NULL
GROUP BY feed_key
),

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ final_update_with_status AS (
join_with_flag AS (
SELECT
final_update_with_status._payments_key,
COALESCE(final_update_without_status.aggregation_id IS NOT NULL, False) AS final_authorisation_has_null_status
COALESCE(final_update_without_status.aggregation_id IS NOT NULL, FALSE) AS final_authorisation_has_null_status
FROM final_update_with_status
LEFT OUTER JOIN final_update_without_status
ON final_update_with_status.aggregation_id = final_update_without_status.aggregation_id
Expand Down
2 changes: 1 addition & 1 deletion warehouse/tests/mart/gtfs/validate_fct_daily_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ check_cts AS (
FROM trip_check_cts
LEFT JOIN stop_check_cts
ON trip_check_cts.service_date = stop_check_cts.service_date
ORDER BY service_date DESC
ORDER BY service_date DESC --noqa: AM06
)

SELECT *
Expand Down

0 comments on commit 7c546ca

Please sign in to comment.