Merge branch 'main' into infra-code

HTTPArchive · Oct 14, 2024 · d05ee3c · d05ee3c
2 parents e1b244c + 1372eae
commit d05ee3c
Show file tree

Hide file tree

Showing 34 changed files with 7,753 additions and 3,186 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/.editorconfig b/.editorconfig
@@ -8,5 +8,5 @@ charset = utf-8
 trim_trailing_whitespace = true
 insert_final_newline = true
 
-[*.{md,js,sql,yaml}]
+[*.{js,json,md,sql,yaml}]
 indent_size = 2
diff --git a/.github/workflows/linter.yaml b/.github/workflows/linter.yaml
@@ -0,0 +1,34 @@
+name: Linter
+
+on:
+  workflow_dispatch:
+  pull_request: null
+  push:
+    branches:
+      - main
+
+permissions:
+  contents: read
+  packages: read
+
+jobs:
+  build:
+    name: Lint
+    runs-on: ubuntu-latest
+
+    permissions:
+      statuses: write # To report GitHub Actions status checks
+
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Lint Code Base
+        uses: super-linter/super-linter/slim@v7.1.0
+        env:
+          DEFAULT_BRANCH: main
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          VALIDATE_JAVASCRIPT_PRETTIER: false
+          VALIDATE_MARKDOWN_PRETTIER: false
diff --git a/.gitignore b/.gitignore
@@ -3,7 +3,4 @@ node_modules/
 
 # Terraform
 tf/.terraform/
-tf/temp
-
-# Dataform
-.df-credentials.json
+tf/temp
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # HTTP Archive datasets pipeline
 
-This repo handles the HTTP Archive data pipeline, which takes the results of the monthly HTTP Archive run and saves this to the `httparchive` dataset in BigQuery.
+This repository handles the HTTP Archive data pipeline, which takes the results of the monthly HTTP Archive run and saves them to the `httparchive` dataset in BigQuery.
 
 ## Pipelines
 
@@ -72,8 +72,10 @@ In order to unify the workflow triggering mechanism, we use [a Cloud Run functio
 ### Dataform development workspace hints
 
 1. In workflow settings vars:
-   1. set `env_name: dev` to process sampled data in dev workspace.
-   2. change `today` variable to a month in the past. May be helpful for testing pipelines based on `chrome-ux-report` data.
+
+    - set `env_name: dev` to process sampled data in dev workspace.
+    - change `today` variable to a month in the past. May be helpful for testing pipelines based on `chrome-ux-report` data.
+
 2. The `definitions/extra/test_env.sqlx` script helps set up the tables required to run pipelines in the dev workspace. It's disabled by default.
 
 ### Error Monitoring

diff --git a/definitions/extra/test_env.js b/definitions/extra/test_env.js
@@ -1,18 +1,18 @@
-const date = constants.current_month;
-
-var resources_list = [{
-    datasetId: "all",
-    tableId: "pages"
-  },
-  {
-    datasetId: "all",
-    tableId: "requests"
-  },
-  //{datasetId: "all", tableId: "parsed_css"},
-  //{datasetId: "core_web_vitals", tableId: "technologies"},
-];
-
-resources_list.forEach(resource => {
+const date = constants.currentMonth
+
+const resourcesList = [{
+  datasetId: 'all',
+  tableId: 'pages'
+},
+{
+  datasetId: 'all',
+  tableId: 'requests'
+}
+  // {datasetId: 'all', tableId: 'parsed_css'},
+  // {datasetId: 'core_web_vitals', tableId: 'technologies'},
+]
+
+resourcesList.forEach(resource => {
   operate(
     `test_table ${resource.datasetId}_${resource.tableId}`, {
       hasOutput: true
@@ -24,31 +24,31 @@ DROP TABLE IF EXISTS ${resource.datasetId}_dev.dev_${resource.tableId};
 
 CREATE TABLE IF NOT EXISTS ${resource.datasetId}_dev.dev_${resource.tableId} AS
 SELECT *
-FROM \`${resource.datasetId}.${resource.tableId}\` ${constants.dev_TABLESAMPLE}
+FROM \`${resource.datasetId}.${resource.tableId}\` ${constants.devTABLESAMPLE}
 WHERE date = '${date}'
-  `);
+  `)
 })
 
-operate("test_table blink_features_dev_dev_usage", {
-  hasOutput: true,
+operate('test_table blink_features_dev_dev_usage', {
+  hasOutput: true
 }).queries(`
 CREATE SCHEMA IF NOT EXISTS blink_features_dev;
 
 CREATE TABLE IF NOT EXISTS blink_features_dev.dev_usage AS
 SELECT *
-FROM blink_features.usage ${constants.dev_TABLESAMPLE}
+FROM blink_features.usage ${constants.devTABLESAMPLE}
 WHERE yyyymmdd = '${date}';
 `)
 
-operate("test_table blink_features_dev_dev_features", {
-  hasOutput: true,
+operate('test_table blink_features_dev_dev_features', {
+  hasOutput: true
 }).queries(`
 CREATE SCHEMA IF NOT EXISTS blink_features_dev;
 
 DROP TABLE IF EXISTS blink_features_dev.dev_features;
 
 CREATE TABLE IF NOT EXISTS blink_features_dev.dev_features AS
 SELECT *
-FROM blink_features.features ${constants.dev_TABLESAMPLE}
+FROM blink_features.features ${constants.devTABLESAMPLE}
 WHERE yyyymmdd = DATE '${date}';
 `)
diff --git a/definitions/output/all/pages.js b/definitions/output/all/pages.js
@@ -1,45 +1,45 @@
-publish("pages", {
-  type: "incremental",
+publish('pages', {
+  type: 'incremental',
   protected: true,
-  schema: "all",
+  schema: 'all',
   bigquery: {
-    partitionBy: "date",
-    clusterBy: ["client", "is_root_page", "rank"],
+    partitionBy: 'date',
+    clusterBy: ['client', 'is_root_page', 'rank'],
     requirePartitionFilter: true
   },
-  tags: ["crawl_results_all"],
+  tags: ['crawl_results_all']
 }).preOps(ctx => `
 DELETE FROM ${ctx.self()}
-WHERE date = '${constants.current_month}';
+WHERE date = '${constants.currentMonth}';
 `).query(ctx => `
 SELECT *
-FROM ${ctx.ref("crawl_staging", "pages")}
-WHERE date = '${constants.current_month}'
+FROM ${ctx.ref('crawl_staging', 'pages')}
+WHERE date = '${constants.currentMonth}'
   AND client = 'desktop'
   AND is_root_page = TRUE
-  ${constants.dev_rank_filter}
+  ${constants.devRankFilter}
 `).postOps(ctx => `
 INSERT INTO ${ctx.self()}
 SELECT *
-FROM ${ctx.ref("crawl_staging", "pages")}
-WHERE date = '${constants.current_month}'
+FROM ${ctx.ref('crawl_staging', 'pages')}
+WHERE date = '${constants.currentMonth}'
   AND client = 'desktop'
   AND is_root_page = FALSE
-  ${constants.dev_rank_filter};
+  ${constants.devRankFilter};
 
 INSERT INTO ${ctx.self()}
 SELECT *
-FROM ${ctx.ref("crawl_staging", "pages")} ${constants.dev_TABLESAMPLE}
-WHERE date = '${constants.current_month}'
+FROM ${ctx.ref('crawl_staging', 'pages')} ${constants.devTABLESAMPLE}
+WHERE date = '${constants.currentMonth}'
   AND client = 'mobile'
   AND is_root_page = TRUE
-  ${constants.dev_rank_filter};
+  ${constants.devRankFilter};
 
 INSERT INTO ${ctx.self()}
 SELECT *
-FROM ${ctx.ref("crawl_staging", "pages")}
-WHERE date = '${constants.current_month}'
+FROM ${ctx.ref('crawl_staging', 'pages')}
+WHERE date = '${constants.currentMonth}'
   AND client = 'mobile'
   AND is_root_page = FALSE
-  ${constants.dev_rank_filter};
+  ${constants.devRankFilter};
 `)
diff --git a/definitions/output/all/parsed_css.js b/definitions/output/all/parsed_css.js
@@ -1,27 +1,27 @@
-publish("parsed_css", {
-  type: "incremental",
+publish('parsed_css', {
+  type: 'incremental',
   protected: true,
-  schema: "all",
+  schema: 'all',
   bigquery: {
-    partitionBy: "date",
-    clusterBy: ["client", "is_root_page", "rank", "page"],
+    partitionBy: 'date',
+    clusterBy: ['client', 'is_root_page', 'rank', 'page'],
     requirePartitionFilter: true
   },
-  tags: ["crawl_results_all"],
+  tags: ['crawl_results_all']
 }).preOps(ctx => `
 DELETE FROM ${ctx.self()}
-WHERE date = '${constants.current_month}';
+WHERE date = '${constants.currentMonth}';
 `).query(ctx => `
 SELECT *
-FROM ${ctx.ref("crawl_staging", "parsed_css")}
-WHERE date = '${constants.current_month}'
+FROM ${ctx.ref('crawl_staging', 'parsed_css')}
+WHERE date = '${constants.currentMonth}'
   AND client = 'desktop'
-  ${constants.dev_rank_filter}
+  ${constants.devRankFilter}
 `).postOps(ctx => `
 INSERT INTO ${ctx.self()}
 SELECT *
-FROM ${ctx.ref("crawl_staging", "parsed_css")}
-WHERE date = '${constants.current_month}'
+FROM ${ctx.ref('crawl_staging', 'parsed_css')}
+WHERE date = '${constants.currentMonth}'
   AND client = 'mobile'
-  ${constants.dev_rank_filter};
+  ${constants.devRankFilter};
 `)