
Upgrade report data pipelines #30

Merged 81 commits on Dec 9, 2024
48a3b35
demo report
max-ostapenko Nov 16, 2024
f752d9f
fix local package
max-ostapenko Nov 16, 2024
172e120
crawl reports tag triggered
max-ostapenko Nov 16, 2024
39ae950
Merge branch 'reports' of https://github.com/HTTPArchive/dataform int…
max-ostapenko Nov 17, 2024
eb76476
timeseries added
max-ostapenko Nov 17, 2024
4a2d145
split tables
max-ostapenko Nov 17, 2024
a8e2137
lint
max-ostapenko Nov 17, 2024
4250677
tech report tables
max-ostapenko Nov 20, 2024
607c6b2
check tech report sql
max-ostapenko Nov 21, 2024
aba1af4
Merge branch 'main' into main
max-ostapenko Nov 21, 2024
791c0e8
Merge branch 'reports' into reports
max-ostapenko Nov 21, 2024
3fff267
missing declaration
max-ostapenko Nov 21, 2024
3be0274
formatting
max-ostapenko Nov 21, 2024
4c361a9
Merge branch 'reports' into reports
max-ostapenko Nov 21, 2024
b804540
preOps
max-ostapenko Nov 21, 2024
cc04bb6
dataset change
max-ostapenko Nov 23, 2024
9c8e567
cwv_tech_report tested
max-ostapenko Nov 23, 2024
45f1095
Merge branch 'main' into main
max-ostapenko Nov 23, 2024
e38d4f0
Merge branch 'reports' into reports
max-ostapenko Nov 23, 2024
1185758
tech_reports moved
max-ostapenko Nov 25, 2024
9591bf0
exporter function draft
max-ostapenko Nov 25, 2024
4acdf05
fix depependencies
max-ostapenko Nov 25, 2024
9a1f13e
rename
max-ostapenko Nov 25, 2024
b2cd6b4
dataset renamed
max-ostapenko Nov 25, 2024
02d1db7
storage exp draft
max-ostapenko Nov 25, 2024
e090ebc
date column for histograms
max-ostapenko Nov 25, 2024
256fd88
dev flag
max-ostapenko Nov 25, 2024
c3f75d2
gsc export tested
max-ostapenko Nov 25, 2024
6ae6e72
pubsub sink prepared
max-ostapenko Nov 26, 2024
537aa60
export fn deployed
max-ostapenko Nov 26, 2024
b5b625b
order incompatible with partitions
max-ostapenko Nov 26, 2024
486ec2e
monitoring
max-ostapenko Nov 26, 2024
3fbf2bb
lint
max-ostapenko Nov 26, 2024
c34c57a
event parsing draft
max-ostapenko Nov 26, 2024
1db9ff6
cleanup before inserts
max-ostapenko Nov 26, 2024
f8bc51a
event parsing
max-ostapenko Nov 26, 2024
08d9fa6
partitioned exports
max-ostapenko Nov 26, 2024
1a2188d
exclude scripts
max-ostapenko Nov 26, 2024
0e11edf
firestore export draft
max-ostapenko Nov 27, 2024
65d310d
Merge branch 'main' into reports
max-ostapenko Nov 27, 2024
d46b68e
optional description
max-ostapenko Nov 28, 2024
8d316ce
single dataset
max-ostapenko Dec 2, 2024
941e157
move
max-ostapenko Dec 2, 2024
4b34849
incremental operations
max-ostapenko Dec 2, 2024
dd38945
docs update
max-ostapenko Dec 2, 2024
d64a316
firestore dict tested
max-ostapenko Dec 3, 2024
73c3100
reports tested
max-ostapenko Dec 3, 2024
d786036
full sql export
max-ostapenko Dec 3, 2024
3d6657d
trigger params
max-ostapenko Dec 3, 2024
b4dd900
Merge branch 'reports' into reports
max-ostapenko Dec 3, 2024
76b3d5f
hashed doc ids
max-ostapenko Dec 3, 2024
0819a7b
more resources and timeout
max-ostapenko Dec 3, 2024
eee1311
extend timeout
max-ostapenko Dec 3, 2024
9b317ca
gzip
max-ostapenko Dec 5, 2024
33df626
event example
max-ostapenko Dec 5, 2024
58b6c3c
esm
max-ostapenko Dec 6, 2024
da4718c
more parallelization improvements
max-ostapenko Dec 6, 2024
8e042a3
Merge branch 'main' into main
max-ostapenko Dec 6, 2024
1a1caa6
Merge branch 'reports' into reports
max-ostapenko Dec 6, 2024
7baa4df
tested batch reports
max-ostapenko Dec 8, 2024
687750a
Merge branch 'reports' into reports
max-ostapenko Dec 8, 2024
d61666b
testing fast deletion
max-ostapenko Dec 8, 2024
ab581df
deletion tested
max-ostapenko Dec 8, 2024
2f5bed6
limit concurrency
max-ostapenko Dec 8, 2024
a78999b
retries
max-ostapenko Dec 8, 2024
85a5690
wait to resolve
max-ostapenko Dec 8, 2024
2d116dd
tested deployed version
max-ostapenko Dec 9, 2024
9fd868a
cleanup for test merge
max-ostapenko Dec 9, 2024
e859d29
cwv-tech-report to prod db
max-ostapenko Dec 9, 2024
0c81fb2
note to unwrap pubsub payloads
max-ostapenko Dec 9, 2024
dbe38a1
cleanup
max-ostapenko Dec 9, 2024
dc5732e
lint
max-ostapenko Dec 9, 2024
ae875d9
Merge branch 'main' into reports
max-ostapenko Dec 9, 2024
a4eba5a
Merge branch 'main' into reports
max-ostapenko Dec 9, 2024
963ebfa
revisited template builder
max-ostapenko Dec 9, 2024
91822e1
cleanup
max-ostapenko Dec 9, 2024
e0de181
tf 6.13
max-ostapenko Dec 9, 2024
87909ef
lint
max-ostapenko Dec 9, 2024
24a9bac
renamed
max-ostapenko Dec 9, 2024
ade5867
aligned timeout with prod
max-ostapenko Dec 9, 2024
f2b56f0
simplify tags
max-ostapenko Dec 9, 2024
1 change: 0 additions & 1 deletion .github/workflows/linter.yaml
@@ -33,4 +33,3 @@ jobs:
VALIDATE_JSCPD: false
VALIDATE_JAVASCRIPT_PRETTIER: false
VALIDATE_MARKDOWN_PRETTIER: false
VALIDATE_GITHUB_ACTIONS: false
1 change: 1 addition & 0 deletions .gitignore
@@ -3,4 +3,5 @@ node_modules/

# Terraform
infra/tf/.terraform/
infra/tf/tmp/
**/*.zip
11 changes: 2 additions & 9 deletions Makefile
@@ -1,14 +1,7 @@
FN_NAME = dataform-trigger

.PHONY: *

start:
npx functions-framework --target=$(FN_NAME) --source=./infra/dataform-trigger/ --signature-type=http --port=8080 --debug

tf_plan:
terraform -chdir=infra/tf init -upgrade && terraform -chdir=infra/tf plan \
-var="FUNCTION_NAME=$(FN_NAME)"
terraform -chdir=infra/tf init -upgrade && terraform -chdir=infra/tf plan

tf_apply:
terraform -chdir=infra/tf init && terraform -chdir=infra/tf apply -auto-approve \
-var="FUNCTION_NAME=$(FN_NAME)"
terraform -chdir=infra/tf init && terraform -chdir=infra/tf apply -auto-approve
42 changes: 7 additions & 35 deletions README.md
@@ -16,7 +16,7 @@ Tag: `crawl_complete`

### Core Web Vitals Technology Report

Tag: `cwv_tech_report`
Tag: `crux_ready`

- httparchive.core_web_vitals.technologies

@@ -26,7 +26,7 @@ Consumers:

### Blink Features Report

Tag: `blink_features_report`
Tag: `crawl_complete`

- httparchive.blink_features.features
- httparchive.blink_features.usage
@@ -35,30 +35,15 @@ Consumers:

- chromestatus.com - [example](https://chromestatus.com/metrics/feature/timeline/popularity/2089)

### Legacy crawl results (to be deprecated)

Tag: `crawl_results_legacy`

- httparchive.all.pages
- httparchive.all.parsed_css
- httparchive.all.requests
- httparchive.lighthouse.YYYY_MM_DD_client
- httparchive.pages.YYYY_MM_DD_client
- httparchive.requests.YYYY_MM_DD_client
- httparchive.response_bodies.YYYY_MM_DD_client
- httparchive.summary_pages.YYYY_MM_DD_client
- httparchive.summary_requests.YYYY_MM_DD_client
- httparchive.technologies.YYYY_MM_DD_client

## Schedules

1. [crawl-complete](https://console.cloud.google.com/cloudpubsub/subscription/detail/dataformTrigger?authuser=7&project=httparchive) PubSub subscription

Tags: ["crawl_complete", "blink_features_report", "crawl_results_legacy"]
Tags: ["crawl_complete"]

2. [bq-poller-cwv-tech-report](https://console.cloud.google.com/cloudscheduler/jobs/edit/us-east4/bq-poller-cwv-tech-report?authuser=7&project=httparchive) Scheduler

Tags: ["cwv_tech_report"]
Tags: ["crux_ready"]

### Triggering workflows

@@ -72,20 +57,7 @@ In order to unify the workflow triggering mechanism, we use [a Cloud Run function
2. Make adjustments to the dataform configuration files and manually run a workflow to verify.
3. Push all your changes to a dev branch & open a PR with the link to the BigQuery artifacts generated in the test workflow.
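The trigger flow described above can be sketched in plain Node. This is a hypothetical illustration, not the actual code in `infra/dataform-trigger/`: the trigger names and tag mapping mirror the Schedules section, while the `parseEvent`/`tagsFor` helper names are made up for this example. PubSub push payloads wrap the event base64-encoded under `message.data` and must be unwrapped before use (see the "note to unwrap pubsub payloads" commit).

```javascript
// Trigger-name → Dataform tags, mirroring the Schedules section above.
const TRIGGERS = {
  'crawl-complete': ['crawl_complete'],
  'bq-poller-cwv-tech-report': ['crux_ready']
}

// Unwrap a PubSub push body into a plain event object.
// Direct (non-PubSub) invocations pass the event through unchanged.
function parseEvent (body) {
  if (body.message && body.message.data) {
    return JSON.parse(Buffer.from(body.message.data, 'base64').toString())
  }
  return body
}

// Resolve the Dataform tags to run for an incoming event, or null if unknown.
function tagsFor (body) {
  const event = parseEvent(body)
  return TRIGGERS[event.name] || null
}

module.exports = { parseEvent, tagsFor }
```

A real handler would pass the resolved tags to a Dataform workflow invocation; only the unwrap-then-dispatch shape is shown here.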

### Dataform development workspace hints

1. In workflow settings vars:

- set `env_name: dev` to process sampled data in dev workspace.
- change `today` variable to a month in the past. May be helpful for testing pipelines based on `chrome-ux-report` data.

2. `definitions/extra/test_env.sqlx` script helps to setup the tables required to run pipelines when in dev workspace. It's disabled by default.

### Error Monitoring

The issues within the pipeline are being tracked using the following alerts:

1. the event trigger processing fails - [Dataform Trigger Function Error](https://console.cloud.google.com/monitoring/alerting/policies/570799173843203905?authuser=7&project=httparchive)
2. a job in the workflow fails - [Dataform Workflow Invocation Failed](https://console.cloud.google.com/monitoring/alerting/policies/16526940745374967367?authuser=7&project=httparchive)
#### Workspace hints

Error notifications are sent to [#10x-infra](https://httparchive.slack.com/archives/C030V4WAVL3) Slack channel.
1. In `workflow_settings.yaml` set `env_name: dev` to process sampled data.
2. In `includes/constants.js` set `today` or other variables to a custom value.
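A minimal sketch of what `includes/constants.js` might look like, based on the names referenced in the diffs (`constants.currentMonth`, `constants.fnPastMonth`, `constants.devRankFilter`); the `env_name` handling and the exact sampling filter are assumptions, not the real file.

```javascript
// includes/constants.js — hypothetical sketch; the real file may differ.
const envName = 'dev' // 'dev' processes sampled data, 'prod' the full crawl

// Current crawl month, first day of the month (YYYY-MM-01).
const currentMonth = '2024-12-01'

// Returns the first day of the month preceding the given YYYY-MM-DD date.
function fnPastMonth (month) {
  const [year, m] = month.split('-').map(Number)
  const past = new Date(Date.UTC(year, m - 2, 1)) // m is 1-based, so -2 steps back one month
  return past.toISOString().slice(0, 10)
}

// In dev, restrict queries to a small sample of high-ranked origins
// (the actual filter used by the pipelines may differ).
const devRankFilter = envName === 'dev' ? 'AND rank <= 1000' : ''

module.exports = { envName, currentMonth, fnPastMonth, devRankFilter }
```

Overriding `currentMonth` here (the `today`-style variable above) lets you replay a pipeline for a past month, e.g. when testing against older `chrome-ux-report` data.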
Expand Up @@ -5,3 +5,8 @@ for (const table of stagingTables) {
name: table
})
}

declare({
schema: 'wappalyzer',
name: 'apps'
})
2 changes: 1 addition & 1 deletion definitions/output/blink_features/features.js
@@ -6,7 +6,7 @@ publish('features', {
partitionBy: 'yyyymmdd',
clusterBy: ['client', 'rank']
},
tags: ['blink_features_report']
tags: ['crawl_complete']
}).preOps(ctx => `
DELETE FROM ${ctx.self()}
WHERE yyyymmdd = DATE '${constants.currentMonth}';
2 changes: 1 addition & 1 deletion definitions/output/blink_features/usage.js
@@ -2,7 +2,7 @@ publish('usage', {
schema: 'blink_features',
type: 'incremental',
protected: true,
tags: ['blink_features_report']
tags: ['crawl_complete']
}).preOps(ctx => `
DELETE FROM ${ctx.self()}
WHERE yyyymmdd = REPLACE('${constants.currentMonth}', '-', '');
65 changes: 28 additions & 37 deletions definitions/output/core_web_vitals/technologies.js
@@ -9,17 +9,25 @@ publish('technologies', {
clusterBy: ['geo', 'app', 'rank', 'client'],
requirePartitionFilter: true
},
tags: ['cwv_tech_report'],
tags: ['crux_ready'],
dependOnDependencyAssertions: true
}).preOps(ctx => `
DELETE FROM ${ctx.self()}
WHERE date = '${pastMonth}';

CREATE TEMP FUNCTION IS_GOOD(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
CREATE TEMP FUNCTION IS_GOOD(
good FLOAT64,
needs_improvement FLOAT64,
poor FLOAT64
) RETURNS BOOL AS (
SAFE_DIVIDE(good, good + needs_improvement + poor) >= 0.75
);

CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
CREATE TEMP FUNCTION IS_NON_ZERO(
good FLOAT64,
needs_improvement FLOAT64,
poor FLOAT64
) RETURNS BOOL AS (
good + needs_improvement + poor > 0
);
`).query(ctx => `
@@ -28,17 +36,15 @@ WITH geo_summary AS (
CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), r'(\\d{4})(\\d{2})', r'\\1-\\2-01') AS DATE) AS date,
* EXCEPT (country_code),
\`chrome-ux-report\`.experimental.GET_COUNTRY(country_code) AS geo
FROM
${ctx.ref('chrome-ux-report', 'materialized', 'country_summary')}
FROM ${ctx.ref('chrome-ux-report', 'materialized', 'country_summary')}
WHERE
yyyymm = CAST(FORMAT_DATE('%Y%m', '${pastMonth}') AS INT64) AND
device IN ('desktop', 'phone')
UNION ALL
SELECT
* EXCEPT (yyyymmdd, p75_fid_origin, p75_cls_origin, p75_lcp_origin, p75_inp_origin),
'ALL' AS geo
FROM
${ctx.ref('chrome-ux-report', 'materialized', 'device_summary')}
FROM ${ctx.ref('chrome-ux-report', 'materialized', 'device_summary')}
WHERE
date = '${pastMonth}' AND
device IN ('desktop', 'phone')
@@ -81,20 +87,17 @@ crux AS (
IS_GOOD(fast_ttfb, avg_ttfb, slow_ttfb) AS good_ttfb,
IS_NON_ZERO(fast_inp, avg_inp, slow_inp) AS any_inp,
IS_GOOD(fast_inp, avg_inp, slow_inp) AS good_inp
FROM
geo_summary,
FROM geo_summary,
UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS _rank
WHERE
rank <= _rank
WHERE rank <= _rank
),

technologies AS (
SELECT
technology.technology AS app,
client,
page AS url
FROM
${ctx.ref('crawl', 'pages')},
FROM ${ctx.ref('crawl', 'pages')},
UNNEST(technologies) AS technology
WHERE
date = '${pastMonth}'
@@ -106,8 +109,7 @@ UNION ALL
'ALL' AS app,
client,
page AS url
FROM
${ctx.ref('crawl', 'pages')}
FROM ${ctx.ref('crawl', 'pages')}
WHERE
date = '${pastMonth}'
${constants.devRankFilter}
@@ -117,21 +119,18 @@ categories AS (
SELECT
technology.technology AS app,
ARRAY_TO_STRING(ARRAY_AGG(DISTINCT category IGNORE NULLS ORDER BY category), ', ') AS category
FROM
${ctx.ref('crawl', 'pages')},
FROM ${ctx.ref('crawl', 'pages')},
UNNEST(technologies) AS technology,
UNNEST(technology.categories) AS category
WHERE
date = '${pastMonth}'
${constants.devRankFilter}
GROUP BY
app
GROUP BY app
UNION ALL
SELECT
'ALL' AS app,
ARRAY_TO_STRING(ARRAY_AGG(DISTINCT category IGNORE NULLS ORDER BY category), ', ') AS category
FROM
${ctx.ref('crawl', 'pages')},
FROM ${ctx.ref('crawl', 'pages')},
UNNEST(technologies) AS technology,
UNNEST(technology.categories) AS category
WHERE
@@ -153,8 +152,7 @@ summary_stats AS (
SAFE.FLOAT64(lighthouse.categories.performance.score) AS performance,
SAFE.FLOAT64(lighthouse.categories.pwa.score) AS pwa,
SAFE.FLOAT64(lighthouse.categories.seo.score) AS seo
FROM
${ctx.ref('crawl', 'pages')}
FROM ${ctx.ref('crawl', 'pages')}
WHERE
date = '${pastMonth}'
${constants.devRankFilter}
Expand All @@ -174,16 +172,11 @@ lab_data AS (
AVG(performance) AS performance,
AVG(pwa) AS pwa,
AVG(seo) AS seo
FROM
summary_stats
JOIN
technologies
USING
(client, url)
JOIN
categories
USING
(app)
FROM summary_stats
JOIN technologies
USING (client, url)
JOIN categories
USING (app)
GROUP BY
client,
root_page_url,
@@ -232,10 +225,8 @@ SELECT
SAFE_CAST(APPROX_QUANTILES(bytesJS, 1000)[OFFSET(500)] AS INT64) AS median_bytes_js,
SAFE_CAST(APPROX_QUANTILES(bytesImg, 1000)[OFFSET(500)] AS INT64) AS median_bytes_image

FROM
lab_data
JOIN
crux
FROM lab_data
JOIN crux
USING
(client, root_page_url)
GROUP BY
49 changes: 49 additions & 0 deletions definitions/output/reports/cwv_tech_adoption.js
@@ -0,0 +1,49 @@
const pastMonth = constants.fnPastMonth(constants.currentMonth)

publish('cwv_tech_adoption', {
schema: 'reports',
type: 'incremental',
protected: true,
bigquery: {
partitionBy: 'date',
clusterBy: ['rank', 'geo']
},
tags: ['crux_ready']
}).preOps(ctx => `
CREATE TEMPORARY FUNCTION GET_ADOPTION(
records ARRAY<STRUCT<
client STRING,
origins INT64
>>)
RETURNS STRUCT<
desktop INT64,
mobile INT64
>
LANGUAGE js AS '''
return Object.fromEntries(
records.map(({client, origins}) => {
return [client, origins]
}))
''';

DELETE FROM ${ctx.self()}
WHERE date = '${pastMonth}';
`).query(ctx => `
/* {"dataform_trigger": "report_cwv_tech_complete", "date": "${pastMonth}", "name": "adoption", "type": "report"} */
SELECT
date,
app AS technology,
rank,
geo,
GET_ADOPTION(ARRAY_AGG(STRUCT(
client,
origins
))) AS adoption
FROM ${ctx.ref('core_web_vitals', 'technologies')}
WHERE date = '${pastMonth}'
GROUP BY
date,
app,
rank,
geo
`)
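The `GET_ADOPTION` JavaScript UDF above pivots per-client rows (`desktop`, `mobile`) into a single struct keyed by client. The same logic in plain Node, for illustration only — BigQuery runs the UDF body in its own JS sandbox, so this standalone function is merely a mirror of it:

```javascript
// Mirror of the GET_ADOPTION UDF body: turn [{client, origins}, ...]
// into an object keyed by client name.
function getAdoption (records) {
  return Object.fromEntries(
    records.map(({ client, origins }) => [client, origins])
  )
}

module.exports = { getAdoption }
```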
51 changes: 51 additions & 0 deletions definitions/output/reports/cwv_tech_categories.js
@@ -0,0 +1,51 @@
const pastMonth = constants.fnPastMonth(constants.currentMonth)

publish('cwv_tech_categories', {
schema: 'reports',
type: 'table',
tags: ['crux_ready']
}).query(ctx => `
/* {"dataform_trigger": "report_cwv_tech_complete", "name": "categories", "type": "dict"} */
WITH pages AS (
SELECT
root_page,
technologies
FROM ${ctx.ref('crawl', 'pages')}
WHERE
date = '${pastMonth}' AND
client = 'mobile'
${constants.devRankFilter}
),
categories AS (
SELECT
category,
COUNT(DISTINCT root_page) AS origins
FROM pages,
UNNEST(technologies) AS t,
UNNEST(t.categories) AS category
GROUP BY category
),
technologies AS (
SELECT
category,
technology,
COUNT(DISTINCT root_page) AS origins
FROM pages,
UNNEST(technologies) AS t,
UNNEST(t.categories) AS category
GROUP BY
category,
technology
)

SELECT
category,
categories.origins,
ARRAY_AGG(technology ORDER BY technologies.origins DESC) AS technologies
FROM categories
JOIN technologies
USING (category)
GROUP BY
category,
categories.origins
ORDER BY categories.origins DESC
`)