From 65f098cc029cdb50ff162382b584d6799d062d7f Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Tue, 8 Oct 2024 20:38:49 +0200 Subject: [PATCH] Blink Features report (#11) * blink features report * tested * no reference * formatted date * Update README.md Co-authored-by: Barry Pollard --------- Co-authored-by: Barry Pollard --- README.md | 50 ++++++++------ definitions/extra/test_env.js | 37 +++++++--- definitions/output/all/pages.js | 26 +++++-- definitions/output/all/parsed_css.js | 12 ++-- definitions/output/blink_features/features.js | 69 +++++++++++++++++++ definitions/output/blink_features/usage.js | 55 +++++++++++++++ .../output/core_web_vitals/technologies.js | 15 ++-- definitions/output/technologies.js | 3 +- includes/constants.js | 2 +- src/index.js | 2 +- 10 files changed, 224 insertions(+), 47 deletions(-) create mode 100644 definitions/output/blink_features/features.js create mode 100644 definitions/output/blink_features/usage.js diff --git a/README.md b/README.md index e2a735e..0cd64e7 100644 --- a/README.md +++ b/README.md @@ -6,46 +6,56 @@ This repo handles the HTTP Archive data pipeline, which takes the results of the The pipelines are run in Dataform service in Google Cloud Platform (GCP) and are kicked off automatically on crawl completion and other events. The code in the `main` branch is used on each triggered pipeline run. 
-### Crawl tables in `all` dataset +### Crawl results Tag: `crawl_results_all` -- [x] httparchive.all.pages -- [x] httparchive.all.parsed_css -- [x] httparchive.all.requests +- httparchive.all.pages +- httparchive.all.parsed_css +- httparchive.all.requests ### Core Web Vitals Technology Report Tag: `cwv_tech_report` -- [x] httparchive.core_web_vitals.technologies +- httparchive.core_web_vitals.technologies -### Legacy crawl tables (to be deprecated) +Consumers: + +- [HTTP Archive Tech Report](https://httparchive.org/reports/techreport/landing) + +### Blink Features Report + +Tag: `blink_features_report` + +- httparchive.blink_features.features +- httparchive.blink_features.usage + +Consumers: + +- chromestatus.com - [example](https://chromestatus.com/metrics/feature/timeline/popularity/2089) + +### Legacy crawl results (to be deprecated) Tag: `crawl_results_legacy` -- [x] httparchive.lighthouse.YYYY_MM_DD_client -- [x] httparchive.pages.YYYY_MM_DD_client -- [x] httparchive.requests.YYYY_MM_DD_client -- [x] httparchive.response_bodies.YYYY_MM_DD_client -- [x] httparchive.summary_pages.YYYY_MM_DD_client -- [x] httparchive.summary_requests.YYYY_MM_DD_client -- [x] httparchive.technologies.YYYY_MM_DD_client +- httparchive.lighthouse.YYYY_MM_DD_client +- httparchive.pages.YYYY_MM_DD_client +- httparchive.requests.YYYY_MM_DD_client +- httparchive.response_bodies.YYYY_MM_DD_client +- httparchive.summary_pages.YYYY_MM_DD_client +- httparchive.summary_requests.YYYY_MM_DD_client +- httparchive.technologies.YYYY_MM_DD_client ## Schedules 1. [crawl-complete](https://console.cloud.google.com/cloudpubsub/subscription/detail/dataformTrigger?authuser=7&project=httparchive) PubSub subscription - Tags: - - - crawl_results_all - - crawl_results_legacy + Tags: ["crawl_results_all", "blink_features_report", "crawl_results_legacy"] 2. 
[bq-poller-cwv-tech-report](https://console.cloud.google.com/cloudscheduler/jobs/edit/us-east4/bq-poller-cwv-tech-report?authuser=7&project=httparchive) Scheduler - Tags: - - - cwv_tech_report + Tags: ["cwv_tech_report"] ### Triggering workflows diff --git a/definitions/extra/test_env.js b/definitions/extra/test_env.js index 3df7259..e9bf47c 100644 --- a/definitions/extra/test_env.js +++ b/definitions/extra/test_env.js @@ -1,4 +1,4 @@ -const date = constants.fn_past_month(constants.current_month); +const date = constants.current_month; var resources_list = [ { datasetId: "all", tableId: "pages" }, @@ -11,17 +11,38 @@ resources_list.forEach(resource => { operate( `test_table ${resource.datasetId}_${resource.tableId}`, { hasOutput: true } - ).queries(ctx => ` + ).queries(` CREATE SCHEMA IF NOT EXISTS ${resource.datasetId}_dev; -DROP TABLE ${resource.datasetId}_dev.dev_${resource.tableId}; +DROP TABLE IF EXISTS ${resource.datasetId}_dev.dev_${resource.tableId}; -CREATE TABLE ${resource.datasetId}_dev.dev_${resource.tableId} -LIKE httparchive.${resource.datasetId}.${resource.tableId}; - -INSERT INTO ${resource.datasetId}_dev.dev_${resource.tableId} +CREATE TABLE IF NOT EXISTS ${resource.datasetId}_dev.dev_${resource.tableId} AS SELECT * -FROM httparchive.${resource.datasetId}.${resource.tableId} ${constants.dev_TABLESAMPLE} +FROM \`${resource.datasetId}.${resource.tableId}\` ${constants.dev_TABLESAMPLE} WHERE date = '${date}' `); }) + +operate("test_table blink_features_dev_dev_usage", { + hasOutput: true, +}).queries(` +CREATE SCHEMA IF NOT EXISTS blink_features_dev; + +CREATE TABLE IF NOT EXISTS blink_features_dev.dev_usage AS +SELECT * +FROM blink_features.usage ${constants.dev_TABLESAMPLE} +WHERE yyyymmdd = '${date}'; +`) + +operate("test_table blink_features_dev_dev_features", { + hasOutput: true, +}).queries(` +CREATE SCHEMA IF NOT EXISTS blink_features_dev; + +DROP TABLE IF EXISTS blink_features_dev.dev_features; + +CREATE TABLE IF NOT EXISTS 
blink_features_dev.dev_features AS +SELECT * +FROM blink_features.features ${constants.dev_TABLESAMPLE} +WHERE yyyymmdd = DATE '${date}'; +`) \ No newline at end of file diff --git a/definitions/output/all/pages.js b/definitions/output/all/pages.js index 3c71fea..3545ca9 100644 --- a/definitions/output/all/pages.js +++ b/definitions/output/all/pages.js @@ -13,21 +13,33 @@ DELETE FROM ${ctx.self()} WHERE date = '${constants.current_month}'; `).query(ctx => ` SELECT * -FROM ${ctx.ref("crawl_staging", "pages")} ${constants.dev_TABLESAMPLE} -WHERE date = '${constants.current_month}' AND client = 'desktop' AND is_root_page = TRUE +FROM ${ctx.ref("crawl_staging", "pages")} +WHERE date = '${constants.current_month}' + AND client = 'desktop' + AND is_root_page = TRUE + ${constants.dev_rank_filter} `).postOps(ctx => ` INSERT INTO ${ctx.self()} SELECT * -FROM ${ctx.ref("crawl_staging", "pages")} ${constants.dev_TABLESAMPLE} -WHERE date = '${constants.current_month}' AND client = 'desktop' AND is_root_page = FALSE; +FROM ${ctx.ref("crawl_staging", "pages")} +WHERE date = '${constants.current_month}' + AND client = 'desktop' + AND is_root_page = FALSE + ${constants.dev_rank_filter}; INSERT INTO ${ctx.self()} SELECT * FROM ${ctx.ref("crawl_staging", "pages")} ${constants.dev_TABLESAMPLE} -WHERE date = '${constants.current_month}' AND client = 'mobile' AND is_root_page = TRUE; +WHERE date = '${constants.current_month}' + AND client = 'mobile' + AND is_root_page = TRUE + ${constants.dev_rank_filter}; INSERT INTO ${ctx.self()} SELECT * -FROM ${ctx.ref("crawl_staging", "pages")} ${constants.dev_TABLESAMPLE} -WHERE date = '${constants.current_month}' AND client = 'mobile' AND is_root_page = FALSE +FROM ${ctx.ref("crawl_staging", "pages")} +WHERE date = '${constants.current_month}' + AND client = 'mobile' + AND is_root_page = FALSE + ${constants.dev_rank_filter}; `) diff --git a/definitions/output/all/parsed_css.js b/definitions/output/all/parsed_css.js index 55a686b..08e098b 100644 
--- a/definitions/output/all/parsed_css.js +++ b/definitions/output/all/parsed_css.js @@ -13,11 +13,15 @@ DELETE FROM ${ctx.self()} WHERE date = '${constants.current_month}'; `).query(ctx => ` SELECT * -FROM ${ctx.ref("crawl_staging", "parsed_css")} ${constants.dev_TABLESAMPLE} -WHERE date = '${constants.current_month}' AND client = 'desktop' +FROM ${ctx.ref("crawl_staging", "parsed_css")} +WHERE date = '${constants.current_month}' + AND client = 'desktop' + ${constants.dev_rank_filter} `).postOps(ctx => ` INSERT INTO ${ctx.self()} SELECT * -FROM ${ctx.ref("crawl_staging", "parsed_css")} ${constants.dev_TABLESAMPLE} -WHERE date = '${constants.current_month}' AND client = 'mobile' +FROM ${ctx.ref("crawl_staging", "parsed_css")} +WHERE date = '${constants.current_month}' + AND client = 'mobile' + ${constants.dev_rank_filter}; `) diff --git a/definitions/output/blink_features/features.js b/definitions/output/blink_features/features.js new file mode 100644 index 0000000..010c65b --- /dev/null +++ b/definitions/output/blink_features/features.js @@ -0,0 +1,69 @@ + +publish("features", { + schema: "blink_features", + type: "incremental", + protected: true, + bigquery: { + partitionBy: "yyyymmdd", + clusterBy: ["client", "rank"] + }, + tags: ["blink_features_report"] +}).preOps(ctx => ` +DELETE FROM ${ctx.self()} +WHERE yyyymmdd = DATE '${constants.current_month}'; + +CREATE TEMP FUNCTION features(payload STRING) +RETURNS ARRAY<STRUCT<id STRING, name STRING, type STRING>> LANGUAGE js AS +""" +function getFeatureNames(featureMap, featureType) { + try { + return Object.entries(featureMap).map(([key, value]) => { + // After Feb 2020 keys are feature IDs. + if (value.name) { + return {'name': value.name, 'type': featureType, 'id': key}; + } + // Prior to Feb 2020 keys fell back to IDs if the name was unknown. + if (idPattern.test(key)) { + return {'name': '', 'type': featureType, 'id': key.match(idPattern)[1]}; + } + // Prior to Feb 2020 keys were names by default. 
+ return {'name': key, 'type': featureType, 'id': ''}; + }); + } catch (e) { + return []; + } +} + +var $ = JSON.parse(payload); +if (!$._blinkFeatureFirstUsed) return []; + +var idPattern = new RegExp('^Feature_(\d+)$'); +return getFeatureNames($._blinkFeatureFirstUsed.Features, 'default') + .concat(getFeatureNames($._blinkFeatureFirstUsed.CSSFeatures, 'css')) + .concat(getFeatureNames($._blinkFeatureFirstUsed.AnimatedCSSFeatures, 'animated-css')); +"""; +`).query(ctx => ` +SELECT + date AS yyyymmdd, + client, + url, + feature.feature AS feature, + feature.type, + feature.id, + rank +FROM ( + SELECT + date, + client, + page AS url, + payload, + rank, + feature + FROM ${ctx.ref("all", "pages")}, + UNNEST(features) AS feature + WHERE + date = '${constants.current_month}' AND + is_root_page = TRUE + ${constants.dev_rank_filter} +) +`) diff --git a/definitions/output/blink_features/usage.js b/definitions/output/blink_features/usage.js new file mode 100644 index 0000000..18fc995 --- /dev/null +++ b/definitions/output/blink_features/usage.js @@ -0,0 +1,55 @@ +publish("usage", { + schema: "blink_features", + type: "incremental", + protected: true, + tags: ["blink_features_report"] +}).preOps(ctx => ` +DELETE FROM ${ctx.self()} +WHERE yyyymmdd = REPLACE('${constants.current_month}', '-', ''); +`).query(ctx => ` +SELECT + REPLACE(CAST(date AS STRING), '-', '') AS yyyymmdd, + client, + id, + feature, + type, + num_urls, + total_urls, + num_urls / total_urls AS pct_urls, + sample_urls +FROM ( + SELECT + yyyymmdd AS date, + client, + id, + feature, + type, + COUNT(DISTINCT url) AS num_urls, + ARRAY_AGG(url ORDER BY rank, url LIMIT 100) AS sample_urls + FROM ${ctx.ref("blink_features", "features")} + WHERE + yyyymmdd = '${constants.current_month}' + ${constants.dev_rank_filter} + GROUP BY + yyyymmdd, + client, + id, + feature, + type +) +JOIN ( + SELECT + date, + client, + COUNT(DISTINCT page) AS total_urls + FROM ${ctx.ref("all", "pages")} + WHERE + date = 
'${constants.current_month}' AND + is_root_page = TRUE + ${constants.dev_rank_filter} + GROUP BY + date, + client +) +USING (date, client) +`) diff --git a/definitions/output/core_web_vitals/technologies.js b/definitions/output/core_web_vitals/technologies.js index cde4893..6a1d7cb 100644 --- a/definitions/output/core_web_vitals/technologies.js +++ b/definitions/output/core_web_vitals/technologies.js @@ -114,7 +114,8 @@ technologies AS ( ${ctx.resolve("all", "pages")}, UNNEST(technologies) AS technology WHERE - date = '${past_month}' ${constants.dev_rank_filter} AND + date = '${past_month}' + ${constants.dev_rank_filter} AND technology.technology IS NOT NULL AND technology.technology != '' UNION ALL @@ -125,7 +126,8 @@ UNION ALL FROM ${ctx.resolve("all", "pages")} WHERE - date = '${past_month}' ${constants.dev_rank_filter} + date = '${past_month}' + ${constants.dev_rank_filter} ), categories AS ( @@ -137,7 +139,8 @@ categories AS ( UNNEST(technologies) AS technology, UNNEST(technology.categories) AS category WHERE - date = '${past_month}' ${constants.dev_rank_filter} + date = '${past_month}' + ${constants.dev_rank_filter} GROUP BY app UNION ALL @@ -149,8 +152,9 @@ UNION ALL UNNEST(technologies) AS technology, UNNEST(technology.categories) AS category WHERE - date = '${past_month}' ${constants.dev_rank_filter} AND + date = '${past_month}' AND client = 'mobile' + ${constants.dev_rank_filter} ), summary_stats AS ( @@ -165,7 +169,8 @@ summary_stats AS ( FROM ${ctx.resolve("all", "pages")} WHERE - date = '${past_month}' ${constants.dev_rank_filter} + date = '${past_month}' + ${constants.dev_rank_filter} ), lab_data AS ( diff --git a/definitions/output/technologies.js b/definitions/output/technologies.js index 116e1be..10bd83c 100644 --- a/definitions/output/technologies.js +++ b/definitions/output/technologies.js @@ -18,7 +18,8 @@ UNNEST (tech.categories) AS category, UNNEST (tech.info) AS info WHERE date = '${constants.current_month}' AND client = '${client}' AND - 
is_root_page AND + is_root_page + ${constants.dev_rank_filter} AND tech.technology IS NOT NULL `); }) diff --git a/includes/constants.js b/includes/constants.js index cead541..dc1690c 100644 --- a/includes/constants.js +++ b/includes/constants.js @@ -14,7 +14,7 @@ const dev_rank_filter ] = dataform.projectConfig.vars.env_name == 'dev' ? [ "TABLESAMPLE SYSTEM (0.001 PERCENT)", - "AND rank = 5000", + "AND rank <= 10000", true ] : ["", ""]; diff --git a/src/index.js b/src/index.js index 23d3bd5..777ecee 100644 --- a/src/index.js +++ b/src/index.js @@ -34,7 +34,7 @@ FROM ( action: "runDataformRepo", actionArgs: { repoName: "crawl-data", - tags: ["crawl_results_all", "crawl_results_legacy"] + tags: ["crawl_results_all", "blink_features_report", "crawl_results_legacy"] } } };