From e36a6fba514bda04bc478454dad8847e25bf1271 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 30 Sep 2024 13:54:46 +0200 Subject: [PATCH 1/5] Bump @google-cloud/bigquery from 7.9.0 to 7.9.1 in /src (#13) Bumps [@google-cloud/bigquery](https://github.com/googleapis/nodejs-bigquery) from 7.9.0 to 7.9.1. - [Release notes](https://github.com/googleapis/nodejs-bigquery/releases) - [Changelog](https://github.com/googleapis/nodejs-bigquery/blob/main/CHANGELOG.md) - [Commits](https://github.com/googleapis/nodejs-bigquery/compare/v7.9.0...v7.9.1) --- updated-dependencies: - dependency-name: "@google-cloud/bigquery" dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- src/package-lock.json | 9 ++++----- src/package.json | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/package-lock.json b/src/package-lock.json index 525b592..a1f30be 100644 --- a/src/package-lock.json +++ b/src/package-lock.json @@ -5,7 +5,7 @@ "packages": { "": { "dependencies": { - "@google-cloud/bigquery": "7.9.0", + "@google-cloud/bigquery": "7.9.1", "@google-cloud/dataform": "1.3.0", "@google-cloud/functions-framework": "3.4.2" }, @@ -51,10 +51,9 @@ } }, "node_modules/@google-cloud/bigquery": { - "version": "7.9.0", - "resolved": "https://registry.npmjs.org/@google-cloud/bigquery/-/bigquery-7.9.0.tgz", - "integrity": "sha512-KJTimGLDlAR1IfZ4Y8xhIVfoZ+XBXd0GGuJttLSXxtR0g+4vNsUt0xS33PRVa5TXey97374yU+uWNlCb5bHwBw==", - "license": "Apache-2.0", + "version": "7.9.1", + "resolved": "https://registry.npmjs.org/@google-cloud/bigquery/-/bigquery-7.9.1.tgz", + "integrity": "sha512-ZkcRMpBoFLxIh6TiQBywA22yT3c2j0f07AHWEMjtYqMQzZQbFrpxuJU2COp3tyjZ91ZIGHe4gY7/dGZL88cltg==", "dependencies": { "@google-cloud/common": "^5.0.0", "@google-cloud/paginator": "^5.0.2", diff --git 
a/src/package.json b/src/package.json index b8998b5..6e6a713 100644 --- a/src/package.json +++ b/src/package.json @@ -9,7 +9,7 @@ }, "dependencies": { "@google-cloud/functions-framework": "3.4.2", - "@google-cloud/bigquery": "7.9.0", + "@google-cloud/bigquery": "7.9.1", "@google-cloud/dataform": "1.3.0" } } From 1a3703fcec42a1a912eb8588fcfce4b56cd55752 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 30 Sep 2024 21:45:35 +0200 Subject: [PATCH 2/5] Update README.md Error Monitoring --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index e56ba3d..17cb0ca 100644 --- a/README.md +++ b/README.md @@ -60,3 +60,11 @@ Tag: `crawl_results_legacy` 1. In workflow settings vars set `dev_name: dev` to process sampled data in dev workspace. 2. Change `current_month` variable to a month in the past. May be helpful for testing pipelines based on `chrome-ux-report` data. 3. `definitions/extra/test_env.sqlx` script helps to setup the tables required to run pipelines when in dev workspace. It's disabled by default. + +### Error Monitoring + +The issues within the pipeline are being tracked using the following alerts: +1. the event trigger processing fails - [Dataform Trigger Function Error](https://console.cloud.google.com/monitoring/alerting/policies/3950167380893746326?authuser=7&project=httparchive) +2. a job in the workflow fails - [Dataform Workflow Invocation Failed](https://console.cloud.google.com/monitoring/alerting/policies/7137542315653007241?authuser=7&project=httparchive) + +Error notifications are sent to the [#10x-infra](https://httparchive.slack.com/archives/C030V4WAVL3) Slack channel.
From 6640ffe145a4aee9bf060a434ea783f1c9c3d953 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 7 Oct 2024 13:04:48 +0200 Subject: [PATCH 3/5] Bump @dataform/core from 3.0.2 to 3.0.4 (#15) Bumps [@dataform/core](https://github.com/dataform-co/dataform) from 3.0.2 to 3.0.4. - [Release notes](https://github.com/dataform-co/dataform/releases) - [Commits](https://github.com/dataform-co/dataform/compare/3.0.2...3.0.4) --- updated-dependencies: - dependency-name: "@dataform/core" dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- package-lock.json | 14 +++++++------- package.json | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/package-lock.json b/package-lock.json index 3fc3b9d..159edf4 100644 --- a/package-lock.json +++ b/package-lock.json @@ -6,20 +6,20 @@ "": { "name": "crawl-data", "dependencies": { - "@dataform/core": "3.0.2" + "@dataform/core": "3.0.4" } }, "node_modules/@dataform/core": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/@dataform/core/-/core-3.0.2.tgz", - "integrity": "sha512-W1DQuv1vSsQgUPlNXuBTqa927FfNp5pNswOTz0wlX3cJTk5OT0UEENVmVJu/WIEdoVcCRTn8pn2dNNc1IjtMcA==" + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/@dataform/core/-/core-3.0.4.tgz", + "integrity": "sha512-qlCugswggeO5g0qAxn4U3Kt9dzebSiM8gXZmNM3NkVUXloJBwOpbagkQ9wDXymio1FnMC4qwQX7jniz1KzKHmg==" } }, "dependencies": { "@dataform/core": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/@dataform/core/-/core-3.0.2.tgz", - "integrity": "sha512-W1DQuv1vSsQgUPlNXuBTqa927FfNp5pNswOTz0wlX3cJTk5OT0UEENVmVJu/WIEdoVcCRTn8pn2dNNc1IjtMcA==" + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/@dataform/core/-/core-3.0.4.tgz", + "integrity": 
"sha512-qlCugswggeO5g0qAxn4U3Kt9dzebSiM8gXZmNM3NkVUXloJBwOpbagkQ9wDXymio1FnMC4qwQX7jniz1KzKHmg==" } } } diff --git a/package.json b/package.json index 5a97497..c98cb20 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "crawl-data", "dependencies": { - "@dataform/core": "3.0.2" + "@dataform/core": "3.0.4" } } From 94718f951288f59cbae801223dfc53a7112d500a Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 7 Oct 2024 22:17:53 +0200 Subject: [PATCH 4/5] Stable `all.requests` (#5) * pages and legacy lighthouse * fix * first test * pages insert * date var * js rewrite * dataset * Initial commit * init * core_web_vitals * clean graph, tested * publish core_web_vitals.technologies * Dev (#1) * workspace init * pages and legacy lighthouse * fix * first test * pages insert * date var * js rewrite * dataset * core_web_vitals * clean graph, tested * publish core_web_vitals.technologies * technologies partitioning * past month date for cwv * 8pm * package-lock.json * ignore full-refresh * readme * updated tags and example assert * dependency assertions * current month commented * assert fix * all tables publish * incremental tables * node script * enable legacy * missing package name * table configs * all.requests and all.parsed_css * dev sampling vars * sampling instead of rank * readme upd * dev hints * dev sampling for tech report * tech report workflow * removed sampling * dates flexibility * fix * formatting * other legacy tables * docs and dependencies * comment * Update definitions/output/pages.js Co-authored-by: Barry Pollard * Update definitions/output/technologies.js Co-authored-by: Barry Pollard * Update package.json Co-authored-by: Barry Pollard * Update workflow_settings.yaml Co-authored-by: Barry Pollard * format * not dependent on all.pages * migrated to function trigger * cloud function * readme update * deployed function * readme updates * readme update * init stable copies * requests 
ready * adjusted requests pipeline * use release configs in prod * readme update * tags update * dev sampling * prune summary * sorted * false when target exists * dev sampling * newline * trigger cleanup * formatting * forEach iteration * create table with operate * new test tables script * tested * merge * JSON columns * job per client * native object pruning * Update definitions/output/all/reprocess_requests.js Co-authored-by: Barry Pollard --------- Co-authored-by: Barry Pollard --- README.md | 6 +- definitions/extra/test_env.js | 44 ++++--- definitions/output/all/reprocess_requests.js | 120 ++++++++++++++++++ .../output/core_web_vitals/technologies.js | 28 ++-- definitions/sources/declares.js | 8 +- src/dataform.js | 12 -- 6 files changed, 169 insertions(+), 49 deletions(-) create mode 100644 definitions/output/all/reprocess_requests.js diff --git a/README.md b/README.md index 17cb0ca..b6f77fc 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,10 @@ # HTTP Archive BigQuery pipeline with Dataform -## Tables +This repo handles the HTTP Archive data pipeline, which takes the results of the monthly HTTP Archive run and saves this to the `httparchive` dataset in BigQuery. + +## Pipelines + +The pipelines are run in Dataform service in Google Cloud Platform (GCP) and are kicked off automatically on crawl completion and other events. The code in the `main` branch is used on each triggered pipeline run. 
### Crawl tables in `all` dataset diff --git a/definitions/extra/test_env.js b/definitions/extra/test_env.js index 57f56bc..e1fdc29 100644 --- a/definitions/extra/test_env.js +++ b/definitions/extra/test_env.js @@ -1,26 +1,28 @@ -const two_months_ago = constants.fn_past_month(constants.fn_past_month(constants.current_month)); +const date = constants.fn_past_month(constants.current_month); -operate("test_env", { - hasOutput: true, - disabled: true // MUST NOT be commented in main branch -}).queries(ctx => ` -CREATE OR REPLACE TABLE ${ctx.ref("all", "pages")} AS -SELECT * -FROM httparchive.all.pages ${constants.dev_TABLESAMPLE} -WHERE date = '${two_months_ago}'; +var resources_list = [ + //{datasetId: "all", tableId: "pages"}, + {datasetId: "all", tableId: "requests"}, + //{datasetId: "all", tableId: "parsed_css"}, + //{datasetId: "core_web_vitals", tableId: "technologies"}, +]; -CREATE OR REPLACE TABLE ${ctx.ref("all", "requests")} AS -SELECT * -FROM httparchive.all.requests ${constants.dev_TABLESAMPLE} -WHERE date = '${two_months_ago}'; +resources_list.forEach(resource => { + operate(`test_table ${resource.datasetId}_${resource.tableId}`, { + disabled: !constants.is_dev_env // enabled when workflow variable env_name = "dev" + }).tags([ + "test_tables" + ]).queries(ctx => ` +CREATE SCHEMA IF NOT EXISTS ${resource.datasetId}_dev; -CREATE OR REPLACE TABLE ${ctx.ref("all", "parsed_css")} AS -SELECT * -FROM httparchive.all.parsed_css ${constants.dev_TABLESAMPLE} -WHERE date = '${two_months_ago}'; +DROP TABLE ${resource.datasetId}_dev.dev_${resource.tableId}; + +CREATE TABLE ${resource.datasetId}_dev.dev_${resource.tableId} +LIKE httparchive.${resource.datasetId}.${resource.tableId}; -CREATE OR REPLACE TABLE ${ctx.ref("core_web_vitals", "technologies")} AS +INSERT INTO ${resource.datasetId}_dev.dev_${resource.tableId} SELECT * -FROM httparchive.core_web_vitals.technologies -WHERE date = '${two_months_ago}' -`) +FROM httparchive.${resource.datasetId}.${resource.tableId} 
${constants.dev_TABLESAMPLE} +WHERE date = '${date}' + `); +}) \ No newline at end of file diff --git a/definitions/output/all/reprocess_requests.js b/definitions/output/all/reprocess_requests.js new file mode 100644 index 0000000..5b3bc1f --- /dev/null +++ b/definitions/output/all/reprocess_requests.js @@ -0,0 +1,120 @@ +operate(`all_requests_stable_pre`).tags( + ["all_requests_stable"] +).queries(` +CREATE SCHEMA IF NOT EXISTS all_dev; + +DROP TABLE IF EXISTS \`all_dev.requests_stable\`; + +CREATE TABLE \`all_dev.requests_stable\` +( + date DATE NOT NULL OPTIONS(description="YYYY-MM-DD format of the HTTP Archive monthly crawl"), + client STRING NOT NULL OPTIONS(description="Test environment: desktop or mobile"), + page STRING NOT NULL OPTIONS(description="The URL of the page being tested"), + is_root_page BOOL OPTIONS(description="Whether the page is the root of the origin."), + root_page STRING NOT NULL OPTIONS(description="The URL of the root page being tested"), + rank INT64 OPTIONS(description="Site popularity rank, from CrUX"), + url STRING NOT NULL OPTIONS(description="The URL of the request"), + is_main_document BOOL NOT NULL OPTIONS(description="Whether this request corresponds with the main HTML document of the page, which is the first HTML request after redirects"), + type STRING OPTIONS(description="Simplified description of the type of resource (script, html, css, text, other, etc)"), + index INT64 OPTIONS(description="The sequential 0-based index of the request"), + payload JSON OPTIONS(description="JSON-encoded WebPageTest result data for this request"), + summary JSON OPTIONS(description="JSON-encoded summarization of request data"), + request_headers ARRAY> OPTIONS(description="Request headers"), + response_headers ARRAY> OPTIONS(description="Response headers"), + response_body STRING OPTIONS(description="Text-based response body") +) +PARTITION BY date +CLUSTER BY client, is_root_page, type, rank +OPTIONS( + require_partition_filter=true +); +`); 
+ +const iterations = []; +const clients = constants.clients; + +for ( + let month = constants.current_month; + month >= '2024-09-01'; // 2022-07-01 + month = constants.fn_past_month(month)) { + clients.forEach((client) => { + iterations.push({ + month: month, + client: client + }) + }) +} + +iterations.forEach((iteration, i) => { + operate(`all_requests_stable ${iteration.month} ${iteration.client}`).tags( + ["all_requests_stable"] + ).dependencies([ + i===0 ? "all_requests_stable_pre" : `all_requests_stable ${iterations[i-1].month} ${iterations[i-1].client}` + ]).queries(ctx => ` +INSERT INTO \`all_dev.requests_stable\` +SELECT + requests.date, + requests.client, + requests.page, + requests.is_root_page, + requests.root_page, + crux.rank, + requests.url, + requests.is_main_document, + requests.type, + requests.index, + JSON_REMOVE( + SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), + '$._headers' + ) AS payload, + JSON_REMOVE( + SAFE.PARSE_JSON(requests.summary, wide_number_mode => 'round'), + '$.firstHtml', + '$.firstReq', + '$.req_accept_encoding', + '$.req_accept_language', + '$.req_accept', + '$.req_if_modified_since', + '$.req_if_none_match', + '$.req_referer', + '$.req_user_agent', + '$.reqOtherHeaders', + '$.requestid', + '$.resp_age', + '$.resp_cache_control', + '$.resp_content_length', + '$.resp_content_type', + '$.resp_date', + '$.resp_etag', + '$.resp_last_modified', + '$.resp_server', + '$.resp_vary', + '$.respOtherHeaders', + '$.startedDateTime', + '$.url', + '$.urlShort' + ) as summary, + requests.request_headers, + requests.response_headers, + requests.response_body +FROM ( + SELECT * + FROM \`all.requests\` ${constants.dev_TABLESAMPLE} + WHERE date = '${iteration.month}' + AND client = '${iteration.client}') AS requests +LEFT JOIN ( + SELECT DISTINCT + CONCAT(origin, '/') AS page, + experimental.popularity.rank AS rank + FROM ${ctx.resolve("chrome-ux-report", "experimental", "global")} + WHERE yyyymm = 
${constants.fn_past_month(iteration.month).substring(0, 7).replace('-', '')} +) AS crux +ON requests.root_page = crux.page; + `) +}); diff --git a/definitions/output/core_web_vitals/technologies.js b/definitions/output/core_web_vitals/technologies.js index 111a586..49c578f 100644 --- a/definitions/output/core_web_vitals/technologies.js +++ b/definitions/output/core_web_vitals/technologies.js @@ -24,7 +24,7 @@ CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor F ); CREATE TEMP FUNCTION GET_LIGHTHOUSE_CATEGORY_SCORES(categories STRING) -RETURNS STRUCT +RETURNS STRUCT LANGUAGE js AS ''' try { const $ = JSON.parse(categories); @@ -74,7 +74,7 @@ crux AS ( END AS rank, CONCAT(origin, '/') AS root_page_url, IF(device = 'desktop', 'desktop', 'mobile') AS client, - + # CWV IS_NON_ZERO(fast_fid, avg_fid, slow_fid) AS any_fid, IS_GOOD(fast_fid, avg_fid, slow_fid) AS good_fid, @@ -82,15 +82,15 @@ crux AS ( IS_GOOD(small_cls, medium_cls, large_cls) AS good_cls, IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp) AS any_lcp, IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_lcp, - + (IS_GOOD(fast_inp, avg_inp, slow_inp) OR fast_inp IS NULL) AND IS_GOOD(small_cls, medium_cls, large_cls) AND IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_cwv_2024, - + (IS_GOOD(fast_fid, avg_fid, slow_fid) OR fast_fid IS NULL) AND IS_GOOD(small_cls, medium_cls, large_cls) AND IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_cwv_2023, - + # WV IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp) AS any_fcp, IS_GOOD(fast_fcp, avg_fcp, slow_fcp) AS good_fcp, @@ -114,7 +114,7 @@ technologies AS ( ${ctx.resolve("all", "pages")}, UNNEST(technologies) AS technology WHERE - date = '${past_month}' AND + date = '${past_month}' ${constants.dev_rank5000_filter} AND technology.technology IS NOT NULL AND technology.technology != '' UNION ALL @@ -125,7 +125,7 @@ UNION ALL FROM ${ctx.resolve("all", "pages")} WHERE - date = '${past_month}' + date = '${past_month}' ${constants.dev_rank5000_filter} ), categories AS ( @@ 
-137,7 +137,7 @@ categories AS ( UNNEST(technologies) AS technology, UNNEST(technology.categories) AS category WHERE - date = '${past_month}' + date = '${past_month}' ${constants.dev_rank5000_filter} GROUP BY app UNION ALL @@ -149,7 +149,7 @@ UNION ALL UNNEST(technologies) AS technology, UNNEST(technology.categories) AS category WHERE - date = '${past_month}' AND + date = '${past_month}' ${constants.dev_rank5000_filter} AND client = 'mobile' ), @@ -165,7 +165,7 @@ summary_stats AS ( FROM ${ctx.resolve("all", "pages")} WHERE - date = '${past_month}' + date = '${past_month}' ${constants.dev_rank5000_filter} ), lab_data AS ( @@ -206,7 +206,7 @@ SELECT app, client, COUNT(0) AS origins, - + # CrUX data COUNTIF(good_fid) AS origins_with_good_fid, COUNTIF(good_cls) AS origins_with_good_cls, @@ -227,19 +227,19 @@ SELECT SAFE_DIVIDE(COUNTIF(good_cwv_2024), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv, SAFE_DIVIDE(COUNTIF(good_cwv_2024), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv_2024, SAFE_DIVIDE(COUNTIF(good_cwv_2023), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv_2023, - + # Lighthouse data APPROX_QUANTILES(accessibility, 1000)[OFFSET(500)] AS median_lighthouse_score_accessibility, APPROX_QUANTILES(best_practices, 1000)[OFFSET(500)] AS median_lighthouse_score_best_practices, APPROX_QUANTILES(performance, 1000)[OFFSET(500)] AS median_lighthouse_score_performance, APPROX_QUANTILES(pwa, 1000)[OFFSET(500)] AS median_lighthouse_score_pwa, APPROX_QUANTILES(seo, 1000)[OFFSET(500)] AS median_lighthouse_score_seo, - + # Page weight stats APPROX_QUANTILES(bytesTotal, 1000)[OFFSET(500)] AS median_bytes_total, APPROX_QUANTILES(bytesJS, 1000)[OFFSET(500)] AS median_bytes_js, APPROX_QUANTILES(bytesImg, 1000)[OFFSET(500)] AS median_bytes_image - + FROM lab_data JOIN diff --git a/definitions/sources/declares.js b/definitions/sources/declares.js index ca29d1e..87eb47a 100644 --- a/definitions/sources/declares.js +++ 
b/definitions/sources/declares.js @@ -16,7 +16,7 @@ for (const table of crux_tables) { }); assert(`${table}_not_empty`).query(ctx => ` -SELECT +SELECT 'No data for the specified date' AS error_message FROM ${ctx.ref("chrome-ux-report", "materialized", table)} WHERE yyyymm = ${past_month} @@ -24,3 +24,9 @@ GROUP BY yyyymm HAVING COUNT(1) = 0 `); } + +declare({ + database: "chrome-ux-report", + schema: "experimental", + name: "global", +}); diff --git a/src/dataform.js b/src/dataform.js index 5b8159b..19a02f7 100644 --- a/src/dataform.js +++ b/src/dataform.js @@ -14,18 +14,6 @@ async function get_compilation_results(repoURI) { compilationResult: { releaseConfig: `${repoURI}/releaseConfigs/production` } - }, dev_request = { - parent: repoURI, - compilationResult: { - gitCommitish: 'dev' - }, - codeCompilationConfig: { - schemaSuffix: 'dev', - tablePrefix: 'dev', - vars: { - current_month: '2024-08-01', - }, - } }; console.log(`Creating Dataform compilation result: ${JSON.stringify(request, null, 2)}`); From af749fc8bd6c49dde627ae62a4d4fb73f3df1723 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Tue, 8 Oct 2024 18:20:08 +0200 Subject: [PATCH 5/5] Stable `all.pages` (#8) * draft * operations instead of publish * function updates * add column ddl * alter query post update * column descriptions * comment * monthly iterated test * optimise test tables creation * iterating using forEach * table reprocessing * more metrics * one more custom metrics * sort * origin-trials removed from other * native json pruning * more payload pruning * further request headers cleanup * test variables * crux * crux cleanup --- definitions/extra/test_env.js | 4 +- definitions/output/all/reprocess_pages.js | 267 +++++++++++++++++++ definitions/output/all/reprocess_requests.js | 40 +-- includes/constants.js | 13 +- workflow_settings.yaml | 4 +- 5 files changed, 300 insertions(+), 28 deletions(-) create mode 100644 
definitions/output/all/reprocess_pages.js diff --git a/definitions/extra/test_env.js b/definitions/extra/test_env.js index e1fdc29..b1c5323 100644 --- a/definitions/extra/test_env.js +++ b/definitions/extra/test_env.js @@ -1,7 +1,7 @@ const date = constants.fn_past_month(constants.current_month); var resources_list = [ - //{datasetId: "all", tableId: "pages"}, + {datasetId: "all", tableId: "pages"}, {datasetId: "all", tableId: "requests"}, //{datasetId: "all", tableId: "parsed_css"}, //{datasetId: "core_web_vitals", tableId: "technologies"}, @@ -25,4 +25,4 @@ SELECT * FROM httparchive.${resource.datasetId}.${resource.tableId} ${constants.dev_TABLESAMPLE} WHERE date = '${date}' `); -}) \ No newline at end of file +}) diff --git a/definitions/output/all/reprocess_pages.js b/definitions/output/all/reprocess_pages.js new file mode 100644 index 0000000..5f5c4f2 --- /dev/null +++ b/definitions/output/all/reprocess_pages.js @@ -0,0 +1,267 @@ +operate(`all_pages_stable_pre`).tags( + ["all_pages_stable"] +).queries(` +CREATE SCHEMA IF NOT EXISTS all_dev; + +DROP TABLE IF EXISTS \`all_dev.pages_stable\`; + +CREATE TABLE \`all_dev.pages_stable\` +( + date DATE NOT NULL OPTIONS(description="YYYY-MM-DD format of the HTTP Archive monthly crawl"), + client STRING NOT NULL OPTIONS(description="Test environment: desktop or mobile"), + page STRING NOT NULL OPTIONS(description="The URL of the page being tested"), + is_root_page BOOL NOT NULL OPTIONS(description="Whether the page is the root of the origin"), + root_page STRING NOT NULL OPTIONS(description="The URL of the root page being tested, the origin followed by /"), + rank INT64 OPTIONS(description="Site popularity rank, from CrUX"), + wptid STRING OPTIONS(description="ID of the WebPageTest results"), + payload JSON OPTIONS(description="JSON-encoded WebPageTest results for the page"), + summary JSON OPTIONS(description="JSON-encoded summarization of the page-level data"), + custom_metrics STRUCT< + a11y JSON, + cms JSON, + 
cookies JSON, + css_variables JSON, + ecommerce JSON, + element_count JSON, + javascript JSON, + markup JSON, + media JSON, + origin_trials JSON, + performance JSON, + privacy JSON, + responsive_images JSON, + robots_txt JSON, + security JSON, + structured_data JSON, + third_parties JSON, + well_known JSON, + wpt_bodies JSON, + other JSON + > OPTIONS(description="Custom metrics from WebPageTest"), + lighthouse JSON OPTIONS(description="JSON-encoded Lighthouse report"), + features ARRAY> OPTIONS(description="Blink features detected at runtime (see https://chromestatus.com/features)"), + technologies ARRAY OPTIONS(description="List of categories to which this technology belongs"), + info ARRAY OPTIONS(description="Additional metadata about the detected technology, ie version number") + >> OPTIONS(description="Technologies detected at runtime (see https://www.wappalyzer.com/)"), + metadata JSON OPTIONS(description="Additional metadata about the test") +) +PARTITION BY date +CLUSTER BY client, is_root_page, rank, page +OPTIONS( + require_partition_filter=true +); +`); + + +const iterations = []; +const clients = constants.clients; + +for ( + let month = constants.current_month; + month >= '2024-09-01'; // 2022-07-01 + month = constants.fn_past_month(month)) { + clients.forEach((client) => { + iterations.push({ + month: month, + client: client + }) + }) +} + +iterations.forEach((iteration, i) => { + operate(`all_pages_stable_update ${iteration.month} ${iteration.client}`).tags([ + "all_pages_stable" + ]).dependencies([ + i===0 ? 
"all_pages_stable_pre" : `all_pages_stable_update ${iterations[i-1].month} ${iterations[i-1].client}` + ]).queries(ctx => ` +INSERT INTO \`all_dev.pages_stable\` +SELECT + date, + client, + page, + is_root_page, + root_page, + rank, + wptid, + JSON_REMOVE( + SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), + '$._metadata', + '$._detected', + '$._detected_apps', + '$._detected_technologies', + '$._detected_raw', + '$._custom', + '$._00_reset', + '$._a11y', + '$._ads', + '$._almanac', + '$._aurora', + '$._avg_dom_depth', + '$._cms', + '$._Colordepth', + '$._cookies', + '$._crawl_links', + '$._css-variables', + '$._css', + '$._doctype', + '$._document_height', + '$._document_width', + '$._Dpi', + '$._ecommerce', + '$._element_count', + '$._event-names', + '$._fugu-apis', + '$._generated-content', + '$._has_shadow_root', + '$._Images', + '$._img-loading-attr', + '$._initiators', + '$._inline_style_bytes', + '$._javascript', + '$._lib-detector-version', + '$._localstorage_size', + '$._markup', + '$._media', + '$._meta_viewport', + '$._num_iframes', + '$._num_scripts_async', + '$._num_scripts_sync', + '$._num_scripts', + '$._observers', + '$._origin-trials', + '$._parsed_css', + '$._performance', + '$._privacy-sandbox', + '$._privacy', + '$._pwa', + '$._quirks_mode', + '$._Resolution', + '$._responsive_images', + '$._robots_meta', + '$._robots_txt', + '$._sass', + '$._security', + '$._sessionstorage_size', + '$._structured-data', + '$._third-parties', + '$._usertiming', + '$._valid-head', + '$._well-known', + '$._wpt_bodies', + '$._blinkFeatureFirstUsed', + '$._CrUX' + ) AS payload, + JSON_SET( + JSON_REMOVE( + SAFE.PARSE_JSON(summary, wide_number_mode => 'round'), + '$._adult_site', + '$.archive', + '$.avg_dom_depth', + '$.crawlid', + '$.createDate', + '$.doctype', + '$.document_height', + '$.document_width', + '$.label', + '$.localstorage_size', + '$.meta_viewport', + '$.metadata', + '$.num_iframes', + '$.num_scripts_async', + '$.num_scripts_sync', + 
'$.num_scripts', + '$.pageid', + '$.PageSpeed', + '$.rank', + '$.sessionstorage_size', + '$.startedDateTime', + '$.url', + '$.urlhash', + '$.urlShort', + '$.usertiming', + '$.wptid', + '$.wptrun' + ), + "$.crux", + JSON_QUERY(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._CrUX") + ) AS summary, + STRUCT< + a11y JSON, + cms JSON, + cookies JSON, + css_variables JSON, + ecommerce JSON, + element_count JSON, + javascript JSON, + markup JSON, + media JSON, + origin_trials JSON, + performance JSON, + privacy JSON, + responsive_images JSON, + robots_txt JSON, + security JSON, + structured_data JSON, + third_parties JSON, + well_known JSON, + wpt_bodies JSON, + other JSON + >( + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.a11y"), + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.cms"), + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.cookies"), + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.css-variables"), + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.ecommerce"), + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.element_count"), + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.javascript"), + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.markup"), + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.media"), + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.origin-trials"), + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.performance"), + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.privacy"), + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.responsive_images"), + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.robots_txt"), + 
JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.security"), + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.structured-data"), + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.third-parties"), + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.well-known"), + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.wpt_bodies"), + JSON_REMOVE( + SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), + '$.a11y', + '$.cms', + '$.cookies', + '$.css-variables', + '$.ecommerce', + '$.element_count', + '$.javascript', + '$.markup', + '$.media', + '$.origin-trials', + '$.performance', + '$.privacy', + '$.responsive_images', + '$.robots_txt', + '$.security', + '$.structured-data', + '$.third-parties', + '$.well-known', + '$.wpt_bodies' + ) + ) AS custom_metrics, + SAFE.PARSE_JSON(lighthouse, wide_number_mode => 'round') AS lighthouse, + features, + technologies, + SAFE.PARSE_JSON(metadata, wide_number_mode => 'round') AS metadata +FROM \`all.pages\` +WHERE + date = "${iteration.month}" AND + client = "${iteration.client}" ${constants.dev_rank_filter}; + `) +}) \ No newline at end of file diff --git a/definitions/output/all/reprocess_requests.js b/definitions/output/all/reprocess_requests.js index 5b3bc1f..af0149d 100644 --- a/definitions/output/all/reprocess_requests.js +++ b/definitions/output/all/reprocess_requests.js @@ -57,6 +57,22 @@ iterations.forEach((iteration, i) => { ).dependencies([ i===0 ? 
"all_requests_stable_pre" : `all_requests_stable ${iterations[i-1].month} ${iterations[i-1].client}` ]).queries(ctx => ` +CREATE TEMP FUNCTION PRUNE_HEADERS( + jsonObject JSON +) RETURNS JSON +LANGUAGE js AS """ +try { + for (const [key, value] of Object.entries(jsonObject)) { + if(key.startsWith('req_') || key.startsWith('resp_')) { + delete jsonObject[key]; + } + } + return jsonObject; +} catch (e) { + return null; +} +"""; + INSERT INTO \`all_dev.requests_stable\` SELECT requests.date, @@ -71,35 +87,21 @@ SELECT requests.index, JSON_REMOVE( SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), - '$._headers' + '$._headers', + '$.request.headers', + '$.response.headers' ) AS payload, - JSON_REMOVE( + PRUNE_HEADERS(JSON_REMOVE( SAFE.PARSE_JSON(requests.summary, wide_number_mode => 'round'), '$.firstHtml', '$.firstReq', - '$.req_accept_encoding', - '$.req_accept_language', - '$.req_accept', - '$.req_if_modified_since', - '$.req_if_none_match', - '$.req_referer', - '$.req_user_agent', '$.reqOtherHeaders', '$.requestid', - '$.resp_age', - '$.resp_cache_control', - '$.resp_content_length', - '$.resp_content_type', - '$.resp_date', - '$.resp_etag', - '$.resp_last_modified', - '$.resp_server', - '$.resp_vary', '$.respOtherHeaders', '$.startedDateTime', '$.url', '$.urlShort' - ) as summary, + )) as summary, requests.request_headers, requests.response_headers, requests.response_body diff --git a/includes/constants.js b/includes/constants.js index 40054b2..fd4bed5 100644 --- a/includes/constants.js +++ b/includes/constants.js @@ -11,11 +11,13 @@ const booleans = ['TRUE', 'FALSE'], [ dev_TABLESAMPLE, - dev_rank5000_filter - ] = dataform.projectConfig.vars.env_name == "dev" ? [ + dev_rank_filter, + is_dev_env + ] = dataform.projectConfig.vars.env_name == 'dev' ? 
[ "TABLESAMPLE SYSTEM (0.001 PERCENT)", - "AND rank = 5000" - ] : ["", ""]; + "AND rank <= 10000", + true + ] : ["", "", false]; module.exports = { today, @@ -25,5 +27,6 @@ module.exports = { clients, booleans, dev_TABLESAMPLE, - dev_rank5000_filter + dev_rank_filter, + is_dev_env }; diff --git a/workflow_settings.yaml b/workflow_settings.yaml index dcbbe42..4aeb4eb 100644 --- a/workflow_settings.yaml +++ b/workflow_settings.yaml @@ -5,5 +5,5 @@ defaultAssertionDataset: dataform_assertions vars: placeholder: value # keeping it to avoid 'empty vars' error - # env_name: dev # MUST be commented in main branch, enables processing sampled data - # today: 2024-09-01 # MUST be commented in main branch, allows processing historical data + env_name: dev # MUST be commented in main branch, enables processing sampled data + today: 2024-09-20 # MUST be commented in main branch, allows processing historical data