Skip to content

Commit

Permalink
Merge branch 'main' into close-whitefish
Browse files Browse the repository at this point in the history
  • Loading branch information
max-ostapenko authored Oct 8, 2024
2 parents 7614512 + af749fc commit 16fd375
Show file tree
Hide file tree
Showing 13 changed files with 466 additions and 88 deletions.
14 changes: 13 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
# HTTP Archive BigQuery pipeline with Dataform

## Datasets
This repo handles the HTTP Archive data pipeline, which takes the results of the monthly HTTP Archive run and saves them to the `httparchive` dataset in BigQuery.

## Pipelines

The pipelines run in the Dataform service in Google Cloud Platform (GCP) and are kicked off automatically on crawl completion and other events. The code in the `main` branch is used on each triggered pipeline run.

### Crawl results

Expand Down Expand Up @@ -70,3 +74,11 @@ Tag: `crawl_results_legacy`
1. In workflow settings vars set `dev_name: dev` to process sampled data in dev workspace.
2. Change `today` variable to a date in the past. May be helpful for testing pipelines based on `chrome-ux-report` data.
3. `definitions/extra/test_env.sqlx` script helps set up the tables required to run pipelines in a dev workspace. It's disabled by default.

### Error Monitoring

The issues within the pipeline are being tracked using the following alerts:
1. the event trigger processing fails - [Dataform Trigger Function Error](https://console.cloud.google.com/monitoring/alerting/policies/3950167380893746326?authuser=7&project=httparchive)
2. a job in the workflow fails - [Dataform Workflow Invocation Failed](https://console.cloud.google.com/monitoring/alerting/policies/7137542315653007241?authuser=7&project=httparchive)

Error notifications are sent to the [#10x-infra](https://httparchive.slack.com/archives/C030V4WAVL3) Slack channel.
59 changes: 20 additions & 39 deletions definitions/extra/test_env.js
Original file line number Diff line number Diff line change
@@ -1,45 +1,26 @@
const past_month = constants.fn_past_month(constants.current_month);
const date = constants.fn_past_month(constants.current_month);

operate("test_env", {
hasOutput: true,
disabled: true // MUST be disabled in main branch
}).queries(ctx => `
CREATE SCHEMA IF NOT EXISTS all_dev;
var resources_list = [
{datasetId: "all", tableId: "pages"},
{datasetId: "all", tableId: "requests"},
//{datasetId: "all", tableId: "parsed_css"},
//{datasetId: "core_web_vitals", tableId: "technologies"},
];

CREATE TABLE IF NOT EXISTS ${ctx.resolve("all", "pages")} AS
SELECT *
FROM httparchive.all.pages
WHERE
date = '${constants.current_month}'
${constants.dev_rank5000_filter};
CREATE TABLE IF NOT EXISTS ${ctx.resolve("all", "requests")} AS
SELECT *
FROM httparchive.all.requests ${constants.dev_TABLESAMPLE}
WHERE date = '${constants.current_month}';
CREATE TABLE IF NOT EXISTS ${ctx.resolve("all", "parsed_css")} AS
SELECT *
FROM httparchive.all.parsed_css
WHERE date = '${constants.current_month}'
${constants.dev_rank5000_filter};
resources_list.forEach(resource => {
operate(`test_table ${resource.datasetId}_${resource.tableId}`, {
hasOutput: true
}).queries(ctx => `
CREATE SCHEMA IF NOT EXISTS ${resource.datasetId}_dev;
CREATE SCHEMA IF NOT EXISTS core_web_vitals_dev;
DROP TABLE ${resource.datasetId}_dev.dev_${resource.tableId};
CREATE TABLE IF NOT EXISTS ${ctx.resolve("core_web_vitals", "technologies")} AS
SELECT *
FROM httparchive.core_web_vitals.technologies ${constants.dev_TABLESAMPLE}
WHERE date = '${past_month}';
CREATE SCHEMA IF NOT EXISTS blink_features_dev;
CREATE TABLE IF NOT EXISTS ${ctx.resolve("blink_features", "usage")} AS
SELECT *
FROM httparchive.blink_features.usage ${constants.dev_TABLESAMPLE}
WHERE yyyymmdd = '${past_month}';
CREATE TABLE ${resource.datasetId}_dev.dev_${resource.tableId}
LIKE httparchive.${resource.datasetId}.${resource.tableId};
CREATE TABLE IF NOT EXISTS ${ctx.resolve("blink_features", "features")} AS
INSERT INTO ${resource.datasetId}_dev.dev_${resource.tableId}
SELECT *
FROM httparchive.blink_features.features ${constants.dev_TABLESAMPLE}
WHERE yyyymmdd = DATE '${past_month}';
`)
FROM httparchive.${resource.datasetId}.${resource.tableId} ${constants.dev_TABLESAMPLE}
WHERE date = '${date}'
`);
});
267 changes: 267 additions & 0 deletions definitions/output/all/reprocess_pages.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,267 @@
// DDL executed by the `all_pages_stable_pre` operation: ensures the `all_dev`
// schema exists, then drops and recreates `all_dev.pages_stable` from scratch
// as a date-partitioned, clustered table that requires a partition filter.
// The template content is kept flush-left so the emitted SQL text is not
// affected by JavaScript indentation.
const pagesStableDdl = `
CREATE SCHEMA IF NOT EXISTS all_dev;
DROP TABLE IF EXISTS \`all_dev.pages_stable\`;
CREATE TABLE \`all_dev.pages_stable\`
(
date DATE NOT NULL OPTIONS(description="YYYY-MM-DD format of the HTTP Archive monthly crawl"),
client STRING NOT NULL OPTIONS(description="Test environment: desktop or mobile"),
page STRING NOT NULL OPTIONS(description="The URL of the page being tested"),
is_root_page BOOL NOT NULL OPTIONS(description="Whether the page is the root of the origin"),
root_page STRING NOT NULL OPTIONS(description="The URL of the root page being tested, the origin followed by /"),
rank INT64 OPTIONS(description="Site popularity rank, from CrUX"),
wptid STRING OPTIONS(description="ID of the WebPageTest results"),
payload JSON OPTIONS(description="JSON-encoded WebPageTest results for the page"),
summary JSON OPTIONS(description="JSON-encoded summarization of the page-level data"),
custom_metrics STRUCT<
a11y JSON,
cms JSON,
cookies JSON,
css_variables JSON,
ecommerce JSON,
element_count JSON,
javascript JSON,
markup JSON,
media JSON,
origin_trials JSON,
performance JSON,
privacy JSON,
responsive_images JSON,
robots_txt JSON,
security JSON,
structured_data JSON,
third_parties JSON,
well_known JSON,
wpt_bodies JSON,
other JSON
> OPTIONS(description="Custom metrics from WebPageTest"),
lighthouse JSON OPTIONS(description="JSON-encoded Lighthouse report"),
features ARRAY<STRUCT<
feature STRING OPTIONS(description="Blink feature name"),
id STRING OPTIONS(description="Blink feature ID"),
type STRING OPTIONS(description="Blink feature type (css, default)")
>> OPTIONS(description="Blink features detected at runtime (see https://chromestatus.com/features)"),
technologies ARRAY<STRUCT<
technology STRING OPTIONS(description="Name of the detected technology"),
categories ARRAY<STRING> OPTIONS(description="List of categories to which this technology belongs"),
info ARRAY<STRING> OPTIONS(description="Additional metadata about the detected technology, ie version number")
>> OPTIONS(description="Technologies detected at runtime (see https://www.wappalyzer.com/)"),
metadata JSON OPTIONS(description="Additional metadata about the test")
)
PARTITION BY date
CLUSTER BY client, is_root_page, rank, page
OPTIONS(
require_partition_filter=true
);
`;

// Register the pre-step under the shared "all_pages_stable" tag.
operate("all_pages_stable_pre")
  .tags(["all_pages_stable"])
  .queries(pagesStableDdl);


// Ordered work list of { month, client } pairs to reprocess.
// Starts at constants.current_month and repeatedly applies
// constants.fn_past_month until the value falls below the cutoff string;
// for every month visited, one entry is appended per client, in the order
// the clients are listed.
const clients = constants.clients;
const iterations = [];

for (
  let month = constants.current_month;
  month >= '2024-09-01'; // 2022-07-01
  month = constants.fn_past_month(month)
) {
  for (const client of clients) {
    iterations.push({ month, client });
  }
}

// Register one `all_pages_stable_update <month> <client>` operation per entry
// in `iterations`. Each operation depends on the one registered for the
// previous entry (the first depends on `all_pages_stable_pre`), so the
// inserts form a single sequential chain.
//
// Each query copies the rows of `all.pages` for that date and client into
// `all_dev.pages_stable`, parsing the payload, summary, custom_metrics,
// lighthouse and metadata columns with SAFE.PARSE_JSON:
//   - payload: the listed `$._*` keys are removed;
//   - summary: the listed keys are removed and `$.crux` is set from the
//     payload's `$._CrUX` value;
//   - custom_metrics: the known keys are split out into STRUCT fields and
//     whatever remains is kept in `other`.
// The template content is kept flush-left so the emitted SQL text is not
// affected by JavaScript indentation.
iterations.forEach(({ month, client }, index) => {
  const previous = iterations[index - 1];
  const upstream = index === 0
    ? "all_pages_stable_pre"
    : `all_pages_stable_update ${previous.month} ${previous.client}`;

  operate(`all_pages_stable_update ${month} ${client}`)
    .tags(["all_pages_stable"])
    .dependencies([upstream])
    .queries(ctx => `
INSERT INTO \`all_dev.pages_stable\`
SELECT
date,
client,
page,
is_root_page,
root_page,
rank,
wptid,
JSON_REMOVE(
SAFE.PARSE_JSON(payload, wide_number_mode => 'round'),
'$._metadata',
'$._detected',
'$._detected_apps',
'$._detected_technologies',
'$._detected_raw',
'$._custom',
'$._00_reset',
'$._a11y',
'$._ads',
'$._almanac',
'$._aurora',
'$._avg_dom_depth',
'$._cms',
'$._Colordepth',
'$._cookies',
'$._crawl_links',
'$._css-variables',
'$._css',
'$._doctype',
'$._document_height',
'$._document_width',
'$._Dpi',
'$._ecommerce',
'$._element_count',
'$._event-names',
'$._fugu-apis',
'$._generated-content',
'$._has_shadow_root',
'$._Images',
'$._img-loading-attr',
'$._initiators',
'$._inline_style_bytes',
'$._javascript',
'$._lib-detector-version',
'$._localstorage_size',
'$._markup',
'$._media',
'$._meta_viewport',
'$._num_iframes',
'$._num_scripts_async',
'$._num_scripts_sync',
'$._num_scripts',
'$._observers',
'$._origin-trials',
'$._parsed_css',
'$._performance',
'$._privacy-sandbox',
'$._privacy',
'$._pwa',
'$._quirks_mode',
'$._Resolution',
'$._responsive_images',
'$._robots_meta',
'$._robots_txt',
'$._sass',
'$._security',
'$._sessionstorage_size',
'$._structured-data',
'$._third-parties',
'$._usertiming',
'$._valid-head',
'$._well-known',
'$._wpt_bodies',
'$._blinkFeatureFirstUsed',
'$._CrUX'
) AS payload,
JSON_SET(
JSON_REMOVE(
SAFE.PARSE_JSON(summary, wide_number_mode => 'round'),
'$._adult_site',
'$.archive',
'$.avg_dom_depth',
'$.crawlid',
'$.createDate',
'$.doctype',
'$.document_height',
'$.document_width',
'$.label',
'$.localstorage_size',
'$.meta_viewport',
'$.metadata',
'$.num_iframes',
'$.num_scripts_async',
'$.num_scripts_sync',
'$.num_scripts',
'$.pageid',
'$.PageSpeed',
'$.rank',
'$.sessionstorage_size',
'$.startedDateTime',
'$.url',
'$.urlhash',
'$.urlShort',
'$.usertiming',
'$.wptid',
'$.wptrun'
),
"$.crux",
JSON_QUERY(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._CrUX")
) AS summary,
STRUCT<
a11y JSON,
cms JSON,
cookies JSON,
css_variables JSON,
ecommerce JSON,
element_count JSON,
javascript JSON,
markup JSON,
media JSON,
origin_trials JSON,
performance JSON,
privacy JSON,
responsive_images JSON,
robots_txt JSON,
security JSON,
structured_data JSON,
third_parties JSON,
well_known JSON,
wpt_bodies JSON,
other JSON
>(
JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.a11y"),
JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.cms"),
JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.cookies"),
JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.css-variables"),
JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.ecommerce"),
JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.element_count"),
JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.javascript"),
JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.markup"),
JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.media"),
JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.origin-trials"),
JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.performance"),
JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.privacy"),
JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.responsive_images"),
JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.robots_txt"),
JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.security"),
JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.structured-data"),
JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.third-parties"),
JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.well-known"),
JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.wpt_bodies"),
JSON_REMOVE(
SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'),
'$.a11y',
'$.cms',
'$.cookies',
'$.css-variables',
'$.ecommerce',
'$.element_count',
'$.javascript',
'$.markup',
'$.media',
'$.origin-trials',
'$.performance',
'$.privacy',
'$.responsive_images',
'$.robots_txt',
'$.security',
'$.structured-data',
'$.third-parties',
'$.well-known',
'$.wpt_bodies'
)
) AS custom_metrics,
SAFE.PARSE_JSON(lighthouse, wide_number_mode => 'round') AS lighthouse,
features,
technologies,
SAFE.PARSE_JSON(metadata, wide_number_mode => 'round') AS metadata
FROM \`all.pages\`
WHERE
date = "${month}" AND
client = "${client}" ${constants.dev_rank_filter};
`);
});
Loading

0 comments on commit 16fd375

Please sign in to comment.