Skip to content

Commit

Permalink
formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
max-ostapenko committed Oct 8, 2024
1 parent af749fc commit 4f09997
Show file tree
Hide file tree
Showing 10 changed files with 130 additions and 129 deletions.
8 changes: 5 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,13 +61,15 @@ Tag: `crawl_results_legacy`

### Dataform development workspace hints

1. In workflow settings vars set `dev_name: dev` to process sampled data in dev workspace.
2. Change `current_month` variable to a month in the past. May be helpful for testing pipelines based on `chrome-ux-report` data.
3. `definitions/extra/test_env.sqlx` script helps to setup the tables required to run pipelines when in dev workspace. It's disabled by default.
1. In workflow settings vars:
   1. set `env_name: dev` to process sampled data in the dev workspace.
   2. change the `today` variable to a month in the past. May be helpful for testing pipelines based on `chrome-ux-report` data.
2. The `definitions/extra/test_env.sqlx` script helps set up the tables required to run pipelines in the dev workspace. It's disabled by default.

### Error Monitoring

The issues within the pipeline are being tracked using the following alerts:

1. the event trigger processing fails - [Dataform Trigger Function Error](https://console.cloud.google.com/monitoring/alerting/policies/3950167380893746326?authuser=7&project=httparchive)
2. a job in the workflow fails - [Dataform Workflow Invocation Failed](https://console.cloud.google.com/monitoring/alerting/policies/7137542315653007241?authuser=7&project=httparchive)

Expand Down
17 changes: 8 additions & 9 deletions definitions/extra/test_env.js
Original file line number Diff line number Diff line change
@@ -1,18 +1,17 @@
const date = constants.fn_past_month(constants.current_month);

var resources_list = [
{datasetId: "all", tableId: "pages"},
{datasetId: "all", tableId: "requests"},
//{datasetId: "all", tableId: "parsed_css"},
//{datasetId: "core_web_vitals", tableId: "technologies"},
{ datasetId: "all", tableId: "pages" },
{ datasetId: "all", tableId: "requests" },
//{datasetId: "all", tableId: "parsed_css"},
//{datasetId: "core_web_vitals", tableId: "technologies"},
];

resources_list.forEach(resource => {
operate(`test_table ${resource.datasetId}_${resource.tableId}`, {
disabled: !constants.is_dev_env // enabled when workflow variable env_name = "dev"
}).tags([
"test_tables"
]).queries(ctx => `
operate(
`test_table ${resource.datasetId}_${resource.tableId}`,
{ hasOutput: true }
).queries(ctx => `
CREATE SCHEMA IF NOT EXISTS ${resource.datasetId}_dev;
DROP TABLE ${resource.datasetId}_dev.dev_${resource.tableId};
Expand Down
18 changes: 9 additions & 9 deletions definitions/output/all/pages.js
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
publish("pages", {
type: "incremental",
protected: true,
schema: "all",
bigquery: {
partitionBy: "date",
clusterBy: ["client", "is_root_page", "rank"],
requirePartitionFilter: true
},
tags: ["crawl_results_all"],
type: "incremental",
protected: true,
schema: "all",
bigquery: {
partitionBy: "date",
clusterBy: ["client", "is_root_page", "rank"],
requirePartitionFilter: true
},
tags: ["crawl_results_all"],
}).preOps(ctx => `
DELETE FROM ${ctx.self()}
WHERE date = '${constants.current_month}';
Expand Down
18 changes: 9 additions & 9 deletions definitions/output/all/parsed_css.js
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
publish("parsed_css", {
type: "incremental",
protected: true,
schema: "all",
bigquery: {
partitionBy: "date",
clusterBy: ["client", "is_root_page", "rank", "page"],
requirePartitionFilter: true
},
tags: ["crawl_results_all"],
type: "incremental",
protected: true,
schema: "all",
bigquery: {
partitionBy: "date",
clusterBy: ["client", "is_root_page", "rank", "page"],
requirePartitionFilter: true
},
tags: ["crawl_results_all"],
}).preOps(ctx => `
DELETE FROM ${ctx.self()}
WHERE date = '${constants.current_month}';
Expand Down
54 changes: 27 additions & 27 deletions definitions/output/all/reprocess_pages.js
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ CREATE TABLE \`all_dev.pages_stable\`
structured_data JSON,
third_parties JSON,
well_known JSON,
wpt_bodies JSON,
wpt_bodies JSON,
other JSON
> OPTIONS(description="Custom metrics from WebPageTest"),
lighthouse JSON OPTIONS(description="JSON-encoded Lighthouse report"),
Expand Down Expand Up @@ -66,19 +66,19 @@ for (
let month = constants.current_month;
month >= '2024-09-01'; // 2022-07-01
month = constants.fn_past_month(month)) {
clients.forEach((client) => {
iterations.push({
month: month,
client: client
})
clients.forEach((client) => {
iterations.push({
month: month,
client: client
})
})
}

iterations.forEach((iteration, i) => {
operate(`all_pages_stable_update ${iteration.month} ${iteration.client}`).tags([
"all_pages_stable"
]).dependencies([
i===0 ? "all_pages_stable_pre" : `all_pages_stable_update ${iterations[i-1].month} ${iterations[i-1].client}`
i === 0 ? "all_pages_stable_pre" : `all_pages_stable_update ${iterations[i - 1].month} ${iterations[i - 1].client}`
]).queries(ctx => `
INSERT INTO \`all_dev.pages_stable\`
SELECT
Expand Down Expand Up @@ -159,7 +159,7 @@ SELECT
) AS payload,
JSON_SET(
JSON_REMOVE(
SAFE.PARSE_JSON(summary, wide_number_mode => 'round'),
SAFE.PARSE_JSON(summary, wide_number_mode => 'round'),
'$._adult_site',
'$.archive',
'$.avg_dom_depth',
Expand Down Expand Up @@ -233,25 +233,25 @@ SELECT
JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.well-known"),
JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.wpt_bodies"),
JSON_REMOVE(
SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'),
'$.a11y',
'$.cms',
'$.cookies',
'$.css-variables',
'$.ecommerce',
'$.element_count',
SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'),
'$.a11y',
'$.cms',
'$.cookies',
'$.css-variables',
'$.ecommerce',
'$.element_count',
'$.javascript',
'$.markup',
'$.media',
'$.origin-trials',
'$.performance',
'$.privacy',
'$.responsive_images',
'$.robots_txt',
'$.security',
'$.structured-data',
'$.third-parties',
'$.well-known',
'$.markup',
'$.media',
'$.origin-trials',
'$.performance',
'$.privacy',
'$.responsive_images',
'$.robots_txt',
'$.security',
'$.structured-data',
'$.third-parties',
'$.well-known',
'$.wpt_bodies'
)
) AS custom_metrics,
Expand All @@ -264,4 +264,4 @@ WHERE
date = "${iteration.month}" AND
client = "${iteration.client}" ${constants.dev_rank_filter};
`)
})
})
36 changes: 19 additions & 17 deletions definitions/output/all/reprocess_requests.js
Original file line number Diff line number Diff line change
Expand Up @@ -43,19 +43,19 @@ for (
let month = constants.current_month;
month >= '2024-09-01'; // 2022-07-01
month = constants.fn_past_month(month)) {
clients.forEach((client) => {
iterations.push({
month: month,
client: client
})
clients.forEach((client) => {
iterations.push({
month: month,
client: client
})
})
}

iterations.forEach((iteration, i) => {
operate(`all_requests_stable ${iteration.month} ${iteration.client}`).tags(
["all_requests_stable"]
).dependencies([
i===0 ? "all_requests_stable_pre" : `all_requests_stable ${iterations[i-1].month} ${iterations[i-1].client}`
i === 0 ? "all_requests_stable_pre" : `all_requests_stable ${iterations[i - 1].month} ${iterations[i - 1].client}`
]).queries(ctx => `
CREATE TEMP FUNCTION PRUNE_HEADERS(
jsonObject JSON
Expand Down Expand Up @@ -91,17 +91,19 @@ SELECT
'$.request.headers',
'$.response.headers'
) AS payload,
PRUNE_HEADERS(JSON_REMOVE(
SAFE.PARSE_JSON(requests.summary, wide_number_mode => 'round'),
'$.firstHtml',
'$.firstReq',
'$.reqOtherHeaders',
'$.requestid',
'$.respOtherHeaders',
'$.startedDateTime',
'$.url',
'$.urlShort'
)) as summary,
PRUNE_HEADERS(
JSON_REMOVE(
SAFE.PARSE_JSON(requests.summary, wide_number_mode => 'round'),
'$.firstHtml',
'$.firstReq',
'$.reqOtherHeaders',
'$.requestid',
'$.respOtherHeaders',
'$.startedDateTime',
'$.url',
'$.urlShort'
)
) as summary,
requests.request_headers,
requests.response_headers,
requests.response_body
Expand Down
18 changes: 9 additions & 9 deletions definitions/output/all/requests.js
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
publish("requests", {
type: "incremental",
protected: true,
schema: "all",
bigquery: {
partitionBy: "date",
clusterBy: ["client", "is_root_page", "is_main_document", "type"],
requirePartitionFilter: true
},
tags: ["crawl_results_all"],
type: "incremental",
protected: true,
schema: "all",
bigquery: {
partitionBy: "date",
clusterBy: ["client", "is_root_page", "is_main_document", "type"],
requirePartitionFilter: true
},
tags: ["crawl_results_all"],
}).preOps(ctx => `
DELETE FROM ${ctx.self()}
WHERE date = '${constants.current_month}';
Expand Down
10 changes: 5 additions & 5 deletions definitions/output/core_web_vitals/technologies.js
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ technologies AS (
${ctx.resolve("all", "pages")},
UNNEST(technologies) AS technology
WHERE
date = '${past_month}' ${constants.dev_rank5000_filter} AND
date = '${past_month}' ${constants.dev_rank_filter} AND
technology.technology IS NOT NULL AND
technology.technology != ''
UNION ALL
Expand All @@ -125,7 +125,7 @@ UNION ALL
FROM
${ctx.resolve("all", "pages")}
WHERE
date = '${past_month}' ${constants.dev_rank5000_filter}
date = '${past_month}' ${constants.dev_rank_filter}
),
categories AS (
Expand All @@ -137,7 +137,7 @@ categories AS (
UNNEST(technologies) AS technology,
UNNEST(technology.categories) AS category
WHERE
date = '${past_month}' ${constants.dev_rank5000_filter}
date = '${past_month}' ${constants.dev_rank_filter}
GROUP BY
app
UNION ALL
Expand All @@ -149,7 +149,7 @@ UNION ALL
UNNEST(technologies) AS technology,
UNNEST(technology.categories) AS category
WHERE
date = '${past_month}' ${constants.dev_rank5000_filter} AND
date = '${past_month}' ${constants.dev_rank_filter} AND
client = 'mobile'
),
Expand All @@ -165,7 +165,7 @@ summary_stats AS (
FROM
${ctx.resolve("all", "pages")}
WHERE
date = '${past_month}' ${constants.dev_rank5000_filter}
date = '${past_month}' ${constants.dev_rank_filter}
),
lab_data AS (
Expand Down
26 changes: 13 additions & 13 deletions definitions/sources/declares.js
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
const staging_tables = ["pages", "requests", "parsed_css"]
for (const table of staging_tables) {
declare({
schema: "crawl_staging",
name: table,
});
declare({
schema: "crawl_staging",
name: table,
});
}

const crux_tables = ["country_summary", "device_summary"];
const past_month = constants.fn_past_month(constants.current_month).substring(0, 7).replace("-", "");
for (const table of crux_tables) {
declare({
database: "chrome-ux-report",
schema: "materialized",
name: table,
});
declare({
database: "chrome-ux-report",
schema: "materialized",
name: table,
});

assert(`${table}_not_empty`).query(ctx => `
assert(`${table}_not_empty`).query(ctx => `
SELECT
'No data for the specified date' AS error_message
FROM ${ctx.ref("chrome-ux-report", "materialized", table)}
Expand All @@ -26,7 +26,7 @@ HAVING COUNT(1) = 0
}

declare({
database: "chrome-ux-report",
schema: "experimental",
name: "global",
database: "chrome-ux-report",
schema: "experimental",
name: "global",
});
Loading

0 comments on commit 4f09997

Please sign in to comment.