Skip to content

Commit

Permalink
Blink Features report (#11)
Browse files Browse the repository at this point in the history
* blink features report

* tested

* no reference

* formatted date

* Update README.md

Co-authored-by: Barry Pollard <[email protected]>

---------

Co-authored-by: Barry Pollard <[email protected]>
  • Loading branch information
max-ostapenko and tunetheweb authored Oct 8, 2024
1 parent 4f09997 commit 65f098c
Show file tree
Hide file tree
Showing 10 changed files with 224 additions and 47 deletions.
50 changes: 30 additions & 20 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,46 +6,56 @@ This repo handles the HTTP Archive data pipeline, which takes the results of the

The pipelines run in the Dataform service on Google Cloud Platform (GCP) and are kicked off automatically on crawl completion and other events. The code in the `main` branch is used on each triggered pipeline run.

### Crawl tables in `all` dataset
### Crawl results

Tag: `crawl_results_all`

- [x] httparchive.all.pages
- [x] httparchive.all.parsed_css
- [x] httparchive.all.requests
- httparchive.all.pages
- httparchive.all.parsed_css
- httparchive.all.requests

### Core Web Vitals Technology Report

Tag: `cwv_tech_report`

- [x] httparchive.core_web_vitals.technologies
- httparchive.core_web_vitals.technologies

### Legacy crawl tables (to be deprecated)
Consumers:

- [HTTP Archive Tech Report](https://httparchive.org/reports/techreport/landing)

### Blink Features Report

Tag: `blink_features_report`

- httparchive.blink_features.features
- httparchive.blink_features.usage

Consumers:

- chromestatus.com - [example](https://chromestatus.com/metrics/feature/timeline/popularity/2089)

### Legacy crawl results (to be deprecated)

Tag: `crawl_results_legacy`

- [x] httparchive.lighthouse.YYYY_MM_DD_client
- [x] httparchive.pages.YYYY_MM_DD_client
- [x] httparchive.requests.YYYY_MM_DD_client
- [x] httparchive.response_bodies.YYYY_MM_DD_client
- [x] httparchive.summary_pages.YYYY_MM_DD_client
- [x] httparchive.summary_requests.YYYY_MM_DD_client
- [x] httparchive.technologies.YYYY_MM_DD_client
- httparchive.lighthouse.YYYY_MM_DD_client
- httparchive.pages.YYYY_MM_DD_client
- httparchive.requests.YYYY_MM_DD_client
- httparchive.response_bodies.YYYY_MM_DD_client
- httparchive.summary_pages.YYYY_MM_DD_client
- httparchive.summary_requests.YYYY_MM_DD_client
- httparchive.technologies.YYYY_MM_DD_client

## Schedules

1. [crawl-complete](https://console.cloud.google.com/cloudpubsub/subscription/detail/dataformTrigger?authuser=7&project=httparchive) PubSub subscription

Tags:

- crawl_results_all
- crawl_results_legacy
Tags: ["crawl_results_all", "blink_features_report", "crawl_results_legacy"]

2. [bq-poller-cwv-tech-report](https://console.cloud.google.com/cloudscheduler/jobs/edit/us-east4/bq-poller-cwv-tech-report?authuser=7&project=httparchive) Scheduler

Tags:

- cwv_tech_report
Tags: ["cwv_tech_report"]

### Triggering workflows

Expand Down
37 changes: 29 additions & 8 deletions definitions/extra/test_env.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
const date = constants.fn_past_month(constants.current_month);
const date = constants.current_month;

var resources_list = [
{ datasetId: "all", tableId: "pages" },
Expand All @@ -11,17 +11,38 @@ resources_list.forEach(resource => {
operate(
`test_table ${resource.datasetId}_${resource.tableId}`,
{ hasOutput: true }
).queries(ctx => `
).queries(`
CREATE SCHEMA IF NOT EXISTS ${resource.datasetId}_dev;
DROP TABLE ${resource.datasetId}_dev.dev_${resource.tableId};
DROP TABLE IF EXISTS ${resource.datasetId}_dev.dev_${resource.tableId};
CREATE TABLE ${resource.datasetId}_dev.dev_${resource.tableId}
LIKE httparchive.${resource.datasetId}.${resource.tableId};
INSERT INTO ${resource.datasetId}_dev.dev_${resource.tableId}
CREATE TABLE IF NOT EXISTS ${resource.datasetId}_dev.dev_${resource.tableId} AS
SELECT *
FROM httparchive.${resource.datasetId}.${resource.tableId} ${constants.dev_TABLESAMPLE}
FROM \`${resource.datasetId}.${resource.tableId}\` ${constants.dev_TABLESAMPLE}
WHERE date = '${date}'
`);
})

// Builds the dev sample of `blink_features.usage` for the current month.
//
// `usage.yyyymmdd` is written as a dash-less string
// (REPLACE(CAST(date AS STRING), '-', '') in the usage model), so the dashed
// `date` value has to be normalised the same way before comparing; otherwise
// the filter matches no rows and the sample is always empty.
//
// The table is dropped first, like the other dev samples in this file, so a
// re-run rebuilds the sample instead of silently keeping a stale or empty table
// behind CREATE TABLE IF NOT EXISTS.
operate("test_table blink_features_dev_dev_usage", {
  hasOutput: true,
}).queries(`
CREATE SCHEMA IF NOT EXISTS blink_features_dev;
DROP TABLE IF EXISTS blink_features_dev.dev_usage;
CREATE TABLE IF NOT EXISTS blink_features_dev.dev_usage AS
SELECT *
FROM blink_features.usage ${constants.dev_TABLESAMPLE}
WHERE yyyymmdd = REPLACE('${date}', '-', '');
`)

// SQL that rebuilds the dev sample of `blink_features.features`: ensure the
// dev schema exists, drop any previous sample, then copy the rows whose
// `yyyymmdd` equals `date`, sampled via `constants.dev_TABLESAMPLE`.
const devFeaturesSampleSql = `
CREATE SCHEMA IF NOT EXISTS blink_features_dev;
DROP TABLE IF EXISTS blink_features_dev.dev_features;
CREATE TABLE IF NOT EXISTS blink_features_dev.dev_features AS
SELECT *
FROM blink_features.features ${constants.dev_TABLESAMPLE}
WHERE yyyymmdd = DATE '${date}';
`;

operate("test_table blink_features_dev_dev_features", { hasOutput: true })
  .queries(devFeaturesSampleSql);
26 changes: 19 additions & 7 deletions definitions/output/all/pages.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,21 +13,33 @@ DELETE FROM ${ctx.self()}
WHERE date = '${constants.current_month}';
`).query(ctx => `
SELECT *
FROM ${ctx.ref("crawl_staging", "pages")} ${constants.dev_TABLESAMPLE}
WHERE date = '${constants.current_month}' AND client = 'desktop' AND is_root_page = TRUE
FROM ${ctx.ref("crawl_staging", "pages")}
WHERE date = '${constants.current_month}'
AND client = 'desktop'
AND is_root_page = TRUE
${constants.dev_rank_filter}
`).postOps(ctx => `
INSERT INTO ${ctx.self()}
SELECT *
FROM ${ctx.ref("crawl_staging", "pages")} ${constants.dev_TABLESAMPLE}
WHERE date = '${constants.current_month}' AND client = 'desktop' AND is_root_page = FALSE;
FROM ${ctx.ref("crawl_staging", "pages")}
WHERE date = '${constants.current_month}'
AND client = 'desktop'
AND is_root_page = FALSE
${constants.dev_rank_filter};
INSERT INTO ${ctx.self()}
SELECT *
FROM ${ctx.ref("crawl_staging", "pages")} ${constants.dev_TABLESAMPLE}
WHERE date = '${constants.current_month}' AND client = 'mobile' AND is_root_page = TRUE;
WHERE date = '${constants.current_month}'
AND client = 'mobile'
AND is_root_page = TRUE
${constants.dev_rank_filter};
INSERT INTO ${ctx.self()}
SELECT *
FROM ${ctx.ref("crawl_staging", "pages")} ${constants.dev_TABLESAMPLE}
WHERE date = '${constants.current_month}' AND client = 'mobile' AND is_root_page = FALSE
FROM ${ctx.ref("crawl_staging", "pages")}
WHERE date = '${constants.current_month}'
AND client = 'mobile'
AND is_root_page = FALSE
${constants.dev_rank_filter};
`)
12 changes: 8 additions & 4 deletions definitions/output/all/parsed_css.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,15 @@ DELETE FROM ${ctx.self()}
WHERE date = '${constants.current_month}';
`).query(ctx => `
SELECT *
FROM ${ctx.ref("crawl_staging", "parsed_css")} ${constants.dev_TABLESAMPLE}
WHERE date = '${constants.current_month}' AND client = 'desktop'
FROM ${ctx.ref("crawl_staging", "parsed_css")}
WHERE date = '${constants.current_month}'
AND client = 'desktop'
${constants.dev_rank_filter}
`).postOps(ctx => `
INSERT INTO ${ctx.self()}
SELECT *
FROM ${ctx.ref("crawl_staging", "parsed_css")} ${constants.dev_TABLESAMPLE}
WHERE date = '${constants.current_month}' AND client = 'mobile'
FROM ${ctx.ref("crawl_staging", "parsed_css")}
WHERE date = '${constants.current_month}'
AND client = 'mobile'
${constants.dev_rank_filter};
`)
69 changes: 69 additions & 0 deletions definitions/output/blink_features/features.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@

// Publishes the incremental `blink_features.features` table: one row per
// (root page, Blink feature) for the current crawl month, partitioned by
// `yyyymmdd` and clustered by `client`, `rank`.
//
// preOps:
//   1. Deletes any rows already present for the current month.
//   2. Declares the `features(payload)` JS UDF, which turns the
//      `_blinkFeatureFirstUsed` section of a page payload into
//      ARRAY<STRUCT<id, name, type>>.
//      NOTE(review): the query below unnests the `features` column of
//      `all.pages` and does not call this UDF - confirm whether the function
//      is still needed.
//
// The UDF's ID pattern uses `[0-9]` instead of a backslash-d class on purpose:
// the UDF source sits inside a JS template literal, where an unknown
// backslash escape collapses to the bare letter, so the pattern would have
// matched the letter "d" rather than digits. `[0-9]` needs no escaping at any
// of the quoting layers.
publish("features", {
  schema: "blink_features",
  type: "incremental",
  protected: true,
  bigquery: {
    partitionBy: "yyyymmdd",
    clusterBy: ["client", "rank"]
  },
  tags: ["blink_features_report"]
}).preOps(ctx => `
DELETE FROM ${ctx.self()}
WHERE yyyymmdd = DATE '${constants.current_month}';
CREATE TEMP FUNCTION features(payload STRING)
RETURNS ARRAY<STRUCT<id STRING, name STRING, type STRING>> LANGUAGE js AS
"""
function getFeatureNames(featureMap, featureType) {
try {
return Object.entries(featureMap).map(([key, value]) => {
// After Feb 2020 keys are feature IDs.
if (value.name) {
return {'name': value.name, 'type': featureType, 'id': key};
}
// Prior to Feb 2020 keys fell back to IDs if the name was unknown.
if (idPattern.test(key)) {
return {'name': '', 'type': featureType, 'id': key.match(idPattern)[1]};
}
// Prior to Feb 2020 keys were names by default.
return {'name': key, 'type': featureType, 'id': ''};
});
} catch (e) {
return [];
}
}
var $ = JSON.parse(payload);
if (!$._blinkFeatureFirstUsed) return [];
var idPattern = new RegExp('^Feature_([0-9]+)$');
return getFeatureNames($._blinkFeatureFirstUsed.Features, 'default')
.concat(getFeatureNames($._blinkFeatureFirstUsed.CSSFeatures, 'css'))
.concat(getFeatureNames($._blinkFeatureFirstUsed.AnimatedCSSFeatures, 'animated-css'));
""";
`).query(ctx => `
SELECT
date AS yyyymmdd,
client,
url,
feature.feature AS feature,
feature.type,
feature.id,
rank
FROM (
SELECT
date,
client,
page AS url,
payload,
rank,
feature
FROM ${ctx.ref("all", "pages")},
UNNEST(features) AS feature
WHERE
date = '${constants.current_month}' AND
is_root_page = TRUE
${constants.dev_rank_filter}
)
`)
55 changes: 55 additions & 0 deletions definitions/output/blink_features/usage.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
// Table options for `blink_features.usage`.
const usageTableConfig = {
  schema: "blink_features",
  type: "incremental",
  protected: true,
  tags: ["blink_features_report"],
};

/**
 * preOps SQL: removes the rows already stored for the current month.
 * `yyyymmdd` in this table is a dash-less string, hence the REPLACE.
 * Presumably this keeps a re-run from duplicating the month - confirm.
 */
function deleteCurrentMonthSql(ctx) {
  return `
DELETE FROM ${ctx.self()}
WHERE yyyymmdd = REPLACE('${constants.current_month}', '-', '');
`;
}

/**
 * Main query: per (date, client, id, feature, type) counts the distinct URLs
 * using the feature and keeps up to 100 sample URLs (ordered by rank, url),
 * then joins the per-(date, client) count of distinct root pages to derive
 * `pct_urls`.
 */
function usageSelectSql(ctx) {
  return `
SELECT
REPLACE(CAST(date AS STRING), '-', '') AS yyyymmdd,
client,
id,
feature,
type,
num_urls,
total_urls,
num_urls / total_urls AS pct_urls,
sample_urls
FROM (
SELECT
yyyymmdd AS date,
client,
id,
feature,
type,
COUNT(DISTINCT url) AS num_urls,
ARRAY_AGG(url ORDER BY rank, url LIMIT 100) AS sample_urls
FROM ${ctx.ref("blink_features", "features")}
WHERE
yyyymmdd = '${constants.current_month}'
${constants.dev_rank_filter}
GROUP BY
yyyymmdd,
client,
id,
feature,
type
)
JOIN (
SELECT
date,
client,
COUNT(DISTINCT page) AS total_urls
FROM ${ctx.ref("all", "pages")}
WHERE
date = '${constants.current_month}' AND
is_root_page = TRUE
${constants.dev_rank_filter}
GROUP BY
date,
client
)
USING (date, client)
`;
}

publish("usage", usageTableConfig)
  .preOps(deleteCurrentMonthSql)
  .query(usageSelectSql);
15 changes: 10 additions & 5 deletions definitions/output/core_web_vitals/technologies.js
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,8 @@ technologies AS (
${ctx.resolve("all", "pages")},
UNNEST(technologies) AS technology
WHERE
date = '${past_month}' ${constants.dev_rank_filter} AND
date = '${past_month}'
${constants.dev_rank_filter} AND
technology.technology IS NOT NULL AND
technology.technology != ''
UNION ALL
Expand All @@ -125,7 +126,8 @@ UNION ALL
FROM
${ctx.resolve("all", "pages")}
WHERE
date = '${past_month}' ${constants.dev_rank_filter}
date = '${past_month}'
${constants.dev_rank_filter}
),
categories AS (
Expand All @@ -137,7 +139,8 @@ categories AS (
UNNEST(technologies) AS technology,
UNNEST(technology.categories) AS category
WHERE
date = '${past_month}' ${constants.dev_rank_filter}
date = '${past_month}'
${constants.dev_rank_filter}
GROUP BY
app
UNION ALL
Expand All @@ -149,8 +152,9 @@ UNION ALL
UNNEST(technologies) AS technology,
UNNEST(technology.categories) AS category
WHERE
date = '${past_month}' ${constants.dev_rank_filter} AND
date = '${past_month}' AND
client = 'mobile'
${constants.dev_rank_filter}
),
summary_stats AS (
Expand All @@ -165,7 +169,8 @@ summary_stats AS (
FROM
${ctx.resolve("all", "pages")}
WHERE
date = '${past_month}' ${constants.dev_rank_filter}
date = '${past_month}'
${constants.dev_rank_filter}
),
lab_data AS (
Expand Down
3 changes: 2 additions & 1 deletion definitions/output/technologies.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ UNNEST (tech.categories) AS category,
UNNEST (tech.info) AS info
WHERE date = '${constants.current_month}' AND
client = '${client}' AND
is_root_page AND
is_root_page
${constants.dev_rank_filter} AND
tech.technology IS NOT NULL
`);
})
2 changes: 1 addition & 1 deletion includes/constants.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ const
dev_rank_filter
] = dataform.projectConfig.vars.env_name == 'dev' ? [
"TABLESAMPLE SYSTEM (0.001 PERCENT)",
"AND rank = 5000",
"AND rank <= 10000",
true
] : ["", ""];

Expand Down
2 changes: 1 addition & 1 deletion src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ FROM (
action: "runDataformRepo",
actionArgs: {
repoName: "crawl-data",
tags: ["crawl_results_all", "crawl_results_legacy"]
tags: ["crawl_results_all", "blink_features_report", "crawl_results_legacy"]
}
}
};
Expand Down

0 comments on commit 65f098c

Please sign in to comment.