-
Notifications
You must be signed in to change notification settings - Fork 0
Reorganize the BigQuery datasets to be more efficient #15
Comments
Brainstorming new tables/schemas.
|
-- Generating May 2022 data for:
-- GET_CUSTOM_METRICS extracts every custom metric listed in $._custom from
-- the HAR payload (each metric is stored under an underscore-prefixed key)
-- and returns them as a single JSON object string. Metric values that are
-- themselves JSON-encoded strings are decoded so they nest as objects;
-- values that fail to parse are kept as the raw string.
-- NOTE(review): comments cannot be added inside the JS body without
-- altering the SQL string literal, so all documentation lives here.
CREATE TEMP FUNCTION GET_CUSTOM_METRICS(payload STRING) RETURNS STRING LANGUAGE js AS '''
const $ = JSON.parse(payload);
return JSON.stringify(Object.fromEntries($._custom.map(name => {
let value = $[`_${name}`];
if (typeof value == 'string') {
try {
value = JSON.parse(value);
} catch (e) {
// The value is not a JSON string.
}
}
return [name, value];
})));
''';
-- GET_FEATURES extracts Blink feature-usage entries from the payload's
-- $._blinkFeatureFirstUsed object, covering its Features, CSSFeatures and
-- AnimatedCSSFeatures maps. The key format changed in Feb 2020 (names
-- before, feature IDs after), so each entry carries whichever of the name
-- or ID is known (the other is ''), plus a type of 'default', 'css' or
-- 'animated-css'. Returns an empty array when the payload has no feature
-- data, and skips a map entirely if it is missing or malformed.
-- NOTE(review): comments cannot be added inside the JS body without
-- altering the SQL string literal, so all documentation lives here.
CREATE TEMP FUNCTION GET_FEATURES(payload STRING)
RETURNS ARRAY<STRUCT<feature STRING, id STRING, type STRING>> LANGUAGE js AS
'''
function getFeatureNames(featureMap, featureType) {
try {
return Object.entries(featureMap).map(([key, value]) => {
// After Feb 2020 keys are feature IDs.
if (value.name) {
return {'feature': value.name, 'type': featureType, 'id': key};
}
// Prior to Feb 2020 keys fell back to IDs if the name was unknown.
if (idPattern.test(key)) {
return {'feature': '', 'type': featureType, 'id': key.match(idPattern)[1]};
}
// Prior to Feb 2020 keys were names by default.
return {'feature': key, 'type': featureType, 'id': ''};
});
} catch (e) {
return [];
}
}
var $ = JSON.parse(payload);
if (!$._blinkFeatureFirstUsed) return [];
var idPattern = new RegExp('^Feature_(\\d+)$');
return getFeatureNames($._blinkFeatureFirstUsed.Features, 'default')
.concat(getFeatureNames($._blinkFeatureFirstUsed.CSSFeatures, 'css'))
.concat(getFeatureNames($._blinkFeatureFirstUsed.AnimatedCSSFeatures, 'animated-css'));
''';
-- Generating May 2022 data for the combined pages table.
WITH pages AS (
  -- One row per crawled page, with metadata extracted from the HAR payload.
  SELECT
    _TABLE_SUFFIX AS client,
    url AS page,
    -- SAFE_CAST keeps rank numeric (NULL on malformed values), consistent
    -- with the April 2022 query feeding the same table.
    SAFE_CAST(JSON_VALUE(payload, '$._metadata.rank') AS INT64) AS rank,
    -- Pages with no crawl_depth metadata predate secondary-page crawling;
    -- default them to root pages instead of NULL.
    COALESCE(JSON_VALUE(payload, '$._metadata.crawl_depth') = '0', TRUE) AS is_root_page,
    -- Secondary pages record their root page URL; root pages fall back to
    -- their own URL.
    COALESCE(JSON_VALUE(payload, '$._metadata.root_page_url'), url) AS root_page,
    JSON_VALUE(payload, '$._testID') AS wptid,
    GET_CUSTOM_METRICS(payload) AS custom_metrics,
    JSON_QUERY(payload, '$._metadata') AS metadata,
    payload
  FROM
    `httparchive.pages.2022_05_01_*`
  -- TODO: Backfill when the summary_pages table is ready.
  /* ), summary_pages AS (
  SELECT
    _TABLE_SUFFIX AS client,
    url AS page,
    TO_JSON_STRING(summary_pages) AS summary
  FROM
    `httparchive.summary_pages.2022_05_01_*` AS summary_pages */
), loose_technologies AS (
  -- One row per (page, technology), aggregating its categories and info.
  SELECT
    _TABLE_SUFFIX AS client,
    url AS page,
    STRUCT(
      app AS technology,
      ARRAY_AGG(DISTINCT category ORDER BY category) AS categories,
      ARRAY_AGG(info) AS info
    ) AS technology
  FROM
    `httparchive.technologies.2022_05_01_*`
  GROUP BY
    client,
    page,
    app
), techs AS (
  -- Collapse to one row per page with the array of detected technologies.
  SELECT
    client,
    page,
    ARRAY_AGG(technology) AS technologies
  FROM
    loose_technologies
  GROUP BY
    client,
    page
), lh AS (
  -- Lighthouse report JSON per page.
  SELECT
    _TABLE_SUFFIX AS client,
    url AS page,
    report AS lighthouse
  FROM
    `httparchive.lighthouse.2022_05_01_*`
)
SELECT
  DATE('2022-05-01') AS date,
  client,
  page,
  is_root_page,
  root_page,
  rank,
  wptid,
  payload,
  -- TODO: Update when the summary pipeline completes successfully.
  '' AS summary,
  custom_metrics,
  lighthouse,
  GET_FEATURES(payload) AS features,
  technologies,
  metadata
FROM
  pages
LEFT JOIN
  techs
USING
  (client, page)
LEFT JOIN
  lh
USING
  (client, page)
-- Pages data for April 2022. Changed:
-- GET_CUSTOM_METRICS extracts every custom metric listed in $._custom from
-- the HAR payload (each metric is stored under an underscore-prefixed key)
-- and returns them as a single JSON object string. Metric values that are
-- themselves JSON-encoded strings are decoded so they nest as objects;
-- values that fail to parse are kept as the raw string.
-- NOTE(review): comments cannot be added inside the JS body without
-- altering the SQL string literal, so all documentation lives here.
CREATE TEMP FUNCTION GET_CUSTOM_METRICS(payload STRING) RETURNS STRING LANGUAGE js AS '''
const $ = JSON.parse(payload);
return JSON.stringify(Object.fromEntries($._custom.map(name => {
let value = $[`_${name}`];
if (typeof value == 'string') {
try {
value = JSON.parse(value);
} catch (e) {
// The value is not a JSON string.
}
}
return [name, value];
})));
''';
-- GET_FEATURES extracts Blink feature-usage entries from the payload's
-- $._blinkFeatureFirstUsed object, covering its Features, CSSFeatures and
-- AnimatedCSSFeatures maps. The key format changed in Feb 2020 (names
-- before, feature IDs after), so each entry carries whichever of the name
-- or ID is known (the other is ''), plus a type of 'default', 'css' or
-- 'animated-css'. Returns an empty array when the payload has no feature
-- data, and skips a map entirely if it is missing or malformed.
-- NOTE(review): comments cannot be added inside the JS body without
-- altering the SQL string literal, so all documentation lives here.
CREATE TEMP FUNCTION GET_FEATURES(payload STRING)
RETURNS ARRAY<STRUCT<feature STRING, id STRING, type STRING>> LANGUAGE js AS
'''
function getFeatureNames(featureMap, featureType) {
try {
return Object.entries(featureMap).map(([key, value]) => {
// After Feb 2020 keys are feature IDs.
if (value.name) {
return {'feature': value.name, 'type': featureType, 'id': key};
}
// Prior to Feb 2020 keys fell back to IDs if the name was unknown.
if (idPattern.test(key)) {
return {'feature': '', 'type': featureType, 'id': key.match(idPattern)[1]};
}
// Prior to Feb 2020 keys were names by default.
return {'feature': key, 'type': featureType, 'id': ''};
});
} catch (e) {
return [];
}
}
var $ = JSON.parse(payload);
if (!$._blinkFeatureFirstUsed) return [];
var idPattern = new RegExp('^Feature_(\\d+)$');
return getFeatureNames($._blinkFeatureFirstUsed.Features, 'default')
.concat(getFeatureNames($._blinkFeatureFirstUsed.CSSFeatures, 'css'))
.concat(getFeatureNames($._blinkFeatureFirstUsed.AnimatedCSSFeatures, 'animated-css'));
''';
WITH pages AS (
  -- One row per crawled page, with metadata extracted from the HAR payload.
  SELECT
    _TABLE_SUFFIX AS client,
    url AS page,
    -- SAFE_CAST yields NULL rather than erroring on malformed ranks.
    SAFE_CAST(JSON_VALUE(payload, '$._metadata.rank') AS INT64) AS rank,
    -- Pages without crawl_depth metadata default to root pages.
    COALESCE(JSON_VALUE(payload, '$._metadata.crawl_depth') = '0', TRUE) AS is_root_page,
    -- Secondary pages point at their root; root pages fall back to themselves.
    COALESCE(JSON_VALUE(payload, '$._metadata.root_page_url'), url) AS root_page,
    JSON_VALUE(payload, '$._testID') AS wptid,
    GET_CUSTOM_METRICS(payload) AS custom_metrics,
    JSON_QUERY(payload, '$._metadata') AS metadata,
    payload
  FROM
    `httparchive.pages.2022_04_01_*`
), summary_pages AS (
  -- Legacy per-page summary row, serialized whole as a JSON string.
  SELECT
    _TABLE_SUFFIX AS client,
    url AS page,
    rank,
    TO_JSON_STRING(summary_pages) AS summary
  FROM
    `httparchive.summary_pages.2022_04_01_*` AS summary_pages
), loose_technologies AS (
  -- One row per (page, technology), aggregating its categories and info.
  SELECT
    _TABLE_SUFFIX AS client,
    url AS page,
    STRUCT(
      app AS technology,
      ARRAY_AGG(DISTINCT category ORDER BY category) AS categories,
      ARRAY_AGG(info) AS info
    ) AS technology
  FROM
    `httparchive.technologies.2022_04_01_*`
  GROUP BY
    client,
    page,
    app
), techs AS (
  -- Collapse to one row per page with the array of detected technologies.
  SELECT
    client,
    page,
    ARRAY_AGG(technology) AS technologies
  FROM
    loose_technologies
  GROUP BY
    client,
    page
), lh AS (
  -- Lighthouse report JSON per page.
  SELECT
    _TABLE_SUFFIX AS client,
    url AS page,
    report AS lighthouse
  FROM
    `httparchive.lighthouse.2022_04_01_*`
)
SELECT
  DATE('2022-04-01') AS date,
  client,
  page,
  is_root_page,
  root_page,
  -- Prefer the payload's rank; fall back to the legacy summary rank.
  COALESCE(pages.rank, summary_pages.rank) AS rank,
  wptid,
  payload,
  summary,
  custom_metrics,
  lighthouse,
  GET_FEATURES(payload) AS features,
  technologies,
  metadata
FROM
  pages
LEFT JOIN
  summary_pages
USING
  (client, page)
LEFT JOIN
  techs
USING
  (client, page)
LEFT JOIN
  lh
USING
  (client, page)
Ran into OOM issues with generating the Since we'll eventually need to generate tables in the new I've successfully tested it on a single HAR and now attempting a full-scale test on the entire 2022_05_12 crawl, both desktop and mobile concurrently. I expect it to take 6-8 hours (running for ~2 so far). |
The
As well as implicitly by the We're only allowed 4 cluster fields meaning we cannot add any more. However I'm wondering if WDYT? |
|
If I query for SELECT
JSON_VALUE(lighthouse, '$.finalUrl') AS final_url,
FROM `httparchive.all.pages`
WHERE
date = "2022-10-01"
AND client = 'mobile'
AND is_root_page The BQ UI warns me I'm going to be querying 13.65 TB, vs 4.63 TB for the (currently) equivalent SELECT
JSON_VALUE(report, '$.finalUrl') AS final_url,
FROM `httparchive.lighthouse.2022_10_01_mobile` Is there some partitioning/clustering trick I should be doing to bring the |
The old BQ estimation behavior was to not take clustered fields into consideration. I'm also seeing the new behavior showing the 4.67 TB estimate, so maybe they've only partially rolled it out. |
via the CLI:
also gives me "this query will process upper bound of 15012318480052 bytes of data" (aka 13.65 TiB). Can either of you try it with a secondary cloud project to see if you get the same numbers? If everyone eventually gets the lower number, obviously this isn't an actual issue. |
I see 13.65 TB when using a random account. Hmm...
If you run the query it should still only process 4.67 TiB |
Well, here is the current plan for this reorganization: https://docs.google.com/document/d/1kNCpVgvC0W77HJzXiNvlchuGjCCY6uymIfv0nH10tF0/edit |
Closing as most of the steps were implemented. We have planned legacy data access deprecation (and cleanup) in March 2025. |
Similar to the
almanac
dataset, we want the monthly results to be queryable in partitioned and clustered tables. We'll need a deprecation plan to avoid suddenly breaking existing queries:
All
pipeline improvements #109generate_reports
)The text was updated successfully, but these errors were encountered: