From 8a897fb9e9dbeae6a6f0002b14135963239da97b Mon Sep 17 00:00:00 2001
From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com>
Date: Sat, 21 Dec 2024 17:33:19 +0100
Subject: [PATCH] direct and partitioned

---
Notes (ignored by git am; not part of the commit message):
  * Removes the intermediate blink_features.features definition and has
    blink_features.usage read crawl.pages directly through a shared
    `pages` CTE that is used for both num_urls and total_urls.
  * usage is now partitioned by `date`, clustered by client/feature, and
    requires a partition filter; the preOps DELETE filters on `date`
    instead of the former string-typed `yyyymmdd`.
  * NOTE(review): in the aggregation subquery, `GROUP BY ... feature`
    can name either the UNNEST alias or the select-list column produced
    by `feature.feature` - confirm the engine resolves it as intended.

 definitions/output/blink_features/features.js | 37 --------------
 definitions/output/blink_features/usage.js    | 49 ++++++++++++-------
 2 files changed, 31 insertions(+), 55 deletions(-)
 delete mode 100644 definitions/output/blink_features/features.js

diff --git a/definitions/output/blink_features/features.js b/definitions/output/blink_features/features.js
deleted file mode 100644
index d609fbd..0000000
--- a/definitions/output/blink_features/features.js
+++ /dev/null
@@ -1,37 +0,0 @@
-publish('features', {
-  schema: 'blink_features',
-  type: 'incremental',
-  protected: true,
-  bigquery: {
-    partitionBy: 'yyyymmdd',
-    clusterBy: ['client', 'rank']
-  },
-  tags: ['crawl_complete']
-}).preOps(ctx => `
-DELETE FROM ${ctx.self()}
-WHERE yyyymmdd = DATE '${constants.currentMonth}';
-`).query(ctx => `
-SELECT
-  date AS yyyymmdd,
-  client,
-  url,
-  feature.feature AS feature,
-  feature.type,
-  feature.id,
-  rank
-FROM (
-  SELECT
-    date,
-    client,
-    page AS url,
-    payload,
-    rank,
-    feature
-  FROM ${ctx.ref('crawl', 'pages')},
-    UNNEST(features) AS feature
-  WHERE
-    date = '${constants.currentMonth}' AND
-    is_root_page = TRUE
-    ${constants.devRankFilter}
-)
-`)
diff --git a/definitions/output/blink_features/usage.js b/definitions/output/blink_features/usage.js
index 2e483ab..6fea48c 100644
--- a/definitions/output/blink_features/usage.js
+++ b/definitions/output/blink_features/usage.js
@@ -2,13 +2,32 @@ publish('usage', {
   schema: 'blink_features',
   type: 'incremental',
   protected: true,
+  bigquery: {
+    partitionBy: 'date',
+    clusterBy: ['client', 'feature'],
+    requirePartitionFilter: true
+  },
   tags: ['crawl_complete']
 }).preOps(ctx => `
 DELETE FROM ${ctx.self()}
-WHERE yyyymmdd = REPLACE('${constants.currentMonth}', '-', '');
+WHERE date = '${constants.currentMonth}';
 `).query(ctx => `
+WITH pages AS (
 SELECT
-  REPLACE(CAST(date AS STRING), '-', '') AS yyyymmdd,
+  date,
+  client,
+  page,
+  rank,
+  features
+FROM ${ctx.ref('crawl', 'pages')}
+WHERE
+  date = '${constants.currentMonth}' AND
+  is_root_page = TRUE
+  ${constants.devRankFilter}
+)
+
+SELECT
+  date,
   client,
   id,
   feature,
@@ -19,19 +38,17 @@ SELECT
   sample_urls
 FROM (
   SELECT
-    yyyymmdd AS date,
+    date,
     client,
-    id,
-    feature,
-    type,
-    COUNT(DISTINCT url) AS num_urls,
-    ARRAY_AGG(url ORDER BY rank, url LIMIT 100) AS sample_urls
-  FROM ${ctx.ref('blink_features', 'features')}
-  WHERE
-    yyyymmdd = '${constants.currentMonth}'
-    ${constants.devRankFilter}
+    feature.id,
+    feature.feature,
+    feature.type,
+    COUNT(DISTINCT page) AS num_urls,
+    ARRAY_AGG(page ORDER BY rank, page LIMIT 100) AS sample_urls
+  FROM pages,
+    UNNEST(features) AS feature
   GROUP BY
-    yyyymmdd,
+    date,
     client,
     id,
     feature,
@@ -42,11 +59,7 @@ JOIN (
     date,
     client,
     COUNT(DISTINCT page) AS total_urls
-  FROM ${ctx.ref('crawl', 'pages')}
-  WHERE
-    date = '${constants.currentMonth}' AND
-    is_root_page = TRUE
-    ${constants.devRankFilter}
+  FROM pages
   GROUP BY
     date,
     client