diff --git a/README.md b/README.md index 21eb733..e3a8fed 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,10 @@ # HTTP Archive BigQuery pipeline with Dataform -## Datasets +This repo handles the HTTP Archive data pipeline, which takes the results of the monthly HTTP Archive run and saves them to the `httparchive` dataset in BigQuery. + +## Pipelines + +The pipelines run in the Dataform service on Google Cloud Platform (GCP) and are kicked off automatically on crawl completion and other events. The code in the `main` branch is used on each triggered pipeline run. ### Crawl results @@ -70,3 +74,11 @@ Tag: `crawl_results_legacy` 1. In workflow settings vars set `dev_name: dev` to process sampled data in dev workspace. 2. Change `today` variable to a date in the past. May be helpful for testing pipelines based on `chrome-ux-report` data. 3. `definitions/extra/test_env.sqlx` script helps to setup the tables required to run pipelines when in dev workspace. It's disabled by default. + +### Error Monitoring + +Issues within the pipeline are tracked using the following alerts: +1. the event trigger processing fails - [Dataform Trigger Function Error](https://console.cloud.google.com/monitoring/alerting/policies/3950167380893746326?project=httparchive) +2. a job in the workflow fails - [Dataform Workflow Invocation Failed](https://console.cloud.google.com/monitoring/alerting/policies/7137542315653007241?project=httparchive) + +Error notifications are sent to the [#10x-infra](https://httparchive.slack.com/archives/C030V4WAVL3) Slack channel. 
diff --git a/definitions/extra/test_env.js b/definitions/extra/test_env.js index 05bbd20..e809517 100644 --- a/definitions/extra/test_env.js +++ b/definitions/extra/test_env.js @@ -1,45 +1,26 @@ -const past_month = constants.fn_past_month(constants.current_month); +const date = constants.fn_past_month(constants.current_month); -operate("test_env", { - hasOutput: true, - disabled: true // MUST be disabled in main branch -}).queries(ctx => ` -CREATE SCHEMA IF NOT EXISTS all_dev; +const resources_list = [ + {datasetId: "all", tableId: "pages"}, + {datasetId: "all", tableId: "requests"}, + //{datasetId: "all", tableId: "parsed_css"}, + //{datasetId: "core_web_vitals", tableId: "technologies"}, +]; -CREATE TABLE IF NOT EXISTS ${ctx.resolve("all", "pages")} AS -SELECT * -FROM httparchive.all.pages -WHERE - date = '${constants.current_month}' - ${constants.dev_rank5000_filter}; - -CREATE TABLE IF NOT EXISTS ${ctx.resolve("all", "requests")} AS -SELECT * -FROM httparchive.all.requests ${constants.dev_TABLESAMPLE} -WHERE date = '${constants.current_month}'; - -CREATE TABLE IF NOT EXISTS ${ctx.resolve("all", "parsed_css")} AS -SELECT * -FROM httparchive.all.parsed_css -WHERE date = '${constants.current_month}' - ${constants.dev_rank5000_filter}; +resources_list.forEach(resource => { + operate(`test_table ${resource.datasetId}_${resource.tableId}`, { + hasOutput: true, disabled: !constants.is_dev_env // only runs in the dev environment; unsampled copy otherwise + }).queries(ctx => ` +CREATE SCHEMA IF NOT EXISTS ${resource.datasetId}_dev; -CREATE SCHEMA IF NOT EXISTS core_web_vitals_dev; +DROP TABLE IF EXISTS ${resource.datasetId}_dev.dev_${resource.tableId}; -CREATE TABLE IF NOT EXISTS ${ctx.resolve("core_web_vitals", "technologies")} AS -SELECT * -FROM httparchive.core_web_vitals.technologies ${constants.dev_TABLESAMPLE} -WHERE date = '${past_month}'; - -CREATE SCHEMA IF NOT EXISTS blink_features_dev; - -CREATE TABLE IF NOT EXISTS ${ctx.resolve("blink_features", "usage")} AS -SELECT * -FROM httparchive.blink_features.usage ${constants.dev_TABLESAMPLE} -WHERE yyyymmdd = 
'${past_month}'; +CREATE TABLE ${resource.datasetId}_dev.dev_${resource.tableId} +LIKE httparchive.${resource.datasetId}.${resource.tableId}; -CREATE TABLE IF NOT EXISTS ${ctx.resolve("blink_features", "features")} AS +INSERT INTO ${resource.datasetId}_dev.dev_${resource.tableId} SELECT * -FROM httparchive.blink_features.features ${constants.dev_TABLESAMPLE} -WHERE yyyymmdd = DATE '${past_month}'; -`) +FROM httparchive.${resource.datasetId}.${resource.tableId} ${constants.dev_TABLESAMPLE} +WHERE date = '${date}' + `); +}); diff --git a/definitions/output/all/reprocess_pages.js b/definitions/output/all/reprocess_pages.js new file mode 100644 index 0000000..5f5c4f2 --- /dev/null +++ b/definitions/output/all/reprocess_pages.js @@ -0,0 +1,267 @@ +operate(`all_pages_stable_pre`).tags( + ["all_pages_stable"] +).queries(` +CREATE SCHEMA IF NOT EXISTS all_dev; + +DROP TABLE IF EXISTS \`all_dev.pages_stable\`; + +CREATE TABLE \`all_dev.pages_stable\` +( + date DATE NOT NULL OPTIONS(description="YYYY-MM-DD format of the HTTP Archive monthly crawl"), + client STRING NOT NULL OPTIONS(description="Test environment: desktop or mobile"), + page STRING NOT NULL OPTIONS(description="The URL of the page being tested"), + is_root_page BOOL NOT NULL OPTIONS(description="Whether the page is the root of the origin"), + root_page STRING NOT NULL OPTIONS(description="The URL of the root page being tested, the origin followed by /"), + rank INT64 OPTIONS(description="Site popularity rank, from CrUX"), + wptid STRING OPTIONS(description="ID of the WebPageTest results"), + payload JSON OPTIONS(description="JSON-encoded WebPageTest results for the page"), + summary JSON OPTIONS(description="JSON-encoded summarization of the page-level data"), + custom_metrics STRUCT< + a11y JSON, + cms JSON, + cookies JSON, + css_variables JSON, + ecommerce JSON, + element_count JSON, + javascript JSON, + markup JSON, + media JSON, + origin_trials JSON, + performance JSON, + privacy JSON, + responsive_images 
JSON, + robots_txt JSON, + security JSON, + structured_data JSON, + third_parties JSON, + well_known JSON, + wpt_bodies JSON, + other JSON + > OPTIONS(description="Custom metrics from WebPageTest"), + lighthouse JSON OPTIONS(description="JSON-encoded Lighthouse report"), + features ARRAY> OPTIONS(description="Blink features detected at runtime (see https://chromestatus.com/features)"), + technologies ARRAY OPTIONS(description="List of categories to which this technology belongs"), + info ARRAY OPTIONS(description="Additional metadata about the detected technology, ie version number") + >> OPTIONS(description="Technologies detected at runtime (see https://www.wappalyzer.com/)"), + metadata JSON OPTIONS(description="Additional metadata about the test") +) +PARTITION BY date +CLUSTER BY client, is_root_page, rank, page +OPTIONS( + require_partition_filter=true +); +`); + + +const iterations = []; +const clients = constants.clients; + +for ( + let month = constants.current_month; + month >= '2024-09-01'; // 2022-07-01 + month = constants.fn_past_month(month)) { + clients.forEach((client) => { + iterations.push({ + month: month, + client: client + }) + }) +} + +iterations.forEach((iteration, i) => { + operate(`all_pages_stable_update ${iteration.month} ${iteration.client}`).tags([ + "all_pages_stable" + ]).dependencies([ + i===0 ? 
"all_pages_stable_pre" : `all_pages_stable_update ${iterations[i-1].month} ${iterations[i-1].client}` + ]).queries(ctx => ` +INSERT INTO \`all_dev.pages_stable\` +SELECT + date, + client, + page, + is_root_page, + root_page, + rank, + wptid, + JSON_REMOVE( + SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), + '$._metadata', + '$._detected', + '$._detected_apps', + '$._detected_technologies', + '$._detected_raw', + '$._custom', + '$._00_reset', + '$._a11y', + '$._ads', + '$._almanac', + '$._aurora', + '$._avg_dom_depth', + '$._cms', + '$._Colordepth', + '$._cookies', + '$._crawl_links', + '$._css-variables', + '$._css', + '$._doctype', + '$._document_height', + '$._document_width', + '$._Dpi', + '$._ecommerce', + '$._element_count', + '$._event-names', + '$._fugu-apis', + '$._generated-content', + '$._has_shadow_root', + '$._Images', + '$._img-loading-attr', + '$._initiators', + '$._inline_style_bytes', + '$._javascript', + '$._lib-detector-version', + '$._localstorage_size', + '$._markup', + '$._media', + '$._meta_viewport', + '$._num_iframes', + '$._num_scripts_async', + '$._num_scripts_sync', + '$._num_scripts', + '$._observers', + '$._origin-trials', + '$._parsed_css', + '$._performance', + '$._privacy-sandbox', + '$._privacy', + '$._pwa', + '$._quirks_mode', + '$._Resolution', + '$._responsive_images', + '$._robots_meta', + '$._robots_txt', + '$._sass', + '$._security', + '$._sessionstorage_size', + '$._structured-data', + '$._third-parties', + '$._usertiming', + '$._valid-head', + '$._well-known', + '$._wpt_bodies', + '$._blinkFeatureFirstUsed', + '$._CrUX' + ) AS payload, + JSON_SET( + JSON_REMOVE( + SAFE.PARSE_JSON(summary, wide_number_mode => 'round'), + '$._adult_site', + '$.archive', + '$.avg_dom_depth', + '$.crawlid', + '$.createDate', + '$.doctype', + '$.document_height', + '$.document_width', + '$.label', + '$.localstorage_size', + '$.meta_viewport', + '$.metadata', + '$.num_iframes', + '$.num_scripts_async', + '$.num_scripts_sync', + 
'$.num_scripts', + '$.pageid', + '$.PageSpeed', + '$.rank', + '$.sessionstorage_size', + '$.startedDateTime', + '$.url', + '$.urlhash', + '$.urlShort', + '$.usertiming', + '$.wptid', + '$.wptrun' + ), + "$.crux", + JSON_QUERY(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._CrUX") + ) AS summary, + STRUCT< + a11y JSON, + cms JSON, + cookies JSON, + css_variables JSON, + ecommerce JSON, + element_count JSON, + javascript JSON, + markup JSON, + media JSON, + origin_trials JSON, + performance JSON, + privacy JSON, + responsive_images JSON, + robots_txt JSON, + security JSON, + structured_data JSON, + third_parties JSON, + well_known JSON, + wpt_bodies JSON, + other JSON + >( + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.a11y"), + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.cms"), + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.cookies"), + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.css-variables"), + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.ecommerce"), + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.element_count"), + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.javascript"), + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.markup"), + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.media"), + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.origin-trials"), + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.performance"), + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.privacy"), + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.responsive_images"), + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.robots_txt"), + 
JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.security"), + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.structured-data"), + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.third-parties"), + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.well-known"), + JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), "$.wpt_bodies"), + JSON_REMOVE( + SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), + '$.a11y', + '$.cms', + '$.cookies', + '$.css-variables', + '$.ecommerce', + '$.element_count', + '$.javascript', + '$.markup', + '$.media', + '$.origin-trials', + '$.performance', + '$.privacy', + '$.responsive_images', + '$.robots_txt', + '$.security', + '$.structured-data', + '$.third-parties', + '$.well-known', + '$.wpt_bodies' + ) + ) AS custom_metrics, + SAFE.PARSE_JSON(lighthouse, wide_number_mode => 'round') AS lighthouse, + features, + technologies, + SAFE.PARSE_JSON(metadata, wide_number_mode => 'round') AS metadata +FROM \`all.pages\` +WHERE + date = "${iteration.month}" AND + client = "${iteration.client}" ${constants.dev_rank_filter}; + `) +}) \ No newline at end of file diff --git a/definitions/output/all/reprocess_requests.js b/definitions/output/all/reprocess_requests.js new file mode 100644 index 0000000..af0149d --- /dev/null +++ b/definitions/output/all/reprocess_requests.js @@ -0,0 +1,122 @@ +operate(`all_requests_stable_pre`).tags( + ["all_requests_stable"] +).queries(` +CREATE SCHEMA IF NOT EXISTS all_dev; + +DROP TABLE IF EXISTS \`all_dev.requests_stable\`; + +CREATE TABLE \`all_dev.requests_stable\` +( + date DATE NOT NULL OPTIONS(description="YYYY-MM-DD format of the HTTP Archive monthly crawl"), + client STRING NOT NULL OPTIONS(description="Test environment: desktop or mobile"), + page STRING NOT NULL OPTIONS(description="The URL of the page being tested"), + is_root_page BOOL OPTIONS(description="Whether 
the page is the root of the origin."), + root_page STRING NOT NULL OPTIONS(description="The URL of the root page being tested"), + rank INT64 OPTIONS(description="Site popularity rank, from CrUX"), + url STRING NOT NULL OPTIONS(description="The URL of the request"), + is_main_document BOOL NOT NULL OPTIONS(description="Whether this request corresponds with the main HTML document of the page, which is the first HTML request after redirects"), + type STRING OPTIONS(description="Simplified description of the type of resource (script, html, css, text, other, etc)"), + index INT64 OPTIONS(description="The sequential 0-based index of the request"), + payload JSON OPTIONS(description="JSON-encoded WebPageTest result data for this request"), + summary JSON OPTIONS(description="JSON-encoded summarization of request data"), + request_headers ARRAY> OPTIONS(description="Request headers"), + response_headers ARRAY> OPTIONS(description="Response headers"), + response_body STRING OPTIONS(description="Text-based response body") +) +PARTITION BY date +CLUSTER BY client, is_root_page, type, rank +OPTIONS( + require_partition_filter=true +); +`); + +const iterations = []; +const clients = constants.clients; + +for ( + let month = constants.current_month; + month >= '2024-09-01'; // 2022-07-01 + month = constants.fn_past_month(month)) { + clients.forEach((client) => { + iterations.push({ + month: month, + client: client + }) + }) +} + +iterations.forEach((iteration, i) => { + operate(`all_requests_stable ${iteration.month} ${iteration.client}`).tags( + ["all_requests_stable"] + ).dependencies([ + i===0 ? 
"all_requests_stable_pre" : `all_requests_stable ${iterations[i-1].month} ${iterations[i-1].client}` + ]).queries(ctx => ` +CREATE TEMP FUNCTION PRUNE_HEADERS( + jsonObject JSON +) RETURNS JSON +LANGUAGE js AS """ +try { + for (const [key, value] of Object.entries(jsonObject)) { + if(key.startsWith('req_') || key.startsWith('resp_')) { + delete jsonObject[key]; + } + } + return jsonObject; +} catch (e) { + return null; +} +"""; + +INSERT INTO \`all_dev.requests_stable\` +SELECT + requests.date, + requests.client, + requests.page, + requests.is_root_page, + requests.root_page, + crux.rank, + requests.url, + requests.is_main_document, + requests.type, + requests.index, + JSON_REMOVE( + SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), + '$._headers', + '$.request.headers', + '$.response.headers' + ) AS payload, + PRUNE_HEADERS(JSON_REMOVE( + SAFE.PARSE_JSON(requests.summary, wide_number_mode => 'round'), + '$.firstHtml', + '$.firstReq', + '$.reqOtherHeaders', + '$.requestid', + '$.respOtherHeaders', + '$.startedDateTime', + '$.url', + '$.urlShort' + )) as summary, + requests.request_headers, + requests.response_headers, + requests.response_body +FROM ( + SELECT * + FROM \`all.requests\` ${constants.dev_TABLESAMPLE} + WHERE date = '${iteration.month}' + AND client = '${iteration.client}') AS requests +LEFT JOIN ( + SELECT DISTINCT + CONCAT(origin, '/') AS page, + experimental.popularity.rank AS rank + FROM ${ctx.resolve("chrome-ux-report", "experimental", "global")} + WHERE yyyymm = ${constants.fn_past_month(iteration.month).substring(0, 7).replace('-', '')} +) AS crux +ON requests.root_page = crux.page; + `) +}); diff --git a/definitions/output/core_web_vitals/technologies.js b/definitions/output/core_web_vitals/technologies.js index 111a586..49c578f 100644 --- a/definitions/output/core_web_vitals/technologies.js +++ b/definitions/output/core_web_vitals/technologies.js @@ -24,7 +24,7 @@ CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, 
poor F ); CREATE TEMP FUNCTION GET_LIGHTHOUSE_CATEGORY_SCORES(categories STRING) -RETURNS STRUCT +RETURNS STRUCT LANGUAGE js AS ''' try { const $ = JSON.parse(categories); @@ -74,7 +74,7 @@ crux AS ( END AS rank, CONCAT(origin, '/') AS root_page_url, IF(device = 'desktop', 'desktop', 'mobile') AS client, - + # CWV IS_NON_ZERO(fast_fid, avg_fid, slow_fid) AS any_fid, IS_GOOD(fast_fid, avg_fid, slow_fid) AS good_fid, @@ -82,15 +82,15 @@ crux AS ( IS_GOOD(small_cls, medium_cls, large_cls) AS good_cls, IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp) AS any_lcp, IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_lcp, - + (IS_GOOD(fast_inp, avg_inp, slow_inp) OR fast_inp IS NULL) AND IS_GOOD(small_cls, medium_cls, large_cls) AND IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_cwv_2024, - + (IS_GOOD(fast_fid, avg_fid, slow_fid) OR fast_fid IS NULL) AND IS_GOOD(small_cls, medium_cls, large_cls) AND IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_cwv_2023, - + # WV IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp) AS any_fcp, IS_GOOD(fast_fcp, avg_fcp, slow_fcp) AS good_fcp, @@ -114,7 +114,7 @@ technologies AS ( ${ctx.resolve("all", "pages")}, UNNEST(technologies) AS technology WHERE - date = '${past_month}' AND + date = '${past_month}' ${constants.dev_rank_filter} AND technology.technology IS NOT NULL AND technology.technology != '' UNION ALL @@ -125,7 +125,7 @@ UNION ALL FROM ${ctx.resolve("all", "pages")} WHERE - date = '${past_month}' + date = '${past_month}' ${constants.dev_rank_filter} ), categories AS ( @@ -137,7 +137,7 @@ categories AS ( UNNEST(technologies) AS technology, UNNEST(technology.categories) AS category WHERE - date = '${past_month}' + date = '${past_month}' ${constants.dev_rank_filter} GROUP BY app UNION ALL @@ -149,7 +149,7 @@ UNION ALL UNNEST(technologies) AS technology, UNNEST(technology.categories) AS category WHERE - date = '${past_month}' AND + date = '${past_month}' ${constants.dev_rank_filter} AND client = 'mobile' ), @@ -165,7 +165,7 @@ summary_stats AS ( 
FROM ${ctx.resolve("all", "pages")} WHERE - date = '${past_month}' + date = '${past_month}' ${constants.dev_rank_filter} ), lab_data AS ( @@ -206,7 +206,7 @@ SELECT app, client, COUNT(0) AS origins, - + # CrUX data COUNTIF(good_fid) AS origins_with_good_fid, COUNTIF(good_cls) AS origins_with_good_cls, @@ -227,19 +227,19 @@ SELECT SAFE_DIVIDE(COUNTIF(good_cwv_2024), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv, SAFE_DIVIDE(COUNTIF(good_cwv_2024), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv_2024, SAFE_DIVIDE(COUNTIF(good_cwv_2023), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv_2023, - + # Lighthouse data APPROX_QUANTILES(accessibility, 1000)[OFFSET(500)] AS median_lighthouse_score_accessibility, APPROX_QUANTILES(best_practices, 1000)[OFFSET(500)] AS median_lighthouse_score_best_practices, APPROX_QUANTILES(performance, 1000)[OFFSET(500)] AS median_lighthouse_score_performance, APPROX_QUANTILES(pwa, 1000)[OFFSET(500)] AS median_lighthouse_score_pwa, APPROX_QUANTILES(seo, 1000)[OFFSET(500)] AS median_lighthouse_score_seo, - + # Page weight stats APPROX_QUANTILES(bytesTotal, 1000)[OFFSET(500)] AS median_bytes_total, APPROX_QUANTILES(bytesJS, 1000)[OFFSET(500)] AS median_bytes_js, APPROX_QUANTILES(bytesImg, 1000)[OFFSET(500)] AS median_bytes_image - + FROM lab_data JOIN diff --git a/definitions/sources/declares.js b/definitions/sources/declares.js index ca29d1e..87eb47a 100644 --- a/definitions/sources/declares.js +++ b/definitions/sources/declares.js @@ -16,7 +16,7 @@ for (const table of crux_tables) { }); assert(`${table}_not_empty`).query(ctx => ` -SELECT +SELECT 'No data for the specified date' AS error_message FROM ${ctx.ref("chrome-ux-report", "materialized", table)} WHERE yyyymm = ${past_month} @@ -24,3 +24,9 @@ GROUP BY yyyymm HAVING COUNT(1) = 0 `); } + +declare({ + database: "chrome-ux-report", + schema: "experimental", + name: "global", +}); diff --git a/includes/constants.js 
b/includes/constants.js index 40054b2..fd4bed5 100644 --- a/includes/constants.js +++ b/includes/constants.js @@ -11,11 +11,13 @@ const booleans = ['TRUE', 'FALSE'], [ dev_TABLESAMPLE, - dev_rank5000_filter - ] = dataform.projectConfig.vars.env_name == "dev" ? [ + dev_rank_filter, + is_dev_env + ] = dataform.projectConfig.vars.env_name == 'dev' ? [ "TABLESAMPLE SYSTEM (0.001 PERCENT)", - "AND rank = 5000" - ] : ["", ""]; + "AND rank <= 10000", + true + ] : ["", "", false]; module.exports = { today, @@ -25,5 +27,6 @@ module.exports = { clients, booleans, dev_TABLESAMPLE, - dev_rank5000_filter + dev_rank_filter, + is_dev_env }; diff --git a/package-lock.json b/package-lock.json index 3fc3b9d..159edf4 100644 --- a/package-lock.json +++ b/package-lock.json @@ -6,20 +6,20 @@ "": { "name": "crawl-data", "dependencies": { - "@dataform/core": "3.0.2" + "@dataform/core": "3.0.4" } }, "node_modules/@dataform/core": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/@dataform/core/-/core-3.0.2.tgz", - "integrity": "sha512-W1DQuv1vSsQgUPlNXuBTqa927FfNp5pNswOTz0wlX3cJTk5OT0UEENVmVJu/WIEdoVcCRTn8pn2dNNc1IjtMcA==" + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/@dataform/core/-/core-3.0.4.tgz", + "integrity": "sha512-qlCugswggeO5g0qAxn4U3Kt9dzebSiM8gXZmNM3NkVUXloJBwOpbagkQ9wDXymio1FnMC4qwQX7jniz1KzKHmg==" } }, "dependencies": { "@dataform/core": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/@dataform/core/-/core-3.0.2.tgz", - "integrity": "sha512-W1DQuv1vSsQgUPlNXuBTqa927FfNp5pNswOTz0wlX3cJTk5OT0UEENVmVJu/WIEdoVcCRTn8pn2dNNc1IjtMcA==" + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/@dataform/core/-/core-3.0.4.tgz", + "integrity": "sha512-qlCugswggeO5g0qAxn4U3Kt9dzebSiM8gXZmNM3NkVUXloJBwOpbagkQ9wDXymio1FnMC4qwQX7jniz1KzKHmg==" } } } diff --git a/package.json b/package.json index 5a97497..c98cb20 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "crawl-data", "dependencies": { - 
"@dataform/core": "3.0.2" + "@dataform/core": "3.0.4" } } diff --git a/src/dataform.js b/src/dataform.js index 5b8159b..19a02f7 100644 --- a/src/dataform.js +++ b/src/dataform.js @@ -14,18 +14,6 @@ async function get_compilation_results(repoURI) { compilationResult: { releaseConfig: `${repoURI}/releaseConfigs/production` } - }, dev_request = { - parent: repoURI, - compilationResult: { - gitCommitish: 'dev' - }, - codeCompilationConfig: { - schemaSuffix: 'dev', - tablePrefix: 'dev', - vars: { - current_month: '2024-08-01', - }, - } }; console.log(`Creating Dataform compilation result: ${JSON.stringify(request, null, 2)}`); diff --git a/src/package-lock.json b/src/package-lock.json index 525b592..a1f30be 100644 --- a/src/package-lock.json +++ b/src/package-lock.json @@ -5,7 +5,7 @@ "packages": { "": { "dependencies": { - "@google-cloud/bigquery": "7.9.0", + "@google-cloud/bigquery": "7.9.1", "@google-cloud/dataform": "1.3.0", "@google-cloud/functions-framework": "3.4.2" }, @@ -51,10 +51,9 @@ } }, "node_modules/@google-cloud/bigquery": { - "version": "7.9.0", - "resolved": "https://registry.npmjs.org/@google-cloud/bigquery/-/bigquery-7.9.0.tgz", - "integrity": "sha512-KJTimGLDlAR1IfZ4Y8xhIVfoZ+XBXd0GGuJttLSXxtR0g+4vNsUt0xS33PRVa5TXey97374yU+uWNlCb5bHwBw==", - "license": "Apache-2.0", + "version": "7.9.1", + "resolved": "https://registry.npmjs.org/@google-cloud/bigquery/-/bigquery-7.9.1.tgz", + "integrity": "sha512-ZkcRMpBoFLxIh6TiQBywA22yT3c2j0f07AHWEMjtYqMQzZQbFrpxuJU2COp3tyjZ91ZIGHe4gY7/dGZL88cltg==", "dependencies": { "@google-cloud/common": "^5.0.0", "@google-cloud/paginator": "^5.0.2", diff --git a/src/package.json b/src/package.json index b8998b5..6e6a713 100644 --- a/src/package.json +++ b/src/package.json @@ -9,7 +9,7 @@ }, "dependencies": { "@google-cloud/functions-framework": "3.4.2", - "@google-cloud/bigquery": "7.9.0", + "@google-cloud/bigquery": "7.9.1", "@google-cloud/dataform": "1.3.0" } } diff --git a/workflow_settings.yaml b/workflow_settings.yaml 
index dcbbe42..4aeb4eb 100644 --- a/workflow_settings.yaml +++ b/workflow_settings.yaml @@ -5,5 +5,5 @@ defaultAssertionDataset: dataform_assertions vars: placeholder: value # keeping it to avoid 'empty vars' error - # env_name: dev # MUST be commented in main branch, enables processing sampled data - # today: 2024-09-01 # MUST be commented in main branch, allows processing historical data + # env_name: dev # MUST be commented in main branch, enables processing sampled data + # today: 2024-09-20 # MUST be commented in main branch, allows processing historical data