Skip to content

Commit

Permalink
Merge branch 'main' into infra-code
Browse files Browse the repository at this point in the history
  • Loading branch information
max-ostapenko authored Oct 14, 2024
2 parents e1b244c + 1372eae commit d05ee3c
Show file tree
Hide file tree
Showing 34 changed files with 7,753 additions and 3,186 deletions.
Binary file removed .DS_Store
Binary file not shown.
2 changes: 1 addition & 1 deletion .editorconfig
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,5 @@ charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true

[*.{md,js,sql,yaml}]
[*.{js,json,md,sql,yaml}]
indent_size = 2
34 changes: 34 additions & 0 deletions .github/workflows/linter.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
name: Linter

on:
workflow_dispatch:
pull_request: null
push:
branches:
- main

permissions:
contents: read
packages: read

jobs:
build:
name: Lint
runs-on: ubuntu-latest

permissions:
statuses: write # To report GitHub Actions status checks

steps:
- name: Checkout Code
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Lint Code Base
uses: super-linter/super-linter/[email protected]
env:
DEFAULT_BRANCH: main
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
VALIDATE_JAVASCRIPT_PRETTIER: false
VALIDATE_MARKDOWN_PRETTIER: false
5 changes: 1 addition & 4 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,4 @@ node_modules/

# Terraform
tf/.terraform/
tf/temp

# Dataform
.df-credentials.json
tf/temp
8 changes: 5 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# HTTP Archive datasets pipeline

This repo handles the HTTP Archive data pipeline, which takes the results of the monthly HTTP Archive run and saves this to the `httparchive` dataset in BigQuery.
This repository handles the HTTP Archive data pipeline, which takes the results of the monthly HTTP Archive run and saves them to the `httparchive` dataset in BigQuery.

## Pipelines

Expand Down Expand Up @@ -72,8 +72,10 @@ In order to unify the workflow triggering mechanism, we use [a Cloud Run functio
### Dataform development workspace hints

1. In workflow settings vars:
1. set `env_name: dev` to process sampled data in dev workspace.
2. change `today` variable to a month in the past. May be helpful for testing pipelines based on `chrome-ux-report` data.

- set `env_name: dev` to process sampled data in dev workspace.
- change the `today` variable to a month in the past. This may be helpful for testing pipelines based on `chrome-ux-report` data.

2. The `definitions/extra/test_env.sqlx` script helps to set up the tables required to run pipelines when in dev workspace. It's disabled by default.

### Error Monitoring
Expand Down
46 changes: 23 additions & 23 deletions definitions/extra/test_env.js
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
const date = constants.current_month;

var resources_list = [{
datasetId: "all",
tableId: "pages"
},
{
datasetId: "all",
tableId: "requests"
},
//{datasetId: "all", tableId: "parsed_css"},
//{datasetId: "core_web_vitals", tableId: "technologies"},
];

resources_list.forEach(resource => {
const date = constants.currentMonth

const resourcesList = [{
datasetId: 'all',
tableId: 'pages'
},
{
datasetId: 'all',
tableId: 'requests'
}
// {datasetId: 'all', tableId: 'parsed_css'},
// {datasetId: 'core_web_vitals', tableId: 'technologies'},
]

resourcesList.forEach(resource => {
operate(
`test_table ${resource.datasetId}_${resource.tableId}`, {
hasOutput: true
Expand All @@ -24,31 +24,31 @@ DROP TABLE IF EXISTS ${resource.datasetId}_dev.dev_${resource.tableId};
CREATE TABLE IF NOT EXISTS ${resource.datasetId}_dev.dev_${resource.tableId} AS
SELECT *
FROM \`${resource.datasetId}.${resource.tableId}\` ${constants.dev_TABLESAMPLE}
FROM \`${resource.datasetId}.${resource.tableId}\` ${constants.devTABLESAMPLE}
WHERE date = '${date}'
`);
`)
})

operate("test_table blink_features_dev_dev_usage", {
hasOutput: true,
operate('test_table blink_features_dev_dev_usage', {
hasOutput: true
}).queries(`
CREATE SCHEMA IF NOT EXISTS blink_features_dev;
CREATE TABLE IF NOT EXISTS blink_features_dev.dev_usage AS
SELECT *
FROM blink_features.usage ${constants.dev_TABLESAMPLE}
FROM blink_features.usage ${constants.devTABLESAMPLE}
WHERE yyyymmdd = '${date}';
`)

operate("test_table blink_features_dev_dev_features", {
hasOutput: true,
operate('test_table blink_features_dev_dev_features', {
hasOutput: true
}).queries(`
CREATE SCHEMA IF NOT EXISTS blink_features_dev;
DROP TABLE IF EXISTS blink_features_dev.dev_features;
CREATE TABLE IF NOT EXISTS blink_features_dev.dev_features AS
SELECT *
FROM blink_features.features ${constants.dev_TABLESAMPLE}
FROM blink_features.features ${constants.devTABLESAMPLE}
WHERE yyyymmdd = DATE '${date}';
`)
38 changes: 19 additions & 19 deletions definitions/output/all/pages.js
Original file line number Diff line number Diff line change
@@ -1,45 +1,45 @@
publish("pages", {
type: "incremental",
publish('pages', {
type: 'incremental',
protected: true,
schema: "all",
schema: 'all',
bigquery: {
partitionBy: "date",
clusterBy: ["client", "is_root_page", "rank"],
partitionBy: 'date',
clusterBy: ['client', 'is_root_page', 'rank'],
requirePartitionFilter: true
},
tags: ["crawl_results_all"],
tags: ['crawl_results_all']
}).preOps(ctx => `
DELETE FROM ${ctx.self()}
WHERE date = '${constants.current_month}';
WHERE date = '${constants.currentMonth}';
`).query(ctx => `
SELECT *
FROM ${ctx.ref("crawl_staging", "pages")}
WHERE date = '${constants.current_month}'
FROM ${ctx.ref('crawl_staging', 'pages')}
WHERE date = '${constants.currentMonth}'
AND client = 'desktop'
AND is_root_page = TRUE
${constants.dev_rank_filter}
${constants.devRankFilter}
`).postOps(ctx => `
INSERT INTO ${ctx.self()}
SELECT *
FROM ${ctx.ref("crawl_staging", "pages")}
WHERE date = '${constants.current_month}'
FROM ${ctx.ref('crawl_staging', 'pages')}
WHERE date = '${constants.currentMonth}'
AND client = 'desktop'
AND is_root_page = FALSE
${constants.dev_rank_filter};
${constants.devRankFilter};
INSERT INTO ${ctx.self()}
SELECT *
FROM ${ctx.ref("crawl_staging", "pages")} ${constants.dev_TABLESAMPLE}
WHERE date = '${constants.current_month}'
FROM ${ctx.ref('crawl_staging', 'pages')} ${constants.devTABLESAMPLE}
WHERE date = '${constants.currentMonth}'
AND client = 'mobile'
AND is_root_page = TRUE
${constants.dev_rank_filter};
${constants.devRankFilter};
INSERT INTO ${ctx.self()}
SELECT *
FROM ${ctx.ref("crawl_staging", "pages")}
WHERE date = '${constants.current_month}'
FROM ${ctx.ref('crawl_staging', 'pages')}
WHERE date = '${constants.currentMonth}'
AND client = 'mobile'
AND is_root_page = FALSE
${constants.dev_rank_filter};
${constants.devRankFilter};
`)
26 changes: 13 additions & 13 deletions definitions/output/all/parsed_css.js
Original file line number Diff line number Diff line change
@@ -1,27 +1,27 @@
publish("parsed_css", {
type: "incremental",
publish('parsed_css', {
type: 'incremental',
protected: true,
schema: "all",
schema: 'all',
bigquery: {
partitionBy: "date",
clusterBy: ["client", "is_root_page", "rank", "page"],
partitionBy: 'date',
clusterBy: ['client', 'is_root_page', 'rank', 'page'],
requirePartitionFilter: true
},
tags: ["crawl_results_all"],
tags: ['crawl_results_all']
}).preOps(ctx => `
DELETE FROM ${ctx.self()}
WHERE date = '${constants.current_month}';
WHERE date = '${constants.currentMonth}';
`).query(ctx => `
SELECT *
FROM ${ctx.ref("crawl_staging", "parsed_css")}
WHERE date = '${constants.current_month}'
FROM ${ctx.ref('crawl_staging', 'parsed_css')}
WHERE date = '${constants.currentMonth}'
AND client = 'desktop'
${constants.dev_rank_filter}
${constants.devRankFilter}
`).postOps(ctx => `
INSERT INTO ${ctx.self()}
SELECT *
FROM ${ctx.ref("crawl_staging", "parsed_css")}
WHERE date = '${constants.current_month}'
FROM ${ctx.ref('crawl_staging', 'parsed_css')}
WHERE date = '${constants.currentMonth}'
AND client = 'mobile'
${constants.dev_rank_filter};
${constants.devRankFilter};
`)
Loading

0 comments on commit d05ee3c

Please sign in to comment.