Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Linter #21

Merged
merged 17 commits into from
Oct 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file removed .DS_Store
Binary file not shown.
2 changes: 1 addition & 1 deletion .editorconfig
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,5 @@ charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true

[*.{md,js,sql,yaml}]
[*.{js,json,md,sql,yaml}]
indent_size = 2
34 changes: 34 additions & 0 deletions .github/workflows/linter.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
name: Linter

on:
workflow_dispatch:
pull_request: null
push:
branches:
- main

permissions:
contents: read
packages: read

jobs:
build:
name: Lint
runs-on: ubuntu-latest

permissions:
statuses: write # To report GitHub Actions status checks

steps:
- name: Checkout Code
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Lint Code Base
uses: super-linter/super-linter/[email protected]
env:
DEFAULT_BRANCH: main
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
VALIDATE_JAVASCRIPT_PRETTIER: false
VALIDATE_MARKDOWN_PRETTIER: false
8 changes: 7 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
node_modules/
.df-credentials.json
.DS_Store

# Terraform
tf/.terraform/
tf/temp

# Dataform
.df-credentials.json
.gitignore
8 changes: 5 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# HTTP Archive BigQuery pipeline with Dataform

This repo handles the HTTP Archive data pipeline, which takes the results of the monthly HTTP Archive run and saves this to the `httparchive` dataset in BigQuery.
This repository handles the HTTP Archive data pipeline, which takes the results of the monthly HTTP Archive run and saves this to the `httparchive` dataset in BigQuery.

## Pipelines

Expand Down Expand Up @@ -72,8 +72,10 @@ Tag: `crawl_results_legacy`
### Dataform development workspace hints

1. In workflow settings vars:
1. set `env_name: dev` to process sampled data in dev workspace.
2. change `today` variable to a month in the past. May be helpful for testing pipelines based on `chrome-ux-report` data.

- set `env_name: dev` to process sampled data in the dev workspace.
- change the `today` variable to a month in the past. This may be helpful for testing pipelines based on `chrome-ux-report` data.

2. The `definitions/extra/test_env.sqlx` script helps to set up the tables required to run pipelines in the dev workspace. It's disabled by default.

### Error Monitoring
Expand Down
46 changes: 23 additions & 23 deletions definitions/extra/test_env.js
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
const date = constants.current_month;

var resources_list = [{
datasetId: "all",
tableId: "pages"
},
{
datasetId: "all",
tableId: "requests"
},
//{datasetId: "all", tableId: "parsed_css"},
//{datasetId: "core_web_vitals", tableId: "technologies"},
];

resources_list.forEach(resource => {
const date = constants.currentMonth

const resourcesList = [{
datasetId: 'all',
tableId: 'pages'
},
{
datasetId: 'all',
tableId: 'requests'
}
// {datasetId: 'all', tableId: 'parsed_css'},
// {datasetId: 'core_web_vitals', tableId: 'technologies'},
]

resourcesList.forEach(resource => {
operate(
`test_table ${resource.datasetId}_${resource.tableId}`, {
hasOutput: true
Expand All @@ -24,31 +24,31 @@ DROP TABLE IF EXISTS ${resource.datasetId}_dev.dev_${resource.tableId};

CREATE TABLE IF NOT EXISTS ${resource.datasetId}_dev.dev_${resource.tableId} AS
SELECT *
FROM \`${resource.datasetId}.${resource.tableId}\` ${constants.dev_TABLESAMPLE}
FROM \`${resource.datasetId}.${resource.tableId}\` ${constants.devTABLESAMPLE}
WHERE date = '${date}'
`);
`)
})

operate("test_table blink_features_dev_dev_usage", {
hasOutput: true,
operate('test_table blink_features_dev_dev_usage', {
hasOutput: true
}).queries(`
CREATE SCHEMA IF NOT EXISTS blink_features_dev;

CREATE TABLE IF NOT EXISTS blink_features_dev.dev_usage AS
SELECT *
FROM blink_features.usage ${constants.dev_TABLESAMPLE}
FROM blink_features.usage ${constants.devTABLESAMPLE}
WHERE yyyymmdd = '${date}';
`)

operate("test_table blink_features_dev_dev_features", {
hasOutput: true,
operate('test_table blink_features_dev_dev_features', {
hasOutput: true
}).queries(`
CREATE SCHEMA IF NOT EXISTS blink_features_dev;

DROP TABLE IF EXISTS blink_features_dev.dev_features;

CREATE TABLE IF NOT EXISTS blink_features_dev.dev_features AS
SELECT *
FROM blink_features.features ${constants.dev_TABLESAMPLE}
FROM blink_features.features ${constants.devTABLESAMPLE}
WHERE yyyymmdd = DATE '${date}';
`)
38 changes: 19 additions & 19 deletions definitions/output/all/pages.js
Original file line number Diff line number Diff line change
@@ -1,45 +1,45 @@
publish("pages", {
type: "incremental",
publish('pages', {
type: 'incremental',
protected: true,
schema: "all",
schema: 'all',
bigquery: {
partitionBy: "date",
clusterBy: ["client", "is_root_page", "rank"],
partitionBy: 'date',
clusterBy: ['client', 'is_root_page', 'rank'],
requirePartitionFilter: true
},
tags: ["crawl_results_all"],
tags: ['crawl_results_all']
}).preOps(ctx => `
DELETE FROM ${ctx.self()}
WHERE date = '${constants.current_month}';
WHERE date = '${constants.currentMonth}';
`).query(ctx => `
SELECT *
FROM ${ctx.ref("crawl_staging", "pages")}
WHERE date = '${constants.current_month}'
FROM ${ctx.ref('crawl_staging', 'pages')}
WHERE date = '${constants.currentMonth}'
AND client = 'desktop'
AND is_root_page = TRUE
${constants.dev_rank_filter}
${constants.devRankFilter}
`).postOps(ctx => `
INSERT INTO ${ctx.self()}
SELECT *
FROM ${ctx.ref("crawl_staging", "pages")}
WHERE date = '${constants.current_month}'
FROM ${ctx.ref('crawl_staging', 'pages')}
WHERE date = '${constants.currentMonth}'
AND client = 'desktop'
AND is_root_page = FALSE
${constants.dev_rank_filter};
${constants.devRankFilter};

INSERT INTO ${ctx.self()}
SELECT *
FROM ${ctx.ref("crawl_staging", "pages")} ${constants.dev_TABLESAMPLE}
WHERE date = '${constants.current_month}'
FROM ${ctx.ref('crawl_staging', 'pages')} ${constants.devTABLESAMPLE}
WHERE date = '${constants.currentMonth}'
AND client = 'mobile'
AND is_root_page = TRUE
${constants.dev_rank_filter};
${constants.devRankFilter};

INSERT INTO ${ctx.self()}
SELECT *
FROM ${ctx.ref("crawl_staging", "pages")}
WHERE date = '${constants.current_month}'
FROM ${ctx.ref('crawl_staging', 'pages')}
WHERE date = '${constants.currentMonth}'
AND client = 'mobile'
AND is_root_page = FALSE
${constants.dev_rank_filter};
${constants.devRankFilter};
`)
26 changes: 13 additions & 13 deletions definitions/output/all/parsed_css.js
Original file line number Diff line number Diff line change
@@ -1,27 +1,27 @@
publish("parsed_css", {
type: "incremental",
publish('parsed_css', {
type: 'incremental',
protected: true,
schema: "all",
schema: 'all',
bigquery: {
partitionBy: "date",
clusterBy: ["client", "is_root_page", "rank", "page"],
partitionBy: 'date',
clusterBy: ['client', 'is_root_page', 'rank', 'page'],
requirePartitionFilter: true
},
tags: ["crawl_results_all"],
tags: ['crawl_results_all']
}).preOps(ctx => `
DELETE FROM ${ctx.self()}
WHERE date = '${constants.current_month}';
WHERE date = '${constants.currentMonth}';
`).query(ctx => `
SELECT *
FROM ${ctx.ref("crawl_staging", "parsed_css")}
WHERE date = '${constants.current_month}'
FROM ${ctx.ref('crawl_staging', 'parsed_css')}
WHERE date = '${constants.currentMonth}'
AND client = 'desktop'
${constants.dev_rank_filter}
${constants.devRankFilter}
`).postOps(ctx => `
INSERT INTO ${ctx.self()}
SELECT *
FROM ${ctx.ref("crawl_staging", "parsed_css")}
WHERE date = '${constants.current_month}'
FROM ${ctx.ref('crawl_staging', 'parsed_css')}
WHERE date = '${constants.currentMonth}'
AND client = 'mobile'
${constants.dev_rank_filter};
${constants.devRankFilter};
`)
Loading