Skip to content

Commit f89088c

Browse files
Merge branch 'main' into excess-aphid
2 parents 315eb7b + 88387c6 commit f89088c

File tree

4 files changed

+216
-47
lines changed

4 files changed

+216
-47
lines changed
+36-11
Original file line number | Diff line number | Diff line change
@@ -1,17 +1,42 @@
1-
const stagingTables = ['pages', 'requests', 'parsed_css']
2-
for (const table of stagingTables) {
1+
// Staging tables source: https://github.com/HTTPArchive/crawl/blob/main/crawl.py
2+
['pages', 'requests', 'parsed_css'].forEach(table =>
33
declare({
44
schema: 'crawl_staging',
55
name: table
66
})
7-
}
7+
)
88

9-
declare({
10-
schema: 'wappalyzer',
11-
name: 'technologies'
12-
})
9+
// See https://github.com/HTTPArchive/dataform/issues/43
10+
assert('corrupted_technology_values')
11+
.tags(['crawl_complete'])
12+
.query(ctx => `
13+
SELECT
14+
date,
15+
client,
16+
tech,
17+
COUNT(DISTINCT page) AS cnt_pages,
18+
ARRAY_AGG(DISTINCT page LIMIT 3) AS sample_pages
19+
FROM ${ctx.ref('crawl_staging', 'pages')} AS pages
20+
LEFT JOIN pages.technologies AS tech
21+
LEFT JOIN tech.categories AS category
22+
WHERE
23+
date = '${constants.currentMonth}' AND
24+
(
25+
tech.technology NOT IN (SELECT DISTINCT name FROM wappalyzer.technologies)
26+
OR category NOT IN (SELECT DISTINCT name FROM wappalyzer.categories)
27+
OR ARRAY_LENGTH(tech.categories) = 0
28+
)
29+
GROUP BY
30+
date,
31+
client,
32+
tech
33+
ORDER BY cnt_pages DESC
34+
`);
1335

14-
declare({
15-
schema: 'wappalyzer',
16-
name: 'categories'
17-
})
36+
// Wappalyzer tables source: https://github.com/HTTPArchive/wappalyzer/blob/main/.github/workflows/upload.yml
37+
['technologies', 'categories'].forEach(table =>
38+
declare({
39+
schema: 'wappalyzer',
40+
name: table
41+
})
42+
)

definitions/output/crawl/pages.js

+79-5
Original file line number | Diff line number | Diff line change
@@ -52,23 +52,97 @@ publish('pages', {
5252
DELETE FROM ${ctx.self()}
5353
WHERE date = '${constants.currentMonth}' AND
5454
client = 'desktop';
55-
`).query(ctx => `
55+
56+
INSERT INTO ${ctx.self()}
5657
SELECT
5758
*
5859
FROM ${ctx.ref('crawl_staging', 'pages')}
5960
WHERE date = '${constants.currentMonth}' AND
6061
client = 'desktop'
61-
${constants.devRankFilter}
62-
`).postOps(ctx => `
62+
${constants.devRankFilter};
63+
6364
DELETE FROM ${ctx.self()}
6465
WHERE date = '${constants.currentMonth}' AND
6566
client = 'mobile';
66-
67-
INSERT INTO ${ctx.self()}
67+
`).query(ctx => `
6868
SELECT
6969
*
7070
FROM ${ctx.ref('crawl_staging', 'pages')}
7171
WHERE date = '${constants.currentMonth}' AND
7272
client = 'mobile'
7373
${constants.devRankFilter}
74+
`).postOps(ctx => `
75+
CREATE TEMP TABLE technologies_cleaned AS (
76+
WITH wappalyzer AS (
77+
SELECT DISTINCT
78+
name AS technology,
79+
categories
80+
FROM ${ctx.ref('wappalyzer', 'technologies')}
81+
),
82+
83+
pages AS (
84+
SELECT
85+
client,
86+
page,
87+
tech.technology,
88+
tech.categories,
89+
tech.info
90+
FROM ${ctx.self()} AS pages
91+
LEFT JOIN pages.technologies AS tech
92+
WHERE date = '${constants.currentMonth}' ${constants.devRankFilter}
93+
),
94+
95+
-- Identify impacted pages
96+
impacted_pages AS (
97+
SELECT DISTINCT
98+
client,
99+
page
100+
FROM pages
101+
LEFT JOIN pages.categories AS category
102+
WHERE
103+
-- Technology is corrupted
104+
technology NOT IN (SELECT DISTINCT technology FROM wappalyzer) OR
105+
-- Technology's category is corrupted
106+
CONCAT(technology, category) NOT IN (
107+
SELECT DISTINCT
108+
CONCAT(technology, category)
109+
FROM wappalyzer
110+
LEFT JOIN wappalyzer.categories AS category
111+
)
112+
),
113+
114+
-- Keep valid technologies and use correct categories
115+
reconstructed_technologies AS (
116+
SELECT
117+
client,
118+
page,
119+
ARRAY_AGG(STRUCT(
120+
pages.technology,
121+
wappalyzer.categories,
122+
pages.info
123+
)) AS technologies
124+
FROM pages
125+
INNER JOIN impacted_pages
126+
USING (client, page)
127+
INNER JOIN wappalyzer
128+
ON pages.technology = wappalyzer.technology
129+
GROUP BY
130+
client,
131+
page
132+
)
133+
134+
SELECT
135+
client,
136+
page,
137+
technologies
138+
FROM reconstructed_technologies
139+
);
140+
141+
-- Update the crawl.pages table with the cleaned and restored technologies
142+
UPDATE ${ctx.self()} AS pages
143+
SET technologies = technologies_cleaned.technologies
144+
FROM technologies_cleaned
145+
WHERE pages.date = '${constants.currentMonth}' AND
146+
pages.client = technologies_cleaned.client AND
147+
pages.page = technologies_cleaned.page;
74148
`)

definitions/output/reports/cwv_tech_categories.js

+38-22
Original file line number | Diff line number | Diff line change
@@ -7,53 +7,69 @@ publish('cwv_tech_categories', {
77
}).query(ctx => `
88
/* {"dataform_trigger": "report_cwv_tech_complete", "name": "categories", "type": "dict"} */
99
WITH pages AS (
10-
SELECT
10+
SELECT DISTINCT
11+
client,
1112
root_page,
1213
technologies
1314
FROM ${ctx.ref('crawl', 'pages')}
1415
WHERE
15-
date = '${pastMonth}' AND
16-
client = 'mobile'
16+
date = '${pastMonth}'
1717
${constants.devRankFilter}
18-
), categories AS (
18+
),
19+
20+
category_descriptions AS (
1921
SELECT
2022
name AS category,
2123
description
2224
FROM ${ctx.ref('wappalyzer', 'categories')}
23-
), category_stats AS (
25+
),
26+
27+
category_stats AS (
2428
SELECT
2529
category,
26-
COUNT(DISTINCT root_page) AS origins
27-
FROM pages,
28-
UNNEST(technologies) AS t,
29-
UNNEST(t.categories) AS category
30+
STRUCT(
31+
COALESCE(MAX(IF(client = 'desktop', origins, 0))) AS desktop,
32+
COALESCE(MAX(IF(client = 'mobile', origins, 0))) AS mobile
33+
) AS origins
34+
FROM (
35+
SELECT
36+
client,
37+
category,
38+
COUNT(DISTINCT root_page) AS origins
39+
FROM pages
40+
LEFT JOIN pages.technologies AS tech
41+
LEFT JOIN tech.categories AS category
42+
GROUP BY
43+
client,
44+
category
45+
)
3046
GROUP BY category
31-
), technology_stats AS (
47+
),
48+
49+
technology_stats AS (
3250
SELECT
33-
category,
3451
technology,
35-
COUNT(DISTINCT root_page) AS origins
36-
FROM pages,
37-
UNNEST(technologies) AS t,
38-
UNNEST(t.categories) AS category
52+
category_obj AS categories,
53+
SUM(origins) AS total_origins
54+
FROM ${ctx.ref('reports', 'cwv_tech_technologies')}
3955
GROUP BY
40-
category,
41-
technology
56+
technology,
57+
categories
4258
)
4359
4460
SELECT
4561
category,
4662
description,
47-
category_stats.origins,
48-
ARRAY_AGG(technology IGNORE NULLS ORDER BY technology_stats.origins DESC) AS technologies
63+
origins,
64+
ARRAY_AGG(technology IGNORE NULLS ORDER BY technology_stats.total_origins DESC) AS technologies
4965
FROM category_stats
5066
INNER JOIN technology_stats
51-
USING (category)
52-
LEFT JOIN categories
67+
ON category_stats.category IN UNNEST(technology_stats.categories)
68+
INNER JOIN category_descriptions
5369
USING (category)
5470
GROUP BY
5571
category,
5672
description,
5773
origins
58-
ORDER BY origins DESC
74+
ORDER BY category ASC
5975
`)

definitions/output/reports/cwv_tech_technologies.js

+63-9
Original file line number | Diff line number | Diff line change
@@ -6,19 +6,73 @@ publish('cwv_tech_technologies', {
66
tags: ['crux_ready']
77
}).query(ctx => `
88
/* {"dataform_trigger": "report_cwv_tech_complete", "name": "technologies", "type": "dict"} */
9+
WITH pages AS (
10+
SELECT DISTINCT
11+
client,
12+
root_page,
13+
tech.technology
14+
FROM ${ctx.ref('crawl', 'pages')},
15+
UNNEST(technologies) AS tech
16+
WHERE
17+
date = '${pastMonth}'
18+
${constants.devRankFilter}
19+
),
20+
21+
tech_origins AS (
22+
SELECT
23+
client,
24+
technology,
25+
COUNT(DISTINCT root_page) AS origins
26+
FROM pages
27+
GROUP BY
28+
client,
29+
technology
30+
),
31+
32+
technologies AS (
33+
SELECT
34+
name AS technology,
35+
description,
36+
STRING_AGG(DISTINCT category, ', ' ORDER BY category ASC) AS category,
37+
categories AS category_obj,
38+
NULL AS similar_technologies
39+
FROM ${ctx.ref('wappalyzer', 'technologies')},
40+
UNNEST(categories) AS category
41+
GROUP BY
42+
technology,
43+
description,
44+
categories
45+
),
46+
47+
total_pages AS (
48+
SELECT
49+
client,
50+
COUNT(DISTINCT root_page) AS origins
51+
FROM pages
52+
GROUP BY client
53+
)
54+
955
SELECT
1056
client,
11-
app AS technology,
57+
technology,
1258
description,
1359
category,
14-
SPLIT(category, ",") AS category_obj,
60+
category_obj,
61+
similar_technologies,
62+
origins
63+
FROM tech_origins
64+
INNER JOIN technologies
65+
USING(technology)
66+
67+
UNION ALL
68+
69+
SELECT
70+
client,
71+
'ALL' AS technology,
72+
NULL AS description,
73+
NULL AS category,
74+
NULL AS category_obj,
1575
NULL AS similar_technologies,
1676
origins
17-
FROM ${ctx.ref('core_web_vitals', 'technologies')}
18-
LEFT JOIN ${ctx.ref('wappalyzer', 'technologies')}
19-
ON app = name
20-
WHERE date = '${pastMonth}' AND
21-
geo = 'ALL' AND
22-
rank = 'ALL'
23-
ORDER BY origins DESC
77+
FROM total_pages
2478
`)

0 commit comments

Comments
 (0)