diff --git a/definitions/output/wappalyzer/tech_detections.js b/definitions/output/wappalyzer/tech_detections.js index 93d5428..4a2a0d8 100644 --- a/definitions/output/wappalyzer/tech_detections.js +++ b/definitions/output/wappalyzer/tech_detections.js @@ -1,127 +1,144 @@ const pastMonth = constants.fnPastMonth(constants.currentMonth) publish('tech_detections', { - type: 'table', - description: 'Used in dashboard: https://lookerstudio.google.com/u/7/reporting/1jh_ScPlCIbSYTf2r2Y6EftqmX9SQy4Gn/page/p_an38lbzywc/edit', + description: 'Used in dashboard: https://lookerstudio.google.com/u/7/reporting/1jh_ScPlCIbSYTf2r2Y6EftqmX9SQy4Gn/page/p_an38lbzywc/edit', schema: 'wappalyzer', + type: 'incremental', + protected: true, + bigquery: { + partitionBy: 'date' + }, tags: ['crawl_complete'] -}).query(ctx => ` +}).preOps(ctx => ` +DELETE FROM ${ctx.self()} +WHERE date = '${constants.currentMonth}'; +`).query(ctx => ` WITH source AS ( SELECT DISTINCT date, - root_page AS page, + root_page AS origin, tech.technology FROM ${ctx.ref('crawl', 'pages')}, UNNEST(technologies) AS tech - WHERE date >= "${pastMonth}" ${constants.devRankFilter} + WHERE date IN ('${pastMonth}', '${constants.currentMonth}') ${constants.devRankFilter} ), --- Technology in the previous month (August) +-- Technology in the previous month tech_before AS ( SELECT - page, + origin, technology FROM source - WHERE date = "${pastMonth}" + WHERE date = '${pastMonth}' ), --- Technology in the current month (September) +-- Technology in the current month tech_current AS ( SELECT - page, + origin, technology FROM source - WHERE date = "${constants.currentMonth}" + WHERE date = '${constants.currentMonth}' ), --- Summary of technology and categories per page in the previous month +-- Summary of technology per origin in the previous month tech_before_summary AS ( SELECT technology, - COUNT(DISTINCT page) AS total_pages_before + COUNT(DISTINCT origin) AS total_origins_before FROM tech_before GROUP BY technology ), --- 
Pages that existed last month but introduced the technology in the current month -tech_introduced_existing_pages AS ( +-- origins that persisted across both months and adopted the technology in the current month +tech_adopted_existing_origins AS ( SELECT - tech_current.technology, - COUNT(DISTINCT tech_current.page) AS total_pages_introduced_existing, - STRING_AGG(DISTINCT tech_current.page LIMIT 5) AS sample_pages_introduced_existing - FROM tech_current - JOIN tech_before - USING (page) + persisted_origins.technology, + COUNT(DISTINCT persisted_origins.origin) AS total_origins_adopted_existing, + STRING_AGG(DISTINCT persisted_origins.origin LIMIT 5) AS sample_origins_adopted_existing + FROM ( + SELECT DISTINCT + tech_current.technology, + tech_current.origin + FROM tech_before + JOIN tech_current + USING (origin) + ) AS persisted_origins LEFT JOIN tech_before AS tb - ON tech_current.page = tb.page AND tech_current.technology = tb.technology - WHERE tb.page IS NULL -- Technology was not detected last month - GROUP BY tech_current.technology + ON persisted_origins.origin = tb.origin AND persisted_origins.technology = tb.technology + WHERE tb.origin IS NULL -- Technology was not detected last month + GROUP BY 1 ), --- Pages that were not in the dataset last month but appeared this month with the technology -tech_introduced_new_pages AS ( +-- origins that arrived to CrUX in the current month and their detected technologies +tech_adopted_new_origins AS ( SELECT tech_current.technology, - COUNT(DISTINCT tech_current.page) AS total_pages_introduced_new, - STRING_AGG(DISTINCT tech_current.page LIMIT 5) AS sample_pages_introduced_new + COUNT(DISTINCT tech_current.origin) AS total_origins_adopted_new, + --STRING_AGG(DISTINCT tech_current.origin LIMIT 5) AS sample_origins_adopted_new FROM tech_current LEFT JOIN tech_before - USING (page) - WHERE tech_before.page IS NULL -- Page was not present last month - GROUP BY tech_current.technology + USING (origin) + WHERE 
tech_before.origin IS NULL -- origin was not present last month + GROUP BY 1 ), --- Pages that existed this month but no longer have the technology -tech_deprecated_existing_pages AS ( +-- origins that persisted across both months and deprecated the technology usage in the current month +tech_deprecated_existing_origins AS ( SELECT - tech_before.technology, - COUNT(DISTINCT tech_before.page) AS total_pages_deprecated_existing, - STRING_AGG(DISTINCT tech_before.page LIMIT 5) AS sample_pages_deprecated_existing - FROM tech_before - JOIN tech_current - USING (page) + persisted_origins.technology, + COUNT(DISTINCT persisted_origins.origin) AS total_origins_deprecated_existing, + STRING_AGG(DISTINCT persisted_origins.origin LIMIT 5) AS sample_origins_deprecated_existing + FROM ( + SELECT DISTINCT + tech_before.technology, + tech_before.origin + FROM tech_before + JOIN tech_current + USING (origin) + ) AS persisted_origins LEFT JOIN tech_current AS tc - ON tech_before.page = tc.page AND tech_before.technology = tc.technology - WHERE tc.page IS NULL -- Technology is not detected in the current month - GROUP BY tech_before.technology + ON persisted_origins.origin = tc.origin AND persisted_origins.technology = tc.technology + WHERE tc.origin IS NULL -- Technology is not detected in the current month + GROUP BY 1 ), --- Pages that no longer exist in the current dataset -tech_deprecated_gone_pages AS ( +-- origins that were dropped from CrUX in the current dataset, and thus the technology was not detected anymore +tech_deprecated_gone_origins AS ( SELECT tech_before.technology, - COUNT(DISTINCT tech_before.page) AS total_pages_deprecated_gone, - STRING_AGG(DISTINCT tech_before.page LIMIT 5) AS sample_pages_deprecated_gone + COUNT(DISTINCT tech_before.origin) AS total_origins_deprecated_gone, + --STRING_AGG(DISTINCT tech_before.origin LIMIT 5) AS sample_origins_deprecated_gone FROM tech_before LEFT JOIN tech_current - USING (page) - WHERE tech_current.page IS NULL -- Page no 
longer exists in current dataset - GROUP BY tech_before.technology + USING (origin) + WHERE tech_current.origin IS NULL -- origin no longer exists in current dataset + GROUP BY 1 ) --- Final aggregation and comparison of technology adoption/deprecation metrics +-- aggregation of technology adoption/deprecation metrics; NOTE(review): every FULL OUTER JOIN below keys on before_summary.technology, which is NULL for technologies absent last month, so such a technology may yield more than one output row (TODO confirm the dashboard tolerates this) SELECT - COALESCE(before_summary.technology, tech_introduced_existing_pages.technology, tech_introduced_new_pages.technology, apps.name) AS technology, + DATE('${constants.currentMonth}') AS date, + COALESCE(before_summary.technology, tech_adopted_existing_origins.technology, tech_adopted_new_origins.technology, apps.name) AS technology, - -- Pages summary - 0-COALESCE(total_pages_deprecated_existing, 0) AS total_pages_deprecated_existing, - 0-COALESCE(total_pages_deprecated_gone, 0) AS total_pages_deprecated_gone, + -- origins summary + 0-COALESCE(total_origins_deprecated_existing, 0) AS total_origins_deprecated_existing, + 0-COALESCE(total_origins_deprecated_gone, 0) AS total_origins_deprecated_gone, - COALESCE(total_pages_before, 0) - COALESCE(total_pages_deprecated_existing, 0) - COALESCE(total_pages_deprecated_gone, 0) AS total_pages_persisted, + COALESCE(total_origins_before, 0) - COALESCE(total_origins_deprecated_existing, 0) - COALESCE(total_origins_deprecated_gone, 0) AS total_origins_persisted, - COALESCE(total_pages_introduced_existing, 0) AS total_pages_introduced_existing, - COALESCE(total_pages_introduced_new, 0) AS total_pages_introduced_new, + COALESCE(total_origins_adopted_existing, 0) AS total_origins_adopted_existing, + COALESCE(total_origins_adopted_new, 0) AS total_origins_adopted_new, - -- Sample pages - COALESCE(sample_pages_deprecated_existing, "") AS sample_pages_deprecated_existing, - COALESCE(sample_pages_deprecated_gone, "") AS sample_pages_deprecated_gone, + -- Sample origins + COALESCE(sample_origins_deprecated_existing, "") AS sample_origins_deprecated_existing, + --COALESCE(sample_origins_deprecated_gone, "") AS 
sample_origins_deprecated_gone, - COALESCE(tech_introduced_existing_pages.sample_pages_introduced_existing, "") AS sample_pages_introduced_existing, - COALESCE(tech_introduced_new_pages.sample_pages_introduced_new, "") AS sample_pages_introduced_new + COALESCE(tech_adopted_existing_origins.sample_origins_adopted_existing, "") AS sample_origins_adopted_existing, + --COALESCE(tech_adopted_new_origins.sample_origins_adopted_new, "") AS sample_origins_adopted_new FROM tech_before_summary before_summary -FULL OUTER JOIN tech_introduced_existing_pages - ON before_summary.technology = tech_introduced_existing_pages.technology -FULL OUTER JOIN tech_introduced_new_pages - ON before_summary.technology = tech_introduced_new_pages.technology -LEFT JOIN tech_deprecated_existing_pages - ON before_summary.technology = tech_deprecated_existing_pages.technology -LEFT JOIN tech_deprecated_gone_pages - ON before_summary.technology = tech_deprecated_gone_pages.technology +FULL OUTER JOIN tech_adopted_existing_origins + ON before_summary.technology = tech_adopted_existing_origins.technology +FULL OUTER JOIN tech_adopted_new_origins + ON before_summary.technology = tech_adopted_new_origins.technology +LEFT JOIN tech_deprecated_existing_origins + ON before_summary.technology = tech_deprecated_existing_origins.technology +LEFT JOIN tech_deprecated_gone_origins + ON before_summary.technology = tech_deprecated_gone_origins.technology FULL OUTER JOIN wappalyzer.apps ON before_summary.technology = apps.name -ORDER BY total_pages_persisted DESC `)