First pass modular SQL

The-Academic-Observatory · Dec 18, 2023 · 89b1bad · 89b1bad
1 parent 3305b70
commit 89b1bad
Show file tree

Hide file tree

Showing 40 changed files with 231 additions and 304 deletions.
diff --git a/..._workflows/google_analytics3_telescope/sql/book_product_body_google_analytics3.sql.jinja2 b/..._workflows/google_analytics3_telescope/sql/book_product_body_google_analytics3.sql.jinja2
@@ -1,8 +1,5 @@
--- The purpose of this block of SQL is to organise the metrics from google analytics for easier consumption of downstream queries.
--- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'google_analytics3_table_id'.
--- This will either point to 'empty_google_analytics3' (defined above as an empty row) or the name of the real data table in bigquery.
--- The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query.
--- Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same.
+# The purpose of this block of SQL is to organise the metrics from google analytics for easier consumption of downstream queries.
+# Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'google_analytics3_table_id'.
 google_analytics3_grouped_metrics AS(
     SELECT
         publication_id,
@@ -148,6 +145,7 @@ google_analytics3_grouped_metrics AS(
         publication_whole_or_part,
         publication_format
 ),
+
 google_analytics3_metrics AS (
     SELECT
         publication_id AS ISBN13,

diff --git a/oaebu_workflows/google_analytics3_telescope/sql/export_author_struct_google_analytics3.sql b/oaebu_workflows/google_analytics3_telescope/sql/export_author_struct_google_analytics3.sql
@@ -35,4 +35,4 @@ STRUCT(
             month.google_analytics.downloads_mobi_book_country
         )
     ) as downloads_mobi_book
-) as google_analytics,
+) as google_analytics
diff --git a/...orkflows/google_analytics3_telescope/sql/export_country_body_google_analytics3.sql.jinja2 b/...orkflows/google_analytics3_telescope/sql/export_country_body_google_analytics3.sql.jinja2
@@ -157,4 +157,4 @@ google_analytics_downloads_mobi_book_month_country as (
                 UNNEST(google_analytics.downloads_mobi_book_country)
         ) as google
         LEFT JOIN `{{ country_table_id }}` as country on country.google_analytics_name = google.country_name
-),
+)
diff --git a/oaebu_workflows/google_analytics3_telescope/sql/month_metrics_sum_google_analytics3.sql b/oaebu_workflows/google_analytics3_telescope/sql/month_metrics_sum_google_analytics3.sql
@@ -0,0 +1,38 @@
+STRUCT(
+    group_counts(
+        ARRAY_CONCAT_AGG(month.google_analytics.views_total_country)
+    ) as page_views,
+    group_counts(
+        ARRAY_CONCAT_AGG(month.google_analytics.downloads_total_country)
+    ) as downloads,
+    group_counts(
+        ARRAY_CONCAT_AGG(
+            month.google_analytics.downloads_pdf_book_country
+        )
+    ) as downloads_pdf_book,
+    group_counts(
+        ARRAY_CONCAT_AGG(
+            month.google_analytics.downloads_pdf_chapter_country
+        )
+    ) as downloads_pdf_chapter,
+    group_counts(
+        ARRAY_CONCAT_AGG(
+            month.google_analytics.downloads_html_chapter_country
+        )
+    ) as downloads_html_chapter,
+    group_counts(
+        ARRAY_CONCAT_AGG(
+            month.google_analytics.downloads_epub_book_country
+        )
+    ) as downloads_epub_book,
+    group_counts(
+        ARRAY_CONCAT_AGG(
+            month.google_analytics.downloads_epub_chapter_country
+        )
+    ) as downloads_epub_chapter,
+    group_counts(
+        ARRAY_CONCAT_AGG(
+            month.google_analytics.downloads_mobi_book_country
+        )
+    ) as downloads_mobi_book
+) as google_analytics
diff --git a/oaebu_workflows/google_books_telescope/sql/book_product_body_google_books_sales.sql.jinja2 b/oaebu_workflows/google_books_telescope/sql/book_product_body_google_books_sales.sql.jinja2
@@ -1,8 +1,8 @@
--- The purpose of this block of SQL is to organise the metrics from google book sales for easier consumption of downstream queries.
--- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'google_books_sales_table_id'.
--- This will either point to 'empty_google_books_sales' (defined above as an empty row) or the name of the real data table in bigquery.
--- The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query.
--- Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same.
+# The purpose of this block of SQL is to organise the metrics from google book sales for easier consumption of downstream queries.
+# Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'google_books_sales_table_id'.
+# This will either point to 'empty_google_books_sales' (defined above as an empty row) or the name of the real data table in bigquery.
+# The reason for selecting an empty row is that some partners will not have corresponding data to query.
+# Providing an empty row simplifies the downstream queries and also means the resulting schema across all publishers is the same.
 google_books_sales_metrics as (
     SELECT
         Primary_ISBN as ISBN13,
@@ -17,11 +17,12 @@ google_books_sales_metrics as (
         Primary_ISBN,
         release_date
 ),
--- The purpose of this block of SQL is to organise the Metadata from google book sales for easier consumption of downstream queries.
--- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'google_books_sales_table_id'.
--- This will either point to 'empty_google_books_sales' (defined above as an empty row) or the name of the real data table in bigquery.
--- The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query.
--- Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same.
+
+# The purpose of this block of SQL is to organise the Metadata from google book sales for easier consumption of downstream queries.
+# Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'google_books_sales_table_id'.
+# This will either point to 'empty_google_books_sales' (defined above as an empty row) or the name of the real data table in bigquery.
+# The reason for selecting an empty row is that some partners will not have corresponding data to query.
+# Providing an empty row simplifies the downstream queries and also means the resulting schema across all publishers is the same.
 google_books_sales_metadata as (
     SELECT
         Primary_ISBN as ISBN13,

diff --git a/oaebu_workflows/google_books_telescope/sql/book_product_body_google_books_traffic.sql.jinja2 b/oaebu_workflows/google_books_telescope/sql/book_product_body_google_books_traffic.sql.jinja2
@@ -1,8 +1,5 @@
--- The purpose of this block of SQL is to organise the metrics from google book traffic for easier consumption of downstream queries.
--- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'google_books_traffic_table_id'.
--- This will either point to 'empty_google_books_traffic' (defined above as an empty row) or the name of the real data table in bigquery.
--- The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query.
--- Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same.
+# The purpose of this block of SQL is to organise the metrics from google book traffic for easier consumption of downstream queries.
+# Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'google_books_traffic_table_id'.
 google_books_traffic_metrics as (
     SELECT
         Primary_ISBN as ISBN13,
@@ -21,11 +18,9 @@ google_books_traffic_metrics as (
         Primary_ISBN,
         release_date
 ),
--- The purpose of this block of SQL is to organise the Metadata from google book traffic for easier consumption of downstream queries.
--- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'google_books_traffic_table_id'.
--- This will either point to 'empty_google_books_traffic' (defined above as an empty row) or the name of the real data table in bigquery.
--- The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query.
--- Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same.
+
+# The purpose of this block of SQL is to organise the Metadata from google book traffic for easier consumption of downstream queries.
+# Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'google_books_traffic_table_id'.
 google_books_traffic_metadata as (
     SELECT
         Primary_ISBN as ISBN13,
@@ -34,4 +29,4 @@ google_books_traffic_metadata as (
         `{{ google_books_traffic_table_id }}`
     GROUP BY
         Primary_ISBN
-),
+)
diff --git a/oaebu_workflows/google_books_telescope/sql/export_country_body_google_books_sales.sql b/oaebu_workflows/google_books_telescope/sql/export_country_body_google_books_sales.sql
@@ -7,4 +7,4 @@ google_books_month_country as (
     FROM
         months,
         UNNEST(google_books_sales.countries)
-),
+)
diff --git a/...port_author_struct_google_books_sales.sql → .../month_metrics_sum_google_books_sales.sql b/...port_author_struct_google_books_sales.sql → .../month_metrics_sum_google_books_sales.sql
@@ -1,3 +1,3 @@
 STRUCT(
     SUM(month.google_books_sales.qty) as qty
-) as google_books_sales,
+) as google_books_sales
diff --git a/...rt_author_struct_google_books_traffic.sql → ...onth_metrics_sum_google_books_traffic.sql b/...rt_author_struct_google_books_traffic.sql → ...onth_metrics_sum_google_books_traffic.sql
@@ -5,4 +5,4 @@ STRUCT(
     SUM(month.google_books_traffic.BV_with_Buy_Clicks) as BV_with_Buy_Clicks,
     SUM(month.google_books_traffic.Buy_Link_CTR) as Buy_Link_CTR,
     SUM(month.google_books_traffic.Pages_Viewed) as Pages_Viewed
-) as google_books_traffic,
+) as google_books_traffic
diff --git a/oaebu_workflows/irus_fulcrum_telescope/sql/book_product_body_irus_fulcrum.sql.jinja2 b/oaebu_workflows/irus_fulcrum_telescope/sql/book_product_body_irus_fulcrum.sql.jinja2
@@ -1,8 +1,5 @@
--- The purpose of this block of SQL is to organise the metrics from IRUS Fulcrum for easier consumption of downstream queries.
--- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'irus_fulcrum_table_id'.
--- This will either point to 'empty_irus_fulcrum' (defined above as an empty row) or the name of the real data table in bigquery.
--- The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query.
--- Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same.
+# The purpose of this block of SQL is to organise the metrics from IRUS Fulcrum for easier consumption of downstream queries.
+# Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'irus_fulcrum_table_id'.
 irus_fulcrum_metrics as (
     SELECT
         ISBN as ISBN13,
@@ -20,11 +17,9 @@ irus_fulcrum_metrics as (
         ISBN,
         release_date
 ),
--- The purpose of this block of SQL is to organise the Metadata from IRUS OAPEN for easier consumption of downstream queries.
--- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'irus_fulcrum_table_id'.
--- This will either point to 'empty_irus_oapen' (defined above as an empty row) or the name of the real data table in bigquery.
--- The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query.
--- Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same.
+
+# The purpose of this block of SQL is to organise the Metadata from IRUS Fulcrum for easier consumption of downstream queries.
+# Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'irus_fulcrum_table_id'.
 irus_fulcrum_metadata as (
     SELECT
         ISBN as ISBN13,

diff --git a/oaebu_workflows/irus_fulcrum_telescope/sql/export_author_null_irus_fulcrum.sql b/oaebu_workflows/irus_fulcrum_telescope/sql/export_author_null_irus_fulcrum.sql
diff --git a/oaebu_workflows/irus_fulcrum_telescope/sql/export_book_metrics_irus_fulcrum.sql b/oaebu_workflows/irus_fulcrum_telescope/sql/export_book_metrics_irus_fulcrum.sql
@@ -3,4 +3,4 @@ STRUCT(
     month.irus_fulcrum.total_item_requests,
     month.irus_fulcrum.unique_item_investigations,
     month.irus_fulcrum.unique_item_requests
-) AS irus_fulcrum,
+) AS irus_fulcrum
diff --git a/oaebu_workflows/irus_fulcrum_telescope/sql/export_country_body_irus_fulcrum.sql b/oaebu_workflows/irus_fulcrum_telescope/sql/export_country_body_irus_fulcrum.sql
@@ -10,4 +10,4 @@ irus_fulcrum_month_country as (
     FROM
         months,
         UNNEST(irus_fulcrum.country)
-),
+)
diff --git a/oaebu_workflows/irus_fulcrum_telescope/sql/export_country_null_irus_fulcrum.sql b/oaebu_workflows/irus_fulcrum_telescope/sql/export_country_null_irus_fulcrum.sql
@@ -0,0 +1 @@
+irus_fulcrum.total_item_requests IS NOT NULL
diff --git a/...sql/export_author_struct_irus_fulcrum.sql → ...pe/sql/month_metrics_sum_irus_fulcrum.sql b/...sql/export_author_struct_irus_fulcrum.sql → ...pe/sql/month_metrics_sum_irus_fulcrum.sql
@@ -3,4 +3,4 @@ STRUCT(
     SUM(month.irus_fulcrum.total_item_requests) as total_item_requests,
     SUM(month.irus_fulcrum.unique_item_investigations) as unique_item_investigations,
     SUM(month.irus_fulcrum.unique_item_requests) as unique_item_requests
-) as irus_fulcrum,
+) as irus_fulcrum
diff --git a/oaebu_workflows/irus_fulcrum_telescope/sql/month_null_irus_fulcrum.sql b/oaebu_workflows/irus_fulcrum_telescope/sql/month_null_irus_fulcrum.sql
@@ -1 +1 @@
-irus_fulcrum.total_item_requests IS NOT NULL
+month.irus_fulcrum IS NOT NULL
diff --git a/oaebu_workflows/irus_oapen_telescope/sql/book_product_body_irus_oapen.sql.jinja2 b/oaebu_workflows/irus_oapen_telescope/sql/book_product_body_irus_oapen.sql.jinja2
@@ -1,8 +1,5 @@
--- The purpose of this block of SQL is to organise the metrics from IRUS OAPEN for easier consumption of downstream queries.
--- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'oapen_table_id'.
--- This will either point to 'empty_irus_oapen' (defined above as an empty row) or the name of the real data table in bigquery.
--- The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query.
--- Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same.
+# The purpose of this block of SQL is to organise the metrics from IRUS OAPEN for easier consumption of downstream queries.
+# Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'irus_oapen_table_id'.
 irus_oapen_metrics as (
     SELECT
         ISBN as ISBN13,
@@ -23,11 +20,9 @@ irus_oapen_metrics as (
         ISBN,
         release_date
 ),
--- The purpose of this block of SQL is to organise the Metadata from IRUS OAPEN for easier consumption of downstream queries.
--- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'irus_oapen_table_id'.
--- This will either point to 'empty_irus_oapen' (defined above as an empty row) or the name of the real data table in bigquery.
--- The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query.
--- Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same.
+
+# The purpose of this block of SQL is to organise the Metadata from IRUS OAPEN for easier consumption of downstream queries.
+# Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'irus_oapen_table_id'.
 irus_oapen_metadata as (
     SELECT
         ISBN as ISBN13,
@@ -37,4 +32,4 @@ irus_oapen_metadata as (
         `{{ irus_oapen_table_id }}`
     GROUP BY
         ISBN
-),
+)
diff --git a/oaebu_workflows/irus_oapen_telescope/sql/export_author_null_irus_oapen.sql b/oaebu_workflows/irus_oapen_telescope/sql/export_author_null_irus_oapen.sql
diff --git a/oaebu_workflows/irus_oapen_telescope/sql/export_city_irus_oapen.sql b/oaebu_workflows/irus_oapen_telescope/sql/export_city_irus_oapen.sql
diff --git a/oaebu_workflows/irus_oapen_telescope/sql/export_country_body_irus_oapen.sql b/oaebu_workflows/irus_oapen_telescope/sql/export_country_body_irus_oapen.sql
@@ -11,4 +11,4 @@ irus_oapen_month_country as (
     FROM
         months,
         UNNEST(irus_oapen.country)
-),
+)
diff --git a/oaebu_workflows/irus_oapen_telescope/sql/export_country_null_irus_oapen.sql b/oaebu_workflows/irus_oapen_telescope/sql/export_country_null_irus_oapen.sql
@@ -0,0 +1,2 @@
+irus_oapen.title_requests IS NOT NULL
+OR irus_oapen.total_item_requests IS NOT NULL
diff --git a/...e/sql/export_author_struct_irus_oapen.sql → ...cope/sql/month_metrics_sum_irus_oapen.sql b/...e/sql/export_author_struct_irus_oapen.sql → ...cope/sql/month_metrics_sum_irus_oapen.sql
@@ -4,4 +4,4 @@ STRUCT(
     SUM(month.irus_oapen.total_item_requests) as total_item_requests,
     SUM(month.irus_oapen.unique_item_investigations) as unique_item_investigations,
     SUM(month.irus_oapen.unique_item_requests) as unique_item_requests
-) as irus_oapen,
+) as irus_oapen
diff --git a/oaebu_workflows/irus_oapen_telescope/sql/month_null_irus_oapen.sql b/oaebu_workflows/irus_oapen_telescope/sql/month_null_irus_oapen.sql
@@ -1,2 +1 @@
-irus_oapen.title_requests IS NOT NULL
-OR irus_oapen.total_item_requests IS NOT NULL
+month.irus_oapen IS NOT NULL
diff --git a/oaebu_workflows/jstor_telescope/sql/book_product_body_jstor_country.sql.jinja2 b/oaebu_workflows/jstor_telescope/sql/book_product_body_jstor_country.sql.jinja2
@@ -1,8 +1,5 @@
--- The purpose of this block of SQL is to organise the metrics from JSTOR country for easier consumption of downstream queries.
--- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'jstor_country_table_id'.
--- This will either point to 'empty_jstor_country' (defined above as an empty row) or the name of the real data table in bigquery.
--- The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query.
--- Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same.
+# The purpose of this block of SQL is to organise the metrics from JSTOR country for easier consumption of downstream queries.
+# Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'jstor_country_table_id'.
 jstor_country_metrics as (
     SELECT
         eISBN as ISBN13,
@@ -16,11 +13,9 @@ jstor_country_metrics as (
         eISBN,
         release_date
 ),
--- The purpose of this block of SQL is to organise the Metadata from JSTOR country for easier consumption of downstream queries.
--- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'jstor_country_table_id'.
--- This will either point to 'empty_jstor_country' (defined above as an empty row) or the name of the real data table in bigquery.
--- The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query.
--- Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same.
+
+# The purpose of this block of SQL is to organise the Metadata from JSTOR country for easier consumption of downstream queries.
+# Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'jstor_country_table_id'.
 jstor_country_metadata as (
     SELECT
         eISBN as ISBN13,
@@ -36,4 +31,4 @@ jstor_country_metadata as (
         `{{ jstor_country_table_id }}`
     GROUP BY
         eISBN
-),
+)
diff --git a/oaebu_workflows/jstor_telescope/sql/book_product_body_jstor_institution.sql.jinja2 b/oaebu_workflows/jstor_telescope/sql/book_product_body_jstor_institution.sql.jinja2
@@ -1,8 +1,5 @@
--- The purpose of this block of SQL is to organise the metrics from JSTOR institution for easier consumption of downstream queries.
--- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'jstor_institution_table_id'.
--- This will either point to 'empty_jstor_institution' (defined above as an empty row) or the name of the real data table in bigquery.
--- The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query.
--- Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same.
+# The purpose of this block of SQL is to organise the metrics from JSTOR institution for easier consumption of downstream queries.
+# Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'jstor_institution_table_id'.
 jstor_institution_metrics as (
     SELECT
         eISBN as ISBN13,
@@ -16,11 +13,9 @@ jstor_institution_metrics as (
         eISBN,
         release_date
 ),
--- The purpose of this block of SQL is to organise the Metadata from JSTOR institution for easier consumption of downstream queries.
--- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'jstor_institution_table_id'.
--- This will either point to 'empty_jstor_institution' (defined above as an empty row) or the name of the real data table in bigquery.
--- The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query.
--- Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same.
+
+# The purpose of this block of SQL is to organise the Metadata from JSTOR institution for easier consumption of downstream queries.
+# Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'jstor_institution_table_id'.
 jstor_institution_metadata as (
     SELECT
         eISBN as ISBN13,
@@ -36,4 +31,4 @@ jstor_institution_metadata as (
         `{{ jstor_institution_table_id }}`
     GROUP BY
         eISBN
-),
+)
diff --git a/...tor_telescope/sql/export_book_metrics.sql → ...sql/export_book_metrics_jstor_country.sql b/...tor_telescope/sql/export_book_metrics.sql → ...sql/export_book_metrics_jstor_country.sql
@@ -5,4 +5,4 @@ STRUCT(
         FROM
             UNNEST(month.jstor_country)
     ) AS Total_Item_Requests
-) AS jstor,
+) AS jstor
diff --git a/oaebu_workflows/jstor_telescope/sql/export_country_body_jstor_country.sql b/oaebu_workflows/jstor_telescope/sql/export_country_body_jstor_country.sql
@@ -8,4 +8,4 @@ jstor_month_country as (
     FROM
         months,
         UNNEST(jstor_country)
-),
+)
-Original file line number
+Diff line change
@@ Expand Up / @@ -7,4 +7,4 @@ google_books_month_country as ( @@
         FROM
             months,
             UNNEST(google_books_sales.countries)
-    ),
+    )
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		irus_fulcrum.total_item_requests IS NOT NULL
		month.irus_fulcrum IS NOT NULL
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		irus_oapen.title_requests IS NOT NULL
		OR irus_oapen.total_item_requests IS NOT NULL