Modular book product table SQL

The-Academic-Observatory · Dec 13, 2023 · c4b65be · c4b65be
1 parent 5e6e417
commit c4b65be
Show file tree

Hide file tree

Showing 21 changed files with 1,033 additions and 1,055 deletions.
diff --git a/oaebu_workflows/google_analytics3_telescope/sql/bp_body_google_analytics3.sql.jinja2 b/oaebu_workflows/google_analytics3_telescope/sql/bp_body_google_analytics3.sql.jinja2
@@ -0,0 +1,172 @@
+-- Organises the Google Analytics 3 metrics for downstream queries: "book" rows are grouped by publication_id, release_date, publication_whole_or_part and publication_format, and each *_country column holds the per-country items summed by group_items_google_analytics3.
+-- The value of 'google_analytics3_table_id' is defined in the create_oaebu_book_product_table method, in onix_workflow.py; it points either to 'empty_google_analytics3' (defined above as an empty row) or to the name of the real data table in BigQuery.
+-- An empty row is selected because some partners will not have corresponding data to query; providing it keeps the downstream queries simple and means the resulting schema is the same across all publishers.
+-- 'ga3_views_field' is a Jinja2 template variable naming the column whose nested 'country' array is aggregated; its opening and closing brace pairs must be written with no space between the braces, otherwise Jinja2 does not substitute it.
+-- A group that does not match a column's format/part condition gets an array of (NULL name, NULL value) structs instead, which keeps both IF branches the same type.
+google_analytics3_grouped_metrics AS(
+    SELECT
+        publication_id,
+        release_date,
+        publication_whole_or_part,
+        publication_format,
+        IF (
+            publication_format = 'PDF'
+            AND publication_whole_or_part = 'whole',
+            group_items_google_analytics3(
+                ARRAY_CONCAT_AGG({{ ga3_views_field }}.country)
+            ),
+            ARRAY_AGG(
+                STRUCT(
+                    CAST(NULL as STRING) as name,
+                    CAST(null as INT64) as value
+                )
+            )
+        ) AS pdf_book_country,
+        IF (
+            publication_format = 'PDF'
+            AND publication_whole_or_part = 'part',
+            group_items_google_analytics3(
+                ARRAY_CONCAT_AGG({{ ga3_views_field }}.country)
+            ),
+            ARRAY_AGG(
+                STRUCT(
+                    CAST(NULL as STRING) as name,
+                    CAST(null as INT64) as value
+                )
+            )
+        ) AS pdf_chapter_country,
+        IF (
+            publication_format = 'HTML'
+            AND publication_whole_or_part = 'whole',
+            group_items_google_analytics3(
+                ARRAY_CONCAT_AGG({{ ga3_views_field }}.country)
+            ),
+            ARRAY_AGG(
+                STRUCT(
+                    CAST(NULL as STRING) as name,
+                    CAST(null as INT64) as value
+                )
+            )
+        ) AS html_book_country,
+        IF (
+            publication_format = 'HTML'
+            AND publication_whole_or_part = 'part',
+            group_items_google_analytics3(
+                ARRAY_CONCAT_AGG({{ ga3_views_field }}.country)
+            ),
+            ARRAY_AGG(
+                STRUCT(
+                    CAST(NULL as STRING) as name,
+                    CAST(null as INT64) as value
+                )
+            )
+        ) AS html_chapter_country,
+        IF (
+            publication_format = 'EPUB'
+            AND publication_whole_or_part = 'whole',
+            group_items_google_analytics3(
+                ARRAY_CONCAT_AGG({{ ga3_views_field }}.country)
+            ),
+            ARRAY_AGG(
+                STRUCT(
+                    CAST(NULL as STRING) as name,
+                    CAST(null as INT64) as value
+                )
+            )
+        ) AS epub_book_country,
+        IF (
+            publication_format = 'EPUB'
+            AND publication_whole_or_part = 'part',
+            group_items_google_analytics3(
+                ARRAY_CONCAT_AGG({{ ga3_views_field }}.country)
+            ),
+            ARRAY_AGG(
+                STRUCT(
+                    CAST(NULL as STRING) as name,
+                    CAST(null as INT64) as value
+                )
+            )
+        ) AS epub_chapter_country,
+        IF (
+            publication_format = 'MOBI'
+            AND publication_whole_or_part = 'whole',
+            group_items_google_analytics3(
+                ARRAY_CONCAT_AGG({{ ga3_views_field }}.country)
+            ),
+            ARRAY_AGG(
+                STRUCT(
+                    CAST(NULL as STRING) as name,
+                    CAST(null as INT64) as value
+                )
+            )
+        ) AS mobi_book_country,
+        IF (
+            publication_format = 'MOBI'
+            AND publication_whole_or_part = 'part',
+            group_items_google_analytics3(
+                ARRAY_CONCAT_AGG({{ ga3_views_field }}.country)
+            ),
+            ARRAY_AGG(
+                STRUCT(
+                    CAST(NULL as STRING) as name,
+                    CAST(null as INT64) as value
+                )
+            )
+        ) AS mobi_chapter_country,
+        IF (
+            publication_format IN ('PDF', 'HTML', 'EPUB', 'MOBI')
+            AND publication_whole_or_part IN ('whole', 'part'),
+            group_items_google_analytics3(
+                ARRAY_CONCAT_AGG({{ ga3_views_field }}.country)
+            ),
+            ARRAY_AGG(
+                STRUCT(
+                    CAST(NULL as STRING) as name,
+                    CAST(null as INT64) as value
+                )
+            )
+        ) AS downloads_total_country,
+        IF (
+            publication_whole_or_part = '(citation)',
+            group_items_google_analytics3(
+                ARRAY_CONCAT_AGG({{ ga3_views_field }}.country)
+            ),
+            ARRAY_AGG(
+                STRUCT(
+                    CAST(NULL as STRING) as name,
+                    CAST(null as INT64) as value
+                )
+            )
+        ) AS views_total_country,
+    FROM
+        `{{ google_analytics3_table_id }}`
+    WHERE
+        publication_type = "book"
+    GROUP BY
+        publication_id,
+        release_date,
+        publication_whole_or_part,
+        publication_format
+),
+-- Rolls the rows of google_analytics3_grouped_metrics up to one row per publication_id (exposed as ISBN13) and release_date.
+-- Each per-format/part country array is concatenated across the grouped rows and packed into a single 'metrics' struct.
+google_analytics3_metrics AS (
+    SELECT
+        grouped.publication_id AS ISBN13,
+        grouped.release_date,
+        STRUCT(
+            ARRAY_CONCAT_AGG(grouped.views_total_country) AS views_total_country,
+            -- Only the download total is re-grouped by country name; every other array is a plain concatenation.
+            group_items_google_analytics3(ARRAY_CONCAT_AGG(grouped.downloads_total_country)) AS downloads_total_country,
+            ARRAY_CONCAT_AGG(grouped.pdf_book_country) AS downloads_pdf_book_country,
+            ARRAY_CONCAT_AGG(grouped.pdf_chapter_country) AS downloads_pdf_chapter_country,
+            ARRAY_CONCAT_AGG(grouped.html_book_country) AS downloads_html_book_country,
+            ARRAY_CONCAT_AGG(grouped.html_chapter_country) AS downloads_html_chapter_country,
+            ARRAY_CONCAT_AGG(grouped.epub_book_country) AS downloads_epub_book_country,
+            ARRAY_CONCAT_AGG(grouped.epub_chapter_country) AS downloads_epub_chapter_country,
+            ARRAY_CONCAT_AGG(grouped.mobi_book_country) AS downloads_mobi_book_country,
+            ARRAY_CONCAT_AGG(grouped.mobi_chapter_country) AS downloads_mobi_chapter_country
+        ) AS metrics
+    FROM google_analytics3_grouped_metrics AS grouped
+    GROUP BY grouped.publication_id, grouped.release_date
+)
diff --git a/oaebu_workflows/google_analytics3_telescope/sql/bp_functions_google_analytics3.sql b/oaebu_workflows/google_analytics3_telescope/sql/bp_functions_google_analytics3.sql
@@ -0,0 +1,16 @@
+-- Collapses an array of (name, value) items into one entry per distinct name, summing the values that share it.
+-- Input:  items  ARRAY<STRUCT<name STRING, value INT64>>
+-- Output: ARRAY<STRUCT<name STRING, value INT64>> (both fields NULLABLE)
+CREATE TEMP FUNCTION group_items_google_analytics3(
+    items ARRAY<STRUCT<name STRING, value INT64>>
+) AS (
+    ARRAY(
+        SELECT AS STRUCT
+            item.name,
+            SUM(item.value) AS value
+        FROM
+            UNNEST(items) AS item
+        GROUP BY
+            item.name
+    )
+);
diff --git a/oaebu_workflows/google_books_telescope/sql/bp_body_google_books_sales.sql.jinja2 b/oaebu_workflows/google_books_telescope/sql/bp_body_google_books_sales.sql.jinja2
@@ -0,0 +1,35 @@
+-- Organises the metrics from Google Books sales for easier consumption by downstream queries (one row per ISBN13 and release_date).
+-- The value of 'google_books_sales_table_id' is defined in the create_oaebu_book_product_table method, in onix_workflow.py.
+-- It points either to 'empty_google_books_sales' (defined above as an empty row) or to the name of the real data table in BigQuery.
+-- An empty row is selected because some partners will not have corresponding data to query.
+-- Providing an empty row keeps the downstream queries simple and means the resulting schema is the same across all publishers.
+google_books_sales_metrics AS (
+    SELECT
+        Primary_ISBN AS ISBN13,
+        release_date,
+        STRUCT(
+            SUM(qty) AS qty,
+            -- Per-country quantities, summed per Country_of_Sale by group_items_google_books_sales.
+            group_items_google_books_sales(
+                ARRAY_AGG(STRUCT(Country_of_Sale, qty))
+            ) AS countries
+        ) AS metrics
+    FROM `{{ google_books_sales_table_id }}`
+    GROUP BY Primary_ISBN, release_date
+),
+-- Organises the metadata from Google Books sales for easier consumption by downstream queries (one row per ISBN13).
+-- The value of 'google_books_sales_table_id' is defined in the create_oaebu_book_product_table method, in onix_workflow.py.
+-- It points either to 'empty_google_books_sales' (defined above as an empty row) or to the name of the real data table in BigQuery.
+-- An empty row is selected because some partners will not have corresponding data to query.
+-- Providing an empty row keeps the downstream queries simple and means the resulting schema is the same across all publishers.
+google_books_sales_metadata AS (
+    SELECT
+        Primary_ISBN AS ISBN13,
+        -- MAX() collapses the values within each ISBN group to a single (greatest non-NULL) value.
+        MAX(Imprint_Name) AS Imprint_Name,
+        MAX(Title) AS Title,
+        MAX(Author) AS Author
+    FROM
+        `{{ google_books_sales_table_id }}`
+    GROUP BY Primary_ISBN
+)
diff --git a/oaebu_workflows/google_books_telescope/sql/bp_body_google_books_traffic.sql.jinja2 b/oaebu_workflows/google_books_telescope/sql/bp_body_google_books_traffic.sql.jinja2
@@ -0,0 +1,37 @@
+-- Organises the metrics from Google Books traffic for easier consumption by downstream queries (one row per ISBN13 and release_date).
+-- The value of 'google_books_traffic_table_id' is defined in the create_oaebu_book_product_table method, in onix_workflow.py.
+-- It points either to 'empty_google_books_traffic' (defined above as an empty row) or to the name of the real data table in BigQuery.
+-- An empty row is selected because some partners will not have corresponding data to query.
+-- Providing an empty row keeps the downstream queries simple and means the resulting schema is the same across all publishers.
+google_books_traffic_metrics AS (
+    SELECT
+        Primary_ISBN AS ISBN13,
+        release_date,
+        STRUCT(
+            SUM(Book_Visits_BV_) AS Book_Visits_BV_,
+            SUM(BV_with_Pages_Viewed) AS BV_with_Pages_Viewed,
+            SUM(Non_Unique_Buy_Clicks) AS Non_Unique_Buy_Clicks,
+            SUM(BV_with_Buy_Clicks) AS BV_with_Buy_Clicks,
+            -- NOTE(review): Buy_Link_CTR looks like a rate by its name, yet it is summed like the counts - confirm this is intended.
+            SUM(Buy_Link_CTR) AS Buy_Link_CTR,
+            SUM(Pages_Viewed) AS Pages_Viewed
+        ) AS metrics
+    FROM
+        `{{ google_books_traffic_table_id }}`
+    GROUP BY
+        Primary_ISBN, release_date
+),
+-- Organises the metadata from Google Books traffic for easier consumption by downstream queries (one row per ISBN13).
+-- The value of 'google_books_traffic_table_id' is defined in the create_oaebu_book_product_table method, in onix_workflow.py.
+-- It points either to 'empty_google_books_traffic' (defined above as an empty row) or to the name of the real data table in BigQuery.
+-- An empty row is selected because some partners will not have corresponding data to query.
+-- Providing an empty row keeps the downstream queries simple and means the resulting schema is the same across all publishers.
+google_books_traffic_metadata AS (
+    SELECT
+        Primary_ISBN AS ISBN13,
+        -- MAX() collapses the titles within each ISBN group to a single (greatest non-NULL) value.
+        MAX(title) AS Title
+    FROM
+        `{{ google_books_traffic_table_id }}`
+    GROUP BY Primary_ISBN
+)
diff --git a/oaebu_workflows/google_books_telescope/sql/bp_functions_google_books_sales.sql b/oaebu_workflows/google_books_telescope/sql/bp_functions_google_books_sales.sql
@@ -0,0 +1,19 @@
+-- Collapses an array of (Country_of_Sale, qty) items into one entry per distinct Country_of_Sale,
+-- summing qty across the items that share it.
+-- Input:  items  ARRAY<STRUCT<Country_of_Sale STRING, qty INT64>>
+-- Output Schema:
+-- Country_of_Sale      STRING    NULLABLE
+-- qty                  INTEGER   NULLABLE
+CREATE TEMP FUNCTION group_items_google_books_sales(
+    items ARRAY<STRUCT<Country_of_Sale STRING, qty INT64>>
+) AS (
+    ARRAY(
+        SELECT AS STRUCT
+            sale.Country_of_Sale,
+            SUM(sale.qty) AS qty
+        FROM
+            UNNEST(items) AS sale
+        GROUP BY
+            sale.Country_of_Sale
+    )
+);
diff --git a/oaebu_workflows/irus_fulcrum_telescope/sql/bp_body_irus_fulcrum.sql.jinja2 b/oaebu_workflows/irus_fulcrum_telescope/sql/bp_body_irus_fulcrum.sql.jinja2
@@ -0,0 +1,37 @@
+-- Organises the metrics from IRUS Fulcrum for easier consumption by downstream queries (one row per ISBN13 and release_date).
+-- The value of 'irus_fulcrum_table_id' is defined in the create_oaebu_book_product_table method, in onix_workflow.py.
+-- It points either to 'empty_irus_fulcrum' (defined above as an empty row) or to the name of the real data table in BigQuery.
+-- An empty row is selected because some partners will not have corresponding data to query.
+-- Providing an empty row keeps the downstream queries simple and means the resulting schema is the same across all publishers.
+irus_fulcrum_metrics AS (
+    SELECT
+        ISBN AS ISBN13,
+        release_date,
+        STRUCT(
+            SUM(total_item_investigations) AS total_item_investigations,
+            SUM(total_item_requests) AS total_item_requests,
+            SUM(unique_item_investigations) AS unique_item_investigations,
+            SUM(unique_item_requests) AS unique_item_requests,
+            -- The nested per-country arrays are concatenated, then merged per country name.
+            group_items_irus_fulcrum_country(
+                ARRAY_CONCAT_AGG(country)
+            ) AS country
+        ) AS metrics
+    FROM `{{ irus_fulcrum_table_id }}`
+    GROUP BY ISBN, release_date
+),
+-- Organises the metadata from IRUS Fulcrum for easier consumption by downstream queries (one row per ISBN13).
+-- The value of 'irus_fulcrum_table_id' is defined in the create_oaebu_book_product_table method, in onix_workflow.py.
+-- It points either to 'empty_irus_fulcrum' (defined above as an empty row) or to the name of the real data table in BigQuery.
+-- An empty row is selected because some partners will not have corresponding data to query.
+-- Providing an empty row keeps the downstream queries simple and means the resulting schema is the same across all publishers.
+irus_fulcrum_metadata AS (
+    SELECT
+        ISBN AS ISBN13,
+        -- MAX() collapses the values within each ISBN group to a single (greatest non-NULL) value.
+        MAX(book_title) AS book_title,
+        MAX(publisher) AS publisher
+    FROM
+        `{{ irus_fulcrum_table_id }}`
+    GROUP BY ISBN
+)
diff --git a/oaebu_workflows/irus_fulcrum_telescope/sql/bp_functions_irus_fulcrum.sql b/oaebu_workflows/irus_fulcrum_telescope/sql/bp_functions_irus_fulcrum.sql
@@ -0,0 +1,31 @@
+-- Collapses an array of per-country items into one entry per distinct name.
+-- code takes the MAX within each name group; every counter field is summed.
+-- Output Schema:
+-- name                            STRING    NULLABLE
+-- code                            STRING    NULLABLE
+-- total_item_investigations       INTEGER   NULLABLE
+-- total_item_requests             INTEGER   NULLABLE
+-- unique_item_investigations      INTEGER   NULLABLE
+-- unique_item_requests            INTEGER   NULLABLE
+CREATE TEMP FUNCTION group_items_irus_fulcrum_country(
+    items ARRAY<STRUCT<
+        name STRING,
+        code STRING,
+        total_item_investigations INT64,
+        total_item_requests INT64,
+        unique_item_investigations INT64,
+        unique_item_requests INT64
+    >>
+) AS (
+    ARRAY(
+        SELECT AS STRUCT
+            entry.name,
+            MAX(entry.code) AS code,
+            SUM(entry.total_item_investigations) AS total_item_investigations,
+            SUM(entry.total_item_requests) AS total_item_requests,
+            SUM(entry.unique_item_investigations) AS unique_item_investigations,
+            SUM(entry.unique_item_requests) AS unique_item_requests
+        FROM UNNEST(items) AS entry
+        GROUP BY entry.name
+    )
+);