Skip to content

Commit

Permalink
Modular book product table SQL
Browse files Browse the repository at this point in the history
  • Loading branch information
keegansmith21 committed Dec 13, 2023
1 parent 5e6e417 commit c4b65be
Show file tree
Hide file tree
Showing 21 changed files with 1,033 additions and 1,055 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
-- The purpose of this block of SQL is to organise the metrics from google analytics for easier consumption of downstream queries.
-- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'google_analytics3_table_id'.
-- This will either point to 'empty_google_analytics3' (defined above as an empty row) or the name of the real data table in bigquery.
-- The empty row is used because some partners will not have corresponding data to query.
-- Providing an empty row simplifies the downstream queries and also means the resulting schema is the same across all publishers.
google_analytics3_grouped_metrics AS (
    -- Buckets the Google Analytics 3 country metrics of books by publication format
    -- and by whole/part, producing one row per
    -- (publication_id, release_date, publication_whole_or_part, publication_format).
    --
    -- Every *_country column is built the same way:
    --   * when the row's grouping keys satisfy the column's condition, the .country
    --     arrays of the column named by the ga3_views_field template placeholder are
    --     concatenated and summed per name by group_items_google_analytics3;
    --   * otherwise the column holds placeholder structs (NULL name, NULL value),
    --     one per source row in the group, so both branches of the IF have the same
    --     array-of-struct type.
    --
    -- The ga3_views_field placeholder uses the same double-brace delimiter as the
    -- table id in the FROM clause so that it is substituted when the template is rendered.
    SELECT
        publication_id,
        release_date,
        publication_whole_or_part,
        publication_format,
        -- PDF downloads
        IF(
            publication_format = 'PDF' AND publication_whole_or_part = 'whole',
            group_items_google_analytics3(ARRAY_CONCAT_AGG({{ ga3_views_field }}.country)),
            ARRAY_AGG(STRUCT(CAST(NULL AS STRING) AS name, CAST(NULL AS INT64) AS value))
        ) AS pdf_book_country,
        IF(
            publication_format = 'PDF' AND publication_whole_or_part = 'part',
            group_items_google_analytics3(ARRAY_CONCAT_AGG({{ ga3_views_field }}.country)),
            ARRAY_AGG(STRUCT(CAST(NULL AS STRING) AS name, CAST(NULL AS INT64) AS value))
        ) AS pdf_chapter_country,
        -- HTML downloads
        IF(
            publication_format = 'HTML' AND publication_whole_or_part = 'whole',
            group_items_google_analytics3(ARRAY_CONCAT_AGG({{ ga3_views_field }}.country)),
            ARRAY_AGG(STRUCT(CAST(NULL AS STRING) AS name, CAST(NULL AS INT64) AS value))
        ) AS html_book_country,
        IF(
            publication_format = 'HTML' AND publication_whole_or_part = 'part',
            group_items_google_analytics3(ARRAY_CONCAT_AGG({{ ga3_views_field }}.country)),
            ARRAY_AGG(STRUCT(CAST(NULL AS STRING) AS name, CAST(NULL AS INT64) AS value))
        ) AS html_chapter_country,
        -- EPUB downloads
        IF(
            publication_format = 'EPUB' AND publication_whole_or_part = 'whole',
            group_items_google_analytics3(ARRAY_CONCAT_AGG({{ ga3_views_field }}.country)),
            ARRAY_AGG(STRUCT(CAST(NULL AS STRING) AS name, CAST(NULL AS INT64) AS value))
        ) AS epub_book_country,
        IF(
            publication_format = 'EPUB' AND publication_whole_or_part = 'part',
            group_items_google_analytics3(ARRAY_CONCAT_AGG({{ ga3_views_field }}.country)),
            ARRAY_AGG(STRUCT(CAST(NULL AS STRING) AS name, CAST(NULL AS INT64) AS value))
        ) AS epub_chapter_country,
        -- MOBI downloads
        IF(
            publication_format = 'MOBI' AND publication_whole_or_part = 'whole',
            group_items_google_analytics3(ARRAY_CONCAT_AGG({{ ga3_views_field }}.country)),
            ARRAY_AGG(STRUCT(CAST(NULL AS STRING) AS name, CAST(NULL AS INT64) AS value))
        ) AS mobi_book_country,
        IF(
            publication_format = 'MOBI' AND publication_whole_or_part = 'part',
            group_items_google_analytics3(ARRAY_CONCAT_AGG({{ ga3_views_field }}.country)),
            ARRAY_AGG(STRUCT(CAST(NULL AS STRING) AS name, CAST(NULL AS INT64) AS value))
        ) AS mobi_chapter_country,
        -- Any of the four formats above, whole or part
        IF(
            publication_format IN ('PDF', 'HTML', 'EPUB', 'MOBI')
            AND publication_whole_or_part IN ('whole', 'part'),
            group_items_google_analytics3(ARRAY_CONCAT_AGG({{ ga3_views_field }}.country)),
            ARRAY_AGG(STRUCT(CAST(NULL AS STRING) AS name, CAST(NULL AS INT64) AS value))
        ) AS downloads_total_country,
        -- Rows whose whole/part value is the literal '(citation)' are reported as views
        IF(
            publication_whole_or_part = '(citation)',
            group_items_google_analytics3(ARRAY_CONCAT_AGG({{ ga3_views_field }}.country)),
            ARRAY_AGG(STRUCT(CAST(NULL AS STRING) AS name, CAST(NULL AS INT64) AS value))
        ) AS views_total_country
    FROM
        `{{ google_analytics3_table_id }}`
    WHERE
        publication_type = "book"
    GROUP BY
        publication_id,
        release_date,
        publication_whole_or_part,
        publication_format
),
google_analytics3_metrics AS (
    -- Collapses google_analytics3_grouped_metrics to one row per
    -- (publication_id, release_date), exposing publication_id as ISBN13 and packing
    -- the per-country arrays into a single `metrics` struct.
    -- Only downloads_total_country is passed through group_items_google_analytics3
    -- again; every other array is the plain concatenation of the grouped rows' arrays.
    SELECT
        grouped.publication_id AS ISBN13,
        grouped.release_date,
        STRUCT(
            ARRAY_CONCAT_AGG(grouped.views_total_country) AS views_total_country,
            group_items_google_analytics3(
                ARRAY_CONCAT_AGG(grouped.downloads_total_country)
            ) AS downloads_total_country,
            ARRAY_CONCAT_AGG(grouped.pdf_book_country) AS downloads_pdf_book_country,
            ARRAY_CONCAT_AGG(grouped.pdf_chapter_country) AS downloads_pdf_chapter_country,
            ARRAY_CONCAT_AGG(grouped.html_book_country) AS downloads_html_book_country,
            ARRAY_CONCAT_AGG(grouped.html_chapter_country) AS downloads_html_chapter_country,
            ARRAY_CONCAT_AGG(grouped.epub_book_country) AS downloads_epub_book_country,
            ARRAY_CONCAT_AGG(grouped.epub_chapter_country) AS downloads_epub_chapter_country,
            ARRAY_CONCAT_AGG(grouped.mobi_book_country) AS downloads_mobi_book_country,
            ARRAY_CONCAT_AGG(grouped.mobi_chapter_country) AS downloads_mobi_chapter_country
        ) AS metrics
    FROM
        google_analytics3_grouped_metrics AS grouped
    GROUP BY
        grouped.publication_id,
        grouped.release_date
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
-- Output Schema:
-- name STRING NULLABLE
-- value INTEGER NULLABLE
-- Sums `value` per distinct `name` across the given array and returns one
-- (name, value) struct per name.
CREATE TEMP FUNCTION group_items_google_analytics3(items ARRAY<STRUCT<name STRING, value INT64>>) AS (
    ARRAY(
        SELECT AS STRUCT
            item.name AS name,
            SUM(item.value) AS value
        FROM
            UNNEST(items) AS item
        GROUP BY
            item.name
    )
);
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
-- The purpose of this block of SQL is to organise the metrics from google book sales for easier consumption of downstream queries.
-- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'google_books_sales_table_id'.
-- This will either point to 'empty_google_books_sales' (defined above as an empty row) or the name of the real data table in bigquery.
-- The empty row is used because some partners will not have corresponding data to query.
-- Providing an empty row simplifies the downstream queries and also means the resulting schema is the same across all publishers.
google_books_sales_metrics AS (
    -- One row per (Primary_ISBN, release_date): the total quantity plus the
    -- quantity per Country_of_Sale, packed into a `metrics` struct.
    SELECT
        Primary_ISBN AS ISBN13,
        release_date,
        STRUCT(
            SUM(qty) AS qty,
            group_items_google_books_sales(
                ARRAY_AGG(STRUCT(Country_of_Sale AS Country_of_Sale, qty AS qty))
            ) AS countries
        ) AS metrics
    FROM
        `{{ google_books_sales_table_id }}`
    GROUP BY
        Primary_ISBN,
        release_date
),
-- The purpose of this block of SQL is to organise the Metadata from google book sales for easier consumption of downstream queries.
-- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'google_books_sales_table_id'.
-- This will either point to 'empty_google_books_sales' (defined above as an empty row) or the name of the real data table in bigquery.
-- The empty row is used because some partners will not have corresponding data to query.
-- Providing an empty row simplifies the downstream queries and also means the resulting schema is the same across all publishers.
google_books_sales_metadata AS (
    -- One row per Primary_ISBN. MAX() picks a single Imprint_Name, Title and
    -- Author value for each ISBN from all of its rows.
    SELECT
        Primary_ISBN AS ISBN13,
        MAX(Imprint_Name) AS Imprint_Name,
        MAX(Title) AS Title,
        MAX(Author) AS Author
    FROM
        `{{ google_books_sales_table_id }}`
    GROUP BY
        Primary_ISBN
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
-- The purpose of this block of SQL is to organise the metrics from google book traffic for easier consumption of downstream queries.
-- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'google_books_traffic_table_id'.
-- This will either point to 'empty_google_books_traffic' (defined above as an empty row) or the name of the real data table in bigquery.
-- The empty row is used because some partners will not have corresponding data to query.
-- Providing an empty row simplifies the downstream queries and also means the resulting schema is the same across all publishers.
google_books_traffic_metrics AS (
    -- One row per (Primary_ISBN, release_date) with every traffic column summed
    -- over the group and packed into a `metrics` struct.
    -- NOTE(review): Buy_Link_CTR is summed like the count columns; if it is a
    -- rate, the sum may not be meaningful -- confirm with the data owner.
    SELECT
        Primary_ISBN AS ISBN13,
        release_date,
        STRUCT(
            SUM(Book_Visits_BV_) AS Book_Visits_BV_,
            SUM(BV_with_Pages_Viewed) AS BV_with_Pages_Viewed,
            SUM(Non_Unique_Buy_Clicks) AS Non_Unique_Buy_Clicks,
            SUM(BV_with_Buy_Clicks) AS BV_with_Buy_Clicks,
            SUM(Buy_Link_CTR) AS Buy_Link_CTR,
            SUM(Pages_Viewed) AS Pages_Viewed
        ) AS metrics
    FROM
        `{{ google_books_traffic_table_id }}`
    GROUP BY
        Primary_ISBN,
        release_date
),
-- The purpose of this block of SQL is to organise the Metadata from google book traffic for easier consumption of downstream queries.
-- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'google_books_traffic_table_id'.
-- This will either point to 'empty_google_books_traffic' (defined above as an empty row) or the name of the real data table in bigquery.
-- The empty row is used because some partners will not have corresponding data to query.
-- Providing an empty row simplifies the downstream queries and also means the resulting schema is the same across all publishers.
google_books_traffic_metadata AS (
    -- One row per Primary_ISBN. MAX() picks a single title for each ISBN
    -- from all of its rows.
    SELECT
        Primary_ISBN AS ISBN13,
        MAX(title) AS Title
    FROM
        `{{ google_books_traffic_table_id }}`
    GROUP BY
        Primary_ISBN
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
-- Output Schema:
-- Country_of_Sale STRING NULLABLE
-- qty INTEGER NULLABLE
-- Sums `qty` per distinct `Country_of_Sale` across the given array and returns
-- one (Country_of_Sale, qty) struct per country.
CREATE TEMP FUNCTION group_items_google_books_sales(
    items ARRAY<STRUCT<Country_of_Sale STRING, qty INT64>>
) AS (
    ARRAY(
        SELECT AS STRUCT
            item.Country_of_Sale AS Country_of_Sale,
            SUM(item.qty) AS qty
        FROM
            UNNEST(items) AS item
        GROUP BY
            item.Country_of_Sale
    )
);
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
-- The purpose of this block of SQL is to organise the metrics from IRUS Fulcrum for easier consumption of downstream queries.
-- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'irus_fulcrum_table_id'.
-- This will either point to 'empty_irus_fulcrum' (defined above as an empty row) or the name of the real data table in bigquery.
-- The empty row is used because some partners will not have corresponding data to query.
-- Providing an empty row simplifies the downstream queries and also means the resulting schema is the same across all publishers.
irus_fulcrum_metrics AS (
    -- One row per (ISBN, release_date): the four investigation/request counters
    -- summed over the group, plus the rows' `country` arrays concatenated and
    -- re-grouped by group_items_irus_fulcrum_country, all packed into `metrics`.
    SELECT
        ISBN AS ISBN13,
        release_date,
        STRUCT(
            SUM(total_item_investigations) AS total_item_investigations,
            SUM(total_item_requests) AS total_item_requests,
            SUM(unique_item_investigations) AS unique_item_investigations,
            SUM(unique_item_requests) AS unique_item_requests,
            group_items_irus_fulcrum_country(
                ARRAY_CONCAT_AGG(country)
            ) AS country
        ) AS metrics
    FROM
        `{{ irus_fulcrum_table_id }}`
    GROUP BY
        ISBN,
        release_date
),
-- The purpose of this block of SQL is to organise the metadata from IRUS Fulcrum for easier consumption of downstream queries.
-- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'irus_fulcrum_table_id'.
-- This will either point to 'empty_irus_fulcrum' (defined above as an empty row) or the name of the real data table in bigquery.
-- The empty row is used because some partners will not have corresponding data to query.
-- Providing an empty row simplifies the downstream queries and also means the resulting schema is the same across all publishers.
irus_fulcrum_metadata AS (
    -- One row per ISBN. MAX() picks a single book_title and publisher value
    -- for each ISBN from all of its rows.
    SELECT
        ISBN AS ISBN13,
        MAX(book_title) AS book_title,
        MAX(publisher) AS publisher
    FROM
        `{{ irus_fulcrum_table_id }}`
    GROUP BY
        ISBN
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
-- Output Schema:
-- name STRING NULLABLE
-- code STRING NULLABLE
-- total_item_investigations INTEGER NULLABLE
-- total_item_requests INTEGER NULLABLE
-- unique_item_investigations INTEGER NULLABLE
-- unique_item_requests INTEGER NULLABLE
-- Groups the given country structs by `name`: the four counters are summed per
-- name and MAX() picks a single `code` for each name.
CREATE TEMP FUNCTION group_items_irus_fulcrum_country(
    items ARRAY<
        STRUCT<
            name STRING,
            code STRING,
            total_item_investigations INT64,
            total_item_requests INT64,
            unique_item_investigations INT64,
            unique_item_requests INT64
        >
    >
) AS (
    ARRAY(
        SELECT AS STRUCT
            item.name AS name,
            MAX(item.code) AS code,
            SUM(item.total_item_investigations) AS total_item_investigations,
            SUM(item.total_item_requests) AS total_item_requests,
            SUM(item.unique_item_investigations) AS unique_item_investigations,
            SUM(item.unique_item_requests) AS unique_item_requests
        FROM
            UNNEST(items) AS item
        GROUP BY
            item.name
    )
);
Loading

0 comments on commit c4b65be

Please sign in to comment.