diff --git a/oaebu_workflows/google_analytics3_telescope/sql/bp_body_google_analytics3.sql.jinja2 b/oaebu_workflows/google_analytics3_telescope/sql/bp_body_google_analytics3.sql.jinja2 new file mode 100644 index 00000000..6f25e70e --- /dev/null +++ b/oaebu_workflows/google_analytics3_telescope/sql/bp_body_google_analytics3.sql.jinja2 @@ -0,0 +1,172 @@ +-- The purpose of this block of SQL is to organise the metrics from google analytics for easier consumption of downstream queries. +-- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'google_analytics3_table_id'. +-- This will either point to 'empty_google_analytics3' (defined above as an empty row) or the name of the real data table in bigquery. +-- The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. +-- Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. +google_analytics3_grouped_metrics AS( + SELECT + publication_id, + release_date, + publication_whole_or_part, + publication_format, + IF ( + publication_format = 'PDF' + AND publication_whole_or_part = 'whole', + group_items_google_analytics3( + ARRAY_CONCAT_AGG({ { ga3_views_field } }.country) + ), + ARRAY_AGG( + STRUCT( + CAST(NULL as STRING) as name, + CAST(null as INT64) as value + ) + ) + ) AS pdf_book_country, + IF ( + publication_format = 'PDF' + AND publication_whole_or_part = 'part', + group_items_google_analytics3( + ARRAY_CONCAT_AGG({ { ga3_views_field } }.country) + ), + ARRAY_AGG( + STRUCT( + CAST(NULL as STRING) as name, + CAST(null as INT64) as value + ) + ) + ) AS pdf_chapter_country, + IF ( + publication_format = 'HTML' + AND publication_whole_or_part = 'whole', + group_items_google_analytics3( + ARRAY_CONCAT_AGG({ { ga3_views_field } }.country) + ), + ARRAY_AGG( + STRUCT( + CAST(NULL as STRING) as name, + CAST(null as INT64) as value + ) + ) + ) AS html_book_country, + IF ( + publication_format = 'HTML' + AND publication_whole_or_part = 'part', + group_items_google_analytics3( + ARRAY_CONCAT_AGG({ { ga3_views_field } }.country) + ), + ARRAY_AGG( + STRUCT( + CAST(NULL as STRING) as name, + CAST(null as INT64) as value + ) + ) + ) AS html_chapter_country, + IF ( + publication_format = 'EPUB' + AND publication_whole_or_part = 'whole', + group_items_google_analytics3( + ARRAY_CONCAT_AGG({ { ga3_views_field } }.country) + ), + ARRAY_AGG( + STRUCT( + CAST(NULL as STRING) as name, + CAST(null as INT64) as value + ) + ) + ) AS epub_book_country, + IF ( + publication_format = 'EPUB' + AND publication_whole_or_part = 'part', + group_items_google_analytics3( + ARRAY_CONCAT_AGG({ { ga3_views_field } }.country) + ), + ARRAY_AGG( + STRUCT( + CAST(NULL as STRING) as name, + CAST(null as INT64) as value + ) + ) + ) AS epub_chapter_country, + IF ( + publication_format = 'MOBI' + AND publication_whole_or_part = 'whole', + group_items_google_analytics3( + ARRAY_CONCAT_AGG({ { ga3_views_field } }.country) + ), + ARRAY_AGG( + STRUCT( + CAST(NULL as STRING) as name, + CAST(null as INT64) as value + ) + ) + ) AS mobi_book_country, + IF ( + publication_format = 'MOBI' + AND publication_whole_or_part = 'part', + group_items_google_analytics3( + ARRAY_CONCAT_AGG({ { ga3_views_field } }.country) + ), + ARRAY_AGG( + STRUCT( + CAST(NULL as STRING) as name, + CAST(null as INT64) as value + ) + ) + ) AS mobi_chapter_country, + IF ( + publication_format IN ('PDF', 'HTML', 'EPUB', 'MOBI') + AND publication_whole_or_part IN ('whole', 'part'), + group_items_google_analytics3( + ARRAY_CONCAT_AGG({ { ga3_views_field } }.country) + ), + ARRAY_AGG( + STRUCT( + CAST(NULL as STRING) as name, + CAST(null as INT64) as value + ) + ) + ) AS downloads_total_country, + IF ( + publication_whole_or_part = '(citation)', + group_items_google_analytics3( + ARRAY_CONCAT_AGG({ { ga3_views_field } }.country) + ), + ARRAY_AGG( + STRUCT( + CAST(NULL as STRING) as name, + CAST(null as INT64) as value + ) + ) + ) AS views_total_country, + FROM + `{{ google_analytics3_table_id }}` + WHERE + publication_type = "book" + GROUP BY + publication_id, + release_date, + publication_whole_or_part, + publication_format +), +google_analytics3_metrics AS ( + SELECT + publication_id AS ISBN13, + release_date, + STRUCT( + ARRAY_CONCAT_AGG(views_total_country) AS views_total_country, + group_items_google_analytics3(ARRAY_CONCAT_AGG(downloads_total_country)) AS downloads_total_country, + ARRAY_CONCAT_AGG(pdf_book_country) AS downloads_pdf_book_country, + ARRAY_CONCAT_AGG(pdf_chapter_country) AS downloads_pdf_chapter_country, + ARRAY_CONCAT_AGG(html_book_country) AS downloads_html_book_country, + ARRAY_CONCAT_AGG(html_chapter_country) AS downloads_html_chapter_country, + ARRAY_CONCAT_AGG(epub_book_country) AS downloads_epub_book_country, + ARRAY_CONCAT_AGG(epub_chapter_country) AS downloads_epub_chapter_country, + ARRAY_CONCAT_AGG(mobi_book_country) AS downloads_mobi_book_country, + ARRAY_CONCAT_AGG(mobi_chapter_country) AS downloads_mobi_chapter_country + ) AS metrics + FROM + google_analytics3_grouped_metrics + GROUP BY + publication_id, + release_date +) \ No newline at end of file diff --git a/oaebu_workflows/google_analytics3_telescope/sql/bp_functions_google_analytics3.sql b/oaebu_workflows/google_analytics3_telescope/sql/bp_functions_google_analytics3.sql new file mode 100644 index 00000000..d1bdf587 --- /dev/null +++ b/oaebu_workflows/google_analytics3_telescope/sql/bp_functions_google_analytics3.sql @@ -0,0 +1,16 @@ +-- Output Schema: +-- name STRING NULLABLE +-- value INTEGER NULLABLE +CREATE TEMP FUNCTION group_items_google_analytics3(items ARRAY < STRUCT < name STRING, value INT64 > >) as ( + ARRAY( + ( + SELECT + AS STRUCT name, + SUM(value) as value, + FROM + UNNEST(items) + GROUP BY + name + ) + ) +); \ No newline at end of file diff --git a/oaebu_workflows/google_books_telescope/sql/bp_body_google_books_sales.sql.jinja2 b/oaebu_workflows/google_books_telescope/sql/bp_body_google_books_sales.sql.jinja2 new file mode 100644 index 00000000..225192e7 --- /dev/null +++ b/oaebu_workflows/google_books_telescope/sql/bp_body_google_books_sales.sql.jinja2 @@ -0,0 +1,35 @@ +-- The purpose of this block of SQL is to organise the metrics from google book sales for easier consumption of downstream queries. +-- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'google_books_sales_table_id'. +-- This will either point to 'empty_google_books_sales' (defined above as an empty row) or the name of the real data table in bigquery. +-- The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. +-- Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. +google_books_sales_metrics as ( + SELECT + Primary_ISBN as ISBN13, + release_date, + STRUCT( + SUM(qty) as qty, + group_items_google_books_sales(ARRAY_AGG(STRUCT(Country_of_Sale, qty))) as countries + ) as metrics + FROM + `{{ google_books_sales_table_id }}` + GROUP BY + Primary_ISBN, + release_date +), +-- The purpose of this block of SQL is to organise the Metadata from google book sales for easier consumption of downstream queries. +-- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'google_books_sales_table_id'. +-- This will either point to 'empty_google_books_sales' (defined above as an empty row) or the name of the real data table in bigquery. +-- The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. +-- Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. +google_books_sales_metadata as ( + SELECT + Primary_ISBN as ISBN13, + MAX(Imprint_Name) as Imprint_Name, + MAX(Title) as Title, + MAX(Author) as Author + FROM + `{{ google_books_sales_table_id }}` + GROUP BY + Primary_ISBN +) \ No newline at end of file diff --git a/oaebu_workflows/google_books_telescope/sql/bp_body_google_books_traffic.sql.jinja2 b/oaebu_workflows/google_books_telescope/sql/bp_body_google_books_traffic.sql.jinja2 new file mode 100644 index 00000000..12aaa075 --- /dev/null +++ b/oaebu_workflows/google_books_telescope/sql/bp_body_google_books_traffic.sql.jinja2 @@ -0,0 +1,37 @@ +-- The purpose of this block of SQL is to organise the metrics from google book traffic for easier consumption of downstream queries. +-- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'google_books_traffic_table_id'. +-- This will either point to 'empty_google_books_traffic' (defined above as an empty row) or the name of the real data table in bigquery. +-- The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. +-- Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. +google_books_traffic_metrics as ( + SELECT + Primary_ISBN as ISBN13, + release_date, + STRUCT( + SUM(Book_Visits_BV_) as Book_Visits_BV_, + SUM(BV_with_Pages_Viewed) as BV_with_Pages_Viewed, + SUM(Non_Unique_Buy_Clicks) as Non_Unique_Buy_Clicks, + SUM(BV_with_Buy_Clicks) as BV_with_Buy_Clicks, + SUM(Buy_Link_CTR) as Buy_Link_CTR, + SUM(Pages_Viewed) as Pages_Viewed + ) as metrics + FROM + `{{ google_books_traffic_table_id }}` + GROUP BY + Primary_ISBN, + release_date +), +-- The purpose of this block of SQL is to organise the Metadata from google book traffic for easier consumption of downstream queries. +-- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'google_books_traffic_table_id'. +-- This will either point to 'empty_google_books_traffic' (defined above as an empty row) or the name of the real data table in bigquery. +-- The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. +-- Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. +google_books_traffic_metadata as ( + SELECT + Primary_ISBN as ISBN13, + MAX(title) as Title + FROM + `{{ google_books_traffic_table_id }}` + GROUP BY + Primary_ISBN +) \ No newline at end of file diff --git a/oaebu_workflows/google_books_telescope/sql/bp_functions_google_books_sales.sql b/oaebu_workflows/google_books_telescope/sql/bp_functions_google_books_sales.sql new file mode 100644 index 00000000..a396de35 --- /dev/null +++ b/oaebu_workflows/google_books_telescope/sql/bp_functions_google_books_sales.sql @@ -0,0 +1,19 @@ +-- Output Schema: +-- Country_of_Sale STRING NULLABLE +-- qty INTEGER NULLABLE +CREATE TEMP FUNCTION group_items_google_books_sales( + items ARRAY < STRUCT < Country_of_Sale STRING, + qty INT64 > > +) as ( + ARRAY( + ( + SELECT + AS STRUCT Country_of_Sale, + SUM(qty) as qty, + FROM + UNNEST(items) + GROUP BY + Country_of_Sale + ) + ) +); \ No newline at end of file diff --git a/oaebu_workflows/irus_fulcrum_telescope/sql/bp_body_irus_fulcrum.sql.jinja2 b/oaebu_workflows/irus_fulcrum_telescope/sql/bp_body_irus_fulcrum.sql.jinja2 new file mode 100644 index 00000000..373ca369 --- /dev/null +++ b/oaebu_workflows/irus_fulcrum_telescope/sql/bp_body_irus_fulcrum.sql.jinja2 @@ -0,0 +1,37 @@ +-- The purpose of this block of SQL is to organise the metrics from IRUS Fulcrum for easier consumption of downstream queries. +-- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'irus_fulcrum_table_id'. +-- This will either point to 'empty_irus_fulcrum' (defined above as an empty row) or the name of the real data table in bigquery. +-- The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. +-- Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. +irus_fulcrum_metrics as ( + SELECT + ISBN as ISBN13, + release_date, + STRUCT( + SUM(total_item_investigations) as total_item_investigations, + SUM(total_item_requests) as total_item_requests, + SUM(unique_item_investigations) as unique_item_investigations, + SUM(unique_item_requests) as unique_item_requests, + group_items_irus_fulcrum_country(ARRAY_CONCAT_AGG(country)) as country + ) as metrics + FROM + `{{ irus_fulcrum_table_id }}` + GROUP BY + ISBN, + release_date +), +-- The purpose of this block of SQL is to organise the Metadata from IRUS OAPEN for easier consumption of downstream queries. +-- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'irus_fulcrum_table_id'. +-- This will either point to 'empty_irus_oapen' (defined above as an empty row) or the name of the real data table in bigquery. +-- The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. +-- Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. +irus_fulcrum_metadata as ( + SELECT + ISBN as ISBN13, + MAX(book_title) as book_title, + MAX(publisher) as publisher + FROM + `{{ irus_fulcrum_table_id }}` + GROUP BY + ISBN +) \ No newline at end of file diff --git a/oaebu_workflows/irus_fulcrum_telescope/sql/bp_functions_irus_fulcrum.sql b/oaebu_workflows/irus_fulcrum_telescope/sql/bp_functions_irus_fulcrum.sql new file mode 100644 index 00000000..579d2b7d --- /dev/null +++ b/oaebu_workflows/irus_fulcrum_telescope/sql/bp_functions_irus_fulcrum.sql @@ -0,0 +1,31 @@ +-- Output Schema: +-- name STRING NULLABLE +-- code STRING NULLABLE +-- total_item_investigations INTEGER NULLABLE +-- total_item_requests INTEGER NULLABLE +-- unique_item_investigations INTEGER NULLABLE +-- unique_item_requests INTEGER NULLABLE +CREATE TEMP FUNCTION group_items_irus_fulcrum_country( + items ARRAY < STRUCT < name STRING, + code STRING, + total_item_investigations INT64, + total_item_requests INT64, + unique_item_investigations INT64, + unique_item_requests INT64 > > +) as ( + ARRAY( + ( + SELECT + AS STRUCT name, + MAX(code) as code, + SUM(total_item_investigations) as total_item_investigations, + SUM(total_item_requests) as total_item_requests, + SUM(unique_item_investigations) as unique_item_investigations, + SUM(unique_item_requests) as unique_item_requests + FROM + UNNEST(items) + GROUP BY + name + ) + ) +); \ No newline at end of file diff --git a/oaebu_workflows/irus_oapen_telescope/sql/bp_body_irus_oapen.sql.jinja2 b/oaebu_workflows/irus_oapen_telescope/sql/bp_body_irus_oapen.sql.jinja2 new file mode 100644 index 00000000..136a92a4 --- /dev/null +++ b/oaebu_workflows/irus_oapen_telescope/sql/bp_body_irus_oapen.sql.jinja2 @@ -0,0 +1,40 @@ +-- The purpose of this block of SQL is to organise the metrics from IRUS OAPEN for easier consumption of downstream queries. +-- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'oapen_table_id'. +-- This will either point to 'empty_irus_oapen' (defined above as an empty row) or the name of the real data table in bigquery. +-- The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. +-- Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. +irus_oapen_metrics as ( + SELECT + ISBN as ISBN13, + release_date, + STRUCT( + MAX(version) as version, + SUM(title_requests) as title_requests, + SUM(total_item_investigations) as total_item_investigations, + SUM(total_item_requests) as total_item_requests, + SUM(unique_item_investigations) as unique_item_investigations, + SUM(unique_item_requests) as unique_item_requests, + group_items_irus_country(ARRAY_CONCAT_AGG(country)) as country, + group_items_irus_location(ARRAY_CONCAT_AGG(locations)) as locations + ) as metrics + FROM + `{{ irus_oapen_table_id }}` + GROUP BY + ISBN, + release_date +), +-- The purpose of this block of SQL is to organise the Metadata from IRUS OAPEN for easier consumption of downstream queries. +-- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'irus_oapen_table_id'. +-- This will either point to 'empty_irus_oapen' (defined above as an empty row) or the name of the real data table in bigquery. +-- The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. +-- Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. +irus_oapen_metadata as ( + SELECT + ISBN as ISBN13, + MAX(book_title) as book_title, + MAX(publisher) as publisher + FROM + `{{ irus_oapen_table_id }}` + GROUP BY + ISBN +), \ No newline at end of file diff --git a/oaebu_workflows/irus_oapen_telescope/sql/bp_functions_irus_oapen.sql b/oaebu_workflows/irus_oapen_telescope/sql/bp_functions_irus_oapen.sql new file mode 100644 index 00000000..e0e51917 --- /dev/null +++ b/oaebu_workflows/irus_oapen_telescope/sql/bp_functions_irus_oapen.sql @@ -0,0 +1,78 @@ +-- Output Schema: +-- name STRING NULLABLE +-- code STRING NULLABLE +-- title_requests INTEGER NULLABLE +-- total_item_investigations INTEGER NULLABLE +-- total_item_requests INTEGER NULLABLE +-- unique_item_investigations INTEGER NULLABLE +-- unique_item_requests INTEGER NULLABLE +CREATE TEMP FUNCTION group_items_irus_country( + items ARRAY < STRUCT < name STRING, + code STRING, + title_requests INT64, + total_item_investigations INT64, + total_item_requests INT64, + unique_item_investigations INT64, + unique_item_requests INT64 > > +) as ( + ARRAY( + ( + SELECT + AS STRUCT name, + MAX(code) as code, + SUM(title_requests) as title_requests, + SUM(total_item_investigations) as total_item_investigations, + SUM(total_item_requests) as total_item_requests, + SUM(unique_item_investigations) as unique_item_investigations, + SUM(unique_item_requests) as unique_item_requests + FROM + UNNEST(items) + GROUP BY + name + ) + ) +); + +-- Output Schema: +-- latitude FLOAT NULLABLE +-- longitude FLOAT NULLABLE +-- city STRING NULLABLE +-- country_name STRING NULLABLE +-- country_code STRING NULLABLE +-- title_requests INTEGER NULLABLE +-- total_item_investigations INTEGER NULLABLE +-- total_item_requests INTEGER NULLABLE +-- unique_item_investigations INTEGER NULLABLE +-- unique_item_requests INTEGER NULLABLE +CREATE TEMP FUNCTION group_items_irus_location( + items ARRAY < STRUCT < latitude FLOAT64, + longitude FLOAT64, + city STRING, + country_name STRING, + country_code STRING, + title_requests INT64, + total_item_investigations INT64, + total_item_requests INT64, + unique_item_investigations INT64, + unique_item_requests INT64 > > +) as ( + ARRAY( + ( + SELECT + AS STRUCT MAX(latitude) as latitude, + MAX(longitude) as longitude, + city, + MAX(country_name) as country_name, + MAX(country_code) as country_code, + SUM(title_requests) as title_requests, + SUM(total_item_investigations) as total_item_investigations, + SUM(total_item_requests) as total_item_requests, + SUM(unique_item_investigations) as unique_item_investigations, + SUM(unique_item_requests) as unique_item_requests + FROM + UNNEST(items) + GROUP BY + city + ) + ) +); \ No newline at end of file diff --git a/oaebu_workflows/jstor_telescope/sql/bp_body_jstor_country.sql.jinja2 b/oaebu_workflows/jstor_telescope/sql/bp_body_jstor_country.sql.jinja2 new file mode 100644 index 00000000..25f3cb95 --- /dev/null +++ b/oaebu_workflows/jstor_telescope/sql/bp_body_jstor_country.sql.jinja2 @@ -0,0 +1,39 @@ +-- The purpose of this block of SQL is to organise the metrics from JSTOR country for easier consumption of downstream queries. +-- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'jstor_country_table_id'. +-- This will either point to 'empty_jstor_country' (defined above as an empty row) or the name of the real data table in bigquery. +-- The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. +-- Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. +jstor_country_metrics as ( + SELECT + eISBN as ISBN13, + release_date, + group_items_jstor_country( + ARRAY_AGG(STRUCT(Country_name, Total_Item_Requests)) + ) as metrics + FROM + `{{ jstor_country_table_id }}` + GROUP BY + eISBN, + release_date +), +-- The purpose of this block of SQL is to organise the Metadata from JSTOR country for easier consumption of downstream queries. +-- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'jstor_country_table_id'. +-- This will either point to 'empty_jstor_country' (defined above as an empty row) or the name of the real data table in bigquery. +-- The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. +-- Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. +jstor_country_metadata as ( + SELECT + eISBN as ISBN13, + MAX(Book_Title) as Book_Title, + MAX(Book_ID) as Book_ID, + MAX(Authors) as Authors, + MAX(ISBN) as ISBN, + eISBN, + MAX(Copyright_Year) as Copyright_Year, + MAX(Disciplines) as Disciplines, + MAX(Usage_Type) as Usage_Type + FROM + `{{ jstor_country_table_id }}` + GROUP BY + eISBN +), \ No newline at end of file diff --git a/oaebu_workflows/jstor_telescope/sql/bp_body_jstor_institution.sql.jinja2 b/oaebu_workflows/jstor_telescope/sql/bp_body_jstor_institution.sql.jinja2 new file mode 100644 index 00000000..d2a09875 --- /dev/null +++ b/oaebu_workflows/jstor_telescope/sql/bp_body_jstor_institution.sql.jinja2 @@ -0,0 +1,39 @@ +-- The purpose of this block of SQL is to organise the metrics from JSTOR institution for easier consumption of downstream queries. +-- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'jstor_institution_table_id'. +-- This will either point to 'empty_jstor_institution' (defined above as an empty row) or the name of the real data table in bigquery. +-- The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. +-- Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. +jstor_institution_metrics as ( + SELECT + eISBN as ISBN13, + release_date, + group_items_jstor_institution( + ARRAY_AGG(STRUCT(Institution, Total_Item_Requests)) + ) as metrics + FROM + `{{ jstor_institution_table_id }}` + GROUP BY + eISBN, + release_date +), +-- The purpose of this block of SQL is to organise the Metadata from JSTOR institution for easier consumption of downstream queries. +-- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'jstor_institution_table_id'. +-- This will either point to 'empty_jstor_institution' (defined above as an empty row) or the name of the real data table in bigquery. +-- The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. +-- Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. +jstor_institution_metadata as ( + SELECT + eISBN as ISBN13, + MAX(Book_Title) as Book_Title, + MAX(Book_ID) as Book_ID, + MAX(Authors) as Authors, + MAX(ISBN) as ISBN, + eISBN, + MAX(Copyright_Year) as Copyright_Year, + MAX(Disciplines) as Disciplines, + MAX(Usage_Type) as Usage_Type + FROM + `{{ jstor_institution_table_id }}` + GROUP BY + eISBN +), \ No newline at end of file diff --git a/oaebu_workflows/jstor_telescope/sql/bp_functions_jstor_country.sql b/oaebu_workflows/jstor_telescope/sql/bp_functions_jstor_country.sql new file mode 100644 index 00000000..820e0374 --- /dev/null +++ b/oaebu_workflows/jstor_telescope/sql/bp_functions_jstor_country.sql @@ -0,0 +1,19 @@ +-- Output Schema: +-- Country_name STRING NULLABLE +-- Total_Item_Requests INTEGER NULLABLE +CREATE TEMP FUNCTION group_items_jstor_country( + items ARRAY < STRUCT < Country_name STRING, + Total_Item_Requests INT64 > > +) as ( + ARRAY( + ( + SELECT + AS STRUCT Country_name, + SUM(Total_Item_Requests) as Total_Item_Requests, + FROM + UNNEST(items) + GROUP BY + Country_name + ) + ) +); \ No newline at end of file diff --git a/oaebu_workflows/jstor_telescope/sql/bp_functions_jstor_institution.sql b/oaebu_workflows/jstor_telescope/sql/bp_functions_jstor_institution.sql new file mode 100644 index 00000000..ccbdd874 --- /dev/null +++ b/oaebu_workflows/jstor_telescope/sql/bp_functions_jstor_institution.sql @@ -0,0 +1,19 @@ +-- Output Schema: +-- Institution STRING NULLABLE +-- Total_Item_Requests INTEGER NULLABLE +CREATE TEMP FUNCTION group_items_jstor_institution( + items ARRAY < STRUCT < Institution STRING, + Total_Item_Requests INT64 > > +) as ( + ARRAY( + ( + SELECT + AS STRUCT Institution, + SUM(Total_Item_Requests) as Total_Item_Requests, + FROM + UNNEST(items) + GROUP BY + Institution + ) + ) +); \ No newline at end of file diff --git a/oaebu_workflows/oaebu_partners.py b/oaebu_workflows/oaebu_partners.py index d4425bac..f46f4bab 100644 --- a/oaebu_workflows/oaebu_partners.py +++ b/oaebu_workflows/oaebu_partners.py @@ -23,6 +23,12 @@ @dataclass +class BookProductSQL: + temp_functions: str = None + body: str = None + + +@dataclass(kw_only=True) class OaebuPartner: """Class for storing information about data sources we are using to produce oaebu intermediate tables for. @@ -32,6 +38,7 @@ class OaebuPartner: :param isbn_field_name: Name of the field containing the ISBN. :param title_field_name: Name of the field containing the Title. :param sharded: whether the table is sharded or not. + :param schema_path: The path of the partner's schema folder. """ type_id: str @@ -46,6 +53,18 @@ def __str__(self): return self.type_id +@dataclass(kw_only=True) +class DataPartner(OaebuPartner): + """Represents a data class for an oaebu data source. + + :param bp_temp_functions: The name of the book product temp functions file. + :param bp_body: The name of the book product body file. + """ + + bp_body: str + bp_temp_functions: str = None + + OAEBU_METADATA_PARTNERS = dict( onix=OaebuPartner( type_id="onix", @@ -77,7 +96,7 @@ def __str__(self): ) OAEBU_DATA_PARTNERS = dict( - google_analytics3=OaebuPartner( + google_analytics3=DataPartner( type_id="google_analytics3", bq_dataset_id="google", bq_table_name="google_analytics3", @@ -85,8 +104,10 @@ def __str__(self): title_field_name="title", sharded=False, schema_path=os.path.join(schema_folder(workflow_module="google_analytics3_telescope"), "google_analytics.json"), + bp_temp_functions="bp_functions_google_analytics3.sql", + bp_body="bp_body_google_analytics3.sql.jinja2", ), - google_books_sales=OaebuPartner( + google_books_sales=DataPartner( type_id="google_books_sales", bq_dataset_id="google", bq_table_name="google_books_sales", @@ -94,8 +115,10 @@ def __str__(self): title_field_name="Title", sharded=False, schema_path=os.path.join(schema_folder(workflow_module="google_books_telescope"), "google_books_sales.json"), + bp_temp_functions="bp_functions_google_books_sales.sql", + bp_body="bp_body_google_books_sales.sql.jinja2", ), - google_books_traffic=OaebuPartner( + google_books_traffic=DataPartner( type_id="google_books_traffic", bq_dataset_id="google", bq_table_name="google_books_traffic", @@ -103,8 +126,9 @@ def __str__(self): title_field_name="Title", sharded=False, schema_path=os.path.join(schema_folder(workflow_module="google_books_telescope"), "google_books_traffic.json"), + bp_body="bp_body_google_books_traffic.sql.jinja2", ), - jstor_country=OaebuPartner( + jstor_country=DataPartner( type_id="jstor_country", bq_dataset_id="jstor", bq_table_name="jstor_country", @@ -112,8 +136,10 @@ def __str__(self): title_field_name="Book_Title", sharded=False, schema_path=os.path.join(schema_folder(workflow_module="jstor_telescope"), "jstor_country.json"), + bp_temp_functions="bp_functions_jstor_country.sql", + bp_body="bp_body_jstor_country.sql.jinja2", ), - jstor_institution=OaebuPartner( + jstor_institution=DataPartner( type_id="jstor_institution", bq_dataset_id="jstor", bq_table_name="jstor_institution", @@ -121,8 +147,10 @@ def __str__(self): title_field_name="Book_Title", sharded=False, schema_path=os.path.join(schema_folder(workflow_module="jstor_telescope"), "jstor_institution.json"), + bp_temp_functions="bp_functions_jstor_institution.sql", + bp_body="bp_body_jstor_institution.sql.jinja2", ), - jstor_country_collection=OaebuPartner( + jstor_country_collection=DataPartner( type_id="jstor_country_collection", bq_dataset_id="jstor", bq_table_name="jstor_country", @@ -130,8 +158,10 @@ def __str__(self): title_field_name="Book_Title", sharded=False, schema_path=os.path.join(schema_folder(workflow_module="jstor_telescope"), "jstor_country_collection.json"), + bp_temp_functions="bp_functions_jstor_country_collection.sql", + bp_body="bp_body_jstor_country_collection.sql.jinja2", ), - jstor_institution_collection=OaebuPartner( + jstor_institution_collection=DataPartner( type_id="jstor_institution_collection", bq_dataset_id="jstor", bq_table_name="jstor_institution", @@ -139,8 +169,10 @@ def __str__(self): title_field_name="Book_Title", sharded=False, schema_path=os.path.join(schema_folder(workflow_module="jstor_telescope"), "jstor_institution_collection.json"), + bp_temp_functions="bp_functions_jstor_institution_collection.sql", + bp_body="bp_body_jstor_institution_collection.sql.jinja2", ), - irus_oapen=OaebuPartner( + irus_oapen=DataPartner( type_id="irus_oapen", bq_dataset_id="irus", bq_table_name="irus_oapen", @@ -148,8 +180,10 @@ def __str__(self): title_field_name="book_title", sharded=False, schema_path=os.path.join(schema_folder(workflow_module="irus_oapen_telescope"), "irus_oapen.json"), + bp_temp_functions="bp_functions_irus_oapen.sql", + bp_body="bp_body_irus_oapen.sql.jinja2", ), - irus_fulcrum=OaebuPartner( + irus_fulcrum=DataPartner( type_id="irus_fulcrum", bq_dataset_id="irus", bq_table_name="irus_fulcrum", @@ -157,8 +191,10 @@ def __str__(self): title_field_name="book_title", sharded=False, schema_path=os.path.join(schema_folder(workflow_module="irus_fulcrum_telescope"), "irus_fulcrum.json"), + bp_temp_functions="bp_functions_irus_fulcrum.sql", + bp_body="bp_body_irus_fulcrum.sql.jinja2", ), - ucl_discovery=OaebuPartner( + ucl_discovery=DataPartner( type_id="ucl_discovery", bq_dataset_id="ucl", bq_table_name="ucl_discovery", @@ -166,8 +202,9 @@ def __str__(self): title_field_name="title", sharded=False, schema_path=os.path.join(schema_folder(workflow_module="ucl_discovery_telescope"), "ucl_discovery.json"), + bp_body="bp_body_ucl_discovery.sql.jinja2", ), - internet_archive=OaebuPartner( + internet_archive=DataPartner( type_id="internet_archive", bq_dataset_id="internet_archive", bq_table_name="internet_archive", @@ -175,8 +212,10 @@ def __str__(self): title_field_name="title", sharded=False, schema_path=os.path.join(schema_folder(), "internet_archive", "internet_archive.json"), + bp_temp_functions="bp_functions_internet_archive.sql", + bp_body="bp_body_internet_archive.sql.jinja2", ), - worldreader=OaebuPartner( + worldreader=DataPartner( type_id="worldreader", bq_dataset_id="worldreader", bq_table_name="worldreader", @@ -184,6 +223,8 @@ def __str__(self): title_field_name="title", sharded=False, schema_path=os.path.join(schema_folder(), "worldreader", "worldreader.json"), + bp_temp_functions="bp_functions_worldreader.sql", + bp_body="bp_body_worldreader.sql.jinja2", ), ) diff --git a/oaebu_workflows/onix_workflow/onix_workflow.py b/oaebu_workflows/onix_workflow/onix_workflow.py index 9bba4416..68f2341d 100644 --- a/oaebu_workflows/onix_workflow/onix_workflow.py +++ b/oaebu_workflows/onix_workflow/onix_workflow.py @@ -630,95 +630,16 @@ def create_oaebu_book_product_table( location=self.cloud_workspace.data_location, description="OAEBU Tables", ) - data_partner_datasets = {data.type_id: data.bq_dataset_id for data in self.data_partners} - google_analytics_dataset = data_partner_datasets.get("google_analytics3") - google_books_dataset = data_partner_datasets.get("google_books_traffic") - jstor_dataset = data_partner_datasets.get("jstor_country") - irus_oapen_dataset = data_partner_datasets.get("irus_oapen") - irus_fulcrum_dataset = data_partner_datasets.get("irus_fulcrum") - ucl_discovery_dataset = data_partner_datasets.get("ucl_discovery") - internet_archive_dataset = data_partner_datasets.get("internet_archive") - worldreader_dataset = data_partner_datasets.get("worldreader") - - # Create matched tables for supplied data partners - google_analytics3_table_id = "empty_google_analytics3" - google_books_sales_table_id = "empty_google_books_sales" - google_books_traffic_table_id = "empty_google_books_traffic" - jstor_country_table_id = "empty_jstor_country" - jstor_institution_table_id = "empty_jstor_institution" - irus_oapen_table_id = "empty_irus_oapen" - irus_fulcrum_table_id = "empty_irus_fulcrum" - ucl_discovery_table_id = "empty_ucl_discovery" - internet_archive_table_id = "empty_internet_archive" - worldreader_table_id = "empty_worldreader" - if google_analytics_dataset: - google_analytics3_table_id = bq_sharded_table_id( - self.cloud_workspace.project_id, - self.bq_oaebu_intermediate_dataset, - "google_analytics3_matched", - release.snapshot_date, - ) - if google_books_dataset: - google_books_sales_table_id = bq_sharded_table_id( - self.cloud_workspace.project_id, - self.bq_oaebu_intermediate_dataset, - "google_books_sales_matched", - release.snapshot_date, - ) - google_books_traffic_table_id = bq_sharded_table_id( - self.cloud_workspace.project_id, - self.bq_oaebu_intermediate_dataset, - "google_books_traffic_matched", - release.snapshot_date, - ) - if jstor_dataset: - jstor_country_table_id = bq_sharded_table_id( - self.cloud_workspace.project_id, - self.bq_oaebu_intermediate_dataset, - "jstor_country_matched", - release.snapshot_date, - ) - jstor_institution_table_id = bq_sharded_table_id( - self.cloud_workspace.project_id, - self.bq_oaebu_intermediate_dataset, - "jstor_institution_matched", - release.snapshot_date, - ) - if irus_oapen_dataset: - irus_oapen_table_id = bq_sharded_table_id( - self.cloud_workspace.project_id, - self.bq_oaebu_intermediate_dataset, - "irus_oapen_matched", - release.snapshot_date, - ) - if ucl_discovery_dataset: - ucl_discovery_table_id = bq_sharded_table_id( - self.cloud_workspace.project_id, - self.bq_oaebu_intermediate_dataset, - "ucl_discovery_matched", - release.snapshot_date, - ) - if irus_fulcrum_dataset: - irus_fulcrum_table_id = bq_sharded_table_id( - self.cloud_workspace.project_id, - self.bq_oaebu_intermediate_dataset, - "irus_fulcrum_matched", - release.snapshot_date, - ) - if internet_archive_dataset: - internet_archive_table_id = bq_sharded_table_id( - self.cloud_workspace.project_id, - self.bq_oaebu_intermediate_dataset, - "internet_archive_matched", - release.snapshot_date, - ) - if worldreader_dataset: - worldreader_table_id = bq_sharded_table_id( + + dp_tables = { + f"{dp.type_id}_table_id": bq_sharded_table_id( self.cloud_workspace.project_id, self.bq_oaebu_intermediate_dataset, - "worldreader_matched", + f"{dp.type_id}_matched", release.snapshot_date, ) + for dp in self.data_partners + } workid_table_id = bq_sharded_table_id( self.cloud_workspace.project_id, self.bq_onix_workflow_dataset, @@ -745,22 +666,13 @@ def create_oaebu_book_product_table( sql = render_template( template_path, onix_table_id=onix_table_id, - google_analytics3_table_id=google_analytics3_table_id, - google_books_sales_table_id=google_books_sales_table_id, - google_books_traffic_table_id=google_books_traffic_table_id, - jstor_country_table_id=jstor_country_table_id, - jstor_institution_table_id=jstor_institution_table_id, - irus_oapen_table_id=irus_oapen_table_id, - irus_fulcrum_table_id=irus_fulcrum_table_id, - ucl_discovery_table_id=ucl_discovery_table_id, - internet_archive_table_id=internet_archive_table_id, - worldreader_table_id=worldreader_table_id, + data_partners=self.data_partners, book_table_id=book_table_id, country_table_id=country_table_id, workid_table_id=workid_table_id, workfamilyid_table_id=workfamilyid_table_id, ga3_views_field=self.ga3_views_field, - onix_workflow=True, + **dp_tables, ) schema_file_path = bq_find_schema(path=self.schema_folder, table_name=self.bq_book_product_table_name) table_id = bq_sharded_table_id( diff --git a/oaebu_workflows/onix_workflow/sql/create_book.sql.jinja2 b/oaebu_workflows/onix_workflow/sql/create_book.sql.jinja2 index 32ce4bf7..027b4256 100644 --- a/oaebu_workflows/onix_workflow/sql/create_book.sql.jinja2 +++ b/oaebu_workflows/onix_workflow/sql/create_book.sql.jinja2 @@ -1,4 +1,4 @@ -{# Copyright 2020 Curtin University +{ # Copyright 2020 Curtin University # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,13 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - # Author: Richard Hosking, Keegan Smith #} - -- Convert 10-digit ISBNs to 13-digit -CREATE TEMPORARY FUNCTION normalise_isbn_string(input STRING) -RETURNS STRING - LANGUAGE js AS r""" +CREATE TEMPORARY FUNCTION normalise_isbn_string(input STRING) RETURNS STRING LANGUAGE js AS r """ function calc_isbn13_check_digit(isbn13) { var mask = [1,3,1,3,1,3,1,3,1,3,1,3]; @@ -83,7 +79,7 @@ RETURNS STRING } function convert_isbn10_to_isbn13(isbn10) { - let isbn = "978" + isbn10.slice(0, 9); + let isbn = " 978 " + isbn10.slice(0, 9); let check_digit = calc_isbn13_check_digit(isbn); isbn += String(check_digit); return isbn; @@ -107,55 +103,121 @@ RETURNS STRING return null; """; -WITH - --- Intermediate crossref events table +WITH -- Intermediate crossref events table temp_intermediate_crossref_events as ( -SELECT - doi, - ARRAY(SELECT as STRUCT source, SUM(count) as count FROM UNNEST(months) GROUP BY source) as events, - months, - ARRAY(SELECT as STRUCT CAST(SPLIT(month, "-")[SAFE_OFFSET(0)] as int64) as year, source, SUM(count) as count FROM UNNEST(months) GROUP BY year, source) as years -FROM ( -SELECT - doi, - ARRAY_AGG(STRUCT(month, source, count)) as months -FROM ( SELECT - (UPPER(TRIM(SUBSTR(obj_id, 17)))) as doi, - safe.FORMAT_TIMESTAMP('%Y-%m', occurred_at) as month, - source_id as source, - COUNT(id) as count - FROM `{{ crossref_events_table_id }}` - WHERE safe.FORMAT_TIMESTAMP('%Y-%m', occurred_at) is not null - GROUP BY - doi, source_id, month) -GROUP BY doi -)), - - + doi, + ARRAY( + SELECT + as STRUCT source, + SUM(count) as count + FROM + UNNEST(months) + GROUP BY + source + ) as events, + months, + ARRAY( + SELECT + as STRUCT CAST(SPLIT(month, "-") [SAFE_OFFSET(0)] as int64) as year, + source, + SUM(count) as count + FROM + UNNEST(months) + GROUP BY + year, + source + ) as years + FROM + ( + SELECT + doi, + ARRAY_AGG(STRUCT(month, source, count)) as months + FROM + ( + SELECT + (UPPER(TRIM(SUBSTR(obj_id, 17)))) as doi, + safe.FORMAT_TIMESTAMP('%Y-%m', occurred_at) as month, + source_id as source, + COUNT(id) as count + FROM + `{{ crossref_events_table_id }}` + WHERE + safe.FORMAT_TIMESTAMP('%Y-%m', occurred_at) is not null + GROUP BY + doi, + source_id, + month + ) + GROUP BY + doi + ) +), -- Temp DOI table dois_temp_table as ( -SELECT - * -FROM - (SELECT - UPPER(TRIM(ref.doi)) as doi, - STRUCT( - title, abstract, issued.date_parts[offset(0)] as published_year, - CASE WHEN ARRAY_LENGTH(issued.date_parts) > 1 THEN issued.date_parts[offset(1)] ELSE 13 END as published_month, - CONCAT(issued.date_parts[offset(0)], "-", CASE WHEN ARRAY_LENGTH(issued.date_parts) > 1 THEN issued.date_parts[offset(1)] ELSE 13 END) as published_year_month, - type, ISSN, ISBN, issn_type, publisher_location, publisher, member, prefix, container_title, short_container_title, group_title, references_count, - is_referenced_by_count, subject, published_print, license, volume, funder, page, author, link, clinical_trial_number, alternative_id - ) as crossref, - (SELECT as STRUCT * from temp_intermediate_crossref_events as events WHERE events.doi = UPPER(TRIM(ref.doi))) as events - FROM `{{ crossref_metadata_table_id }}` as ref - WHERE ARRAY_LENGTH(issued.date_parts) > 0)), - - + SELECT + * + FROM + ( + SELECT + UPPER(TRIM(ref.doi)) as doi, + STRUCT( + title, + abstract, + issued.date_parts [offset(0)] as published_year, + CASE + WHEN ARRAY_LENGTH(issued.date_parts) > 1 THEN issued.date_parts [offset(1)] + ELSE 13 + END as published_month, + CONCAT( + issued.date_parts [offset(0)], + "-", + CASE + WHEN ARRAY_LENGTH(issued.date_parts) > 1 THEN issued.date_parts [offset(1)] + ELSE 13 + END + ) as published_year_month, + type, + ISSN, + ISBN, + issn_type, + publisher_location, + publisher, + member, + prefix, + container_title, + short_container_title, + group_title, + references_count, + is_referenced_by_count, + subject, + published_print, + license, + volume, + funder, + page, + author, + link, + clinical_trial_number, + alternative_id + ) as crossref, + ( + SELECT + as STRUCT * + from + temp_intermediate_crossref_events as events + WHERE + events.doi = UPPER(TRIM(ref.doi)) + ) as events + FROM + `{{ crossref_metadata_table_id }}` as ref + WHERE + ARRAY_LENGTH(issued.date_parts) > 0 + ) +), -- Temp ISBN table isbns as ( -SELECT + SELECT normalise_isbn_string(isbn) as isbn, doi, crossref.title, @@ -163,65 +225,148 @@ SELECT crossref.publisher, crossref.published_year, crossref.published_year_month, - (SELECT ARRAY_AGG(normalise_isbn_string(candidate_isbn) IGNORE NULLS) FROM UNNEST(crossref.isbn) as candidate_isbn) as work_isbns - FROM dois_temp_table as original, UNNEST(crossref.ISBN) as isbn), - - + ( + SELECT + ARRAY_AGG( + normalise_isbn_string(candidate_isbn) IGNORE NULLS + ) + FROM + UNNEST(crossref.isbn) as candidate_isbn + ) as work_isbns + FROM + dois_temp_table as original, + UNNEST(crossref.ISBN) as isbn +), -- Temp books books as ( -SELECT + SELECT book.isbn, book.crossref_objects, book_part.chapters -FROM ( -SELECT - isbn, - ARRAY_AGG(STRUCT(doi, title, type, publisher, published_year, published_year_month, work_isbns)) as crossref_objects - FROM isbns - WHERE type in ("book", "monograph") AND isbn IS NOT NULL - GROUP BY isbn) as book - left join ( - SELECT + FROM + ( + SELECT isbn, - ARRAY_AGG(STRUCT(doi, title, type) ORDER BY doi) as chapters - FROM isbns - WHERE type in ("book-chapter") AND isbn IS NOT NULL - GROUP BY isbn - ) as book_part on book.isbn = book_part.isbn), - - + ARRAY_AGG( + STRUCT( + doi, + title, + type, + publisher, + published_year, + published_year_month, + work_isbns + ) + ) as crossref_objects + FROM + isbns + WHERE + type in ("book", "monograph") + AND isbn IS NOT NULL + GROUP BY + isbn + ) as book + left join ( + SELECT + isbn, + ARRAY_AGG( + STRUCT(doi, title, type) + ORDER BY + doi + ) as chapters + FROM + isbns + WHERE + type in ("book-chapter") + AND isbn IS NOT NULL + GROUP BY + isbn + ) as book_part on book.isbn = book_part.isbn +), -- Temp aggregated events -ISBN_DOI as (SELECT - ISBN, - ARRAY_CONCAT(ARRAY(SELECT doi FROM book.crossref_objects), ARRAY(SELECT doi FROM book.chapters)) as dois -FROM books as book), - -events_matched as (SELECT - ISBN, - ARRAY_CONCAT_AGG(crossref_events.events) as events, - ARRAY_CONCAT_AGG(crossref_events.months) as months, - ARRAY_CONCAT_AGG(crossref_events.years) as years, -FROM ISBN_DOI, UNNEST(dois) as doi -LEFT JOIN temp_intermediate_crossref_events as crossref_events ON crossref_events.doi = doi -GROUP BY ISBN), - -events_aggregated as (SELECT - ISBN, - ARRAY(SELECT as STRUCT source, SUM(count) as count FROM UNNEST(events) GROUP BY source) as overall, - ARRAY(SELECT as STRUCT month, source, SUM(count) as count FROM UNNEST(months) GROUP BY month, source ORDER BY month DESC) as months, - ARRAY(SELECT as STRUCT year, source, SUM(count) as count FROM UNNEST(years) GROUP BY year, source ORDER BY year DESC) as years -FROM events_matched) - - --- Main Query +ISBN_DOI as ( + SELECT + ISBN, + ARRAY_CONCAT( + ARRAY( + SELECT + doi + FROM + book.crossref_objects + ), + ARRAY( + SELECT + doi + FROM + book.chapters + ) + ) as dois + FROM + books as book +), +events_matched as ( + SELECT + ISBN, + ARRAY_CONCAT_AGG(crossref_events.events) as events, + ARRAY_CONCAT_AGG(crossref_events.months) as months, + ARRAY_CONCAT_AGG(crossref_events.years) as years, + FROM + ISBN_DOI, + UNNEST(dois) as doi + LEFT JOIN temp_intermediate_crossref_events as crossref_events ON crossref_events.doi = doi + GROUP BY + ISBN +), +events_aggregated as ( + SELECT + ISBN, + ARRAY( + SELECT + as STRUCT source, + SUM(count) as count + FROM + UNNEST(events) + GROUP BY + source + ) as overall, + ARRAY( + SELECT + as STRUCT month, + source, + SUM(count) as count + FROM + UNNEST(months) + GROUP BY + month, + source + ORDER BY + month DESC + ) as months, + ARRAY( + SELECT + as STRUCT year, + source, + SUM(count) as count + FROM + UNNEST(years) + GROUP BY + year, + source + ORDER BY + year DESC + ) as years + FROM + events_matched +) -- Main Query SELECT - book.isbn, - book.crossref_objects, - book.chapters, - STRUCT( - events.overall, - events.months, - events.years - ) as events -FROM books as book -LEFT join events_aggregated as events on events.ISBN = book.ISBN \ No newline at end of file + book.isbn, + book.crossref_objects, + book.chapters, + STRUCT( + events.overall, + events.months, + events.years + ) as events +FROM + books as book + LEFT join events_aggregated as events on events.ISBN = book.ISBN \ No newline at end of file diff --git a/oaebu_workflows/onix_workflow/sql/create_book_products.sql.jinja2 b/oaebu_workflows/onix_workflow/sql/create_book_products.sql.jinja2 index ebb61d54..fa376646 100644 --- a/oaebu_workflows/onix_workflow/sql/create_book_products.sql.jinja2 +++ b/oaebu_workflows/onix_workflow/sql/create_book_products.sql.jinja2 @@ -12,172 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Author: Richard Hosking #} +# Author: Richard Hosking, Keegan Smith #} -# Helper Function: -{# -Output Schema: -name STRING NULLABLE -value INTEGER NULLABLE -#} -CREATE TEMP FUNCTION group_items_google_analytics3(items ARRAY>) as ( - ARRAY( - (SELECT AS STRUCT - name, - SUM(value) as value, - FROM UNNEST(items) - GROUP BY name) - ) -); - -# Helper Function: -{# -Output Schema: -Country_of_Sale STRING NULLABLE -qty INTEGER NULLABLE -#} -CREATE TEMP FUNCTION group_items_google_books_sales(items ARRAY>) as ( - ARRAY( - (SELECT AS STRUCT - Country_of_Sale, - SUM(qty) as qty, - FROM UNNEST(items) - GROUP BY Country_of_Sale) - ) -); - -# Helper Function: -{# -Output Schema: -Country_name STRING NULLABLE -Total_Item_Requests INTEGER NULLABLE -#} -CREATE TEMP FUNCTION group_items_jstor_country(items ARRAY>) as ( - ARRAY( - (SELECT AS STRUCT - Country_name, - SUM(Total_Item_Requests) as Total_Item_Requests, - FROM UNNEST(items) - GROUP BY Country_name) - ) -); - -# Helper Function: -{# -Output Schema: -Institution STRING NULLABLE -Total_Item_Requests INTEGER NULLABLE -#} -CREATE TEMP FUNCTION group_items_jstor_institution(items ARRAY>) as ( - ARRAY( - (SELECT AS STRUCT - Institution, - SUM(Total_Item_Requests) as Total_Item_Requests, - FROM UNNEST(items) - GROUP BY Institution) - ) -); - -# Helper Function: -{# -Output Schema: -name STRING NULLABLE -code STRING NULLABLE -title_requests INTEGER NULLABLE -total_item_investigations INTEGER NULLABLE -total_item_requests INTEGER NULLABLE -unique_item_investigations INTEGER NULLABLE -unique_item_requests INTEGER NULLABLE -#} -CREATE TEMP FUNCTION group_items_irus_country(items ARRAY>) as ( - ARRAY( - (SELECT AS STRUCT - name, - MAX(code) as code, - SUM(title_requests) as title_requests, - SUM(total_item_investigations) as total_item_investigations, - SUM(total_item_requests) as total_item_requests, - SUM(unique_item_investigations) as unique_item_investigations, - SUM(unique_item_requests) as unique_item_requests - FROM UNNEST(items) - GROUP BY name) - ) -); - -# Helper Function: -{# -Output Schema: -name STRING NULLABLE -code STRING NULLABLE -total_item_investigations INTEGER NULLABLE -total_item_requests INTEGER NULLABLE -unique_item_investigations INTEGER NULLABLE -unique_item_requests INTEGER NULLABLE -#} -CREATE TEMP FUNCTION group_items_irus_fulcrum_country(items ARRAY>) as ( - ARRAY( - (SELECT AS STRUCT - name, - MAX(code) as code, - SUM(total_item_investigations) as total_item_investigations, - SUM(total_item_requests) as total_item_requests, - SUM(unique_item_investigations) as unique_item_investigations, - SUM(unique_item_requests) as unique_item_requests - FROM UNNEST(items) - GROUP BY name) - ) -); - -# Helper Function: -{# -Output Schema: -latitude FLOAT NULLABLE -longitude FLOAT NULLABLE -city STRING NULLABLE -country_name STRING NULLABLE -country_code STRING NULLABLE -title_requests INTEGER NULLABLE -total_item_investigations INTEGER NULLABLE -total_item_requests INTEGER NULLABLE -unique_item_investigations INTEGER NULLABLE -unique_item_requests INTEGER NULLABLE -#} -CREATE TEMP FUNCTION group_items_irus_location(items ARRAY>) as ( - ARRAY( - (SELECT AS STRUCT - MAX(latitude) as latitude, - MAX(longitude) as longitude, - city, - MAX(country_name) as country_name, - MAX(country_code) as country_code, - SUM(title_requests) as title_requests, - SUM(total_item_investigations) as total_item_investigations, - SUM(total_item_requests) as total_item_requests, - SUM(unique_item_investigations) as unique_item_investigations, - SUM(unique_item_requests) as unique_item_requests - FROM UNNEST(items) - GROUP BY city) - ) -); - -# Helper Function: -{# -Output Schema: -name STRING NULLABLE -code STRING NULLABLE -namew INTEGER NULLABLE -downloads INTEGER NULLABLE -#} -CREATE TEMP FUNCTION group_items_worldreader_country(items ARRAY>) as ( - ARRAY( - (SELECT AS STRUCT - country_name, - MAX(code) as country_code, - SUM(downloads) as downloads - FROM UNNEST(items) - GROUP BY country_name) - ) -); +# Add data partner temp functions +{% for dp in data_partners %} + {% if dp.bp_temp_functions %} + {% include dp.schema_path + "/" + dp.bp_temp_functions %} + {% endif %} +{% endfor %} # Helper Function: {# @@ -192,223 +34,6 @@ CREATE TEMP FUNCTION custom_split(str STRING, sep STRING) AS ( ) ); -{# -The purpose of this block of SQL is to create an empty row of data, which comforms to the OAPEN raw data. - -Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'irus_oapen_table_id'. -This will either point to 'empty_irus_oapen' (the name of this SQL block) or the name of the real data table in bigquery. -The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. -Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. -#} -WITH empty_irus_oapen as ( - SELECT - CAST(null as STRING) as ISBN, - CAST(null as DATE) as release_date, - CAST(null as STRING) as book_title, - CAST(null as STRING) as publisher, - CAST(null as STRING) as version, - CAST(null as INT64) as title_requests, - CAST(null as INT64) as total_item_investigations, - CAST(null as INT64) as total_item_requests, - CAST(null as INT64) as unique_item_investigations, - CAST(null as INT64) as unique_item_requests, - [STRUCT( - CAST(null as STRING) as name, CAST(null as STRING) as code, CAST(null as INT64) as title_requests, CAST(null as INT64) as total_item_investigations, - CAST(null as INT64) as total_item_requests, CAST(null as INT64) as unique_item_investigations, CAST(null as INT64) as unique_item_requests - )] as country, - [STRUCT( - CAST(null as FLOAT64) as latitude, CAST(null as FLOAT64) as longitude, CAST(null as STRING) as city, CAST(null as STRING) as country_name, - CAST(null as STRING) as country_code, CAST(null as INT64) as title_requests, CAST(null as INT64) as total_item_investigations, - null as total_item_requests, CAST(null as INT64) as unique_item_investigations, CAST(null as INT64) as unique_item_requests - )] as locations -), - -{# -The purpose of this block of SQL is to create an empty row of data, which comforms to the IRUS Fulcrum raw data. - -Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'irus_fulcrum_table_id'. -This will either point to 'empty_irus_fulcrum' (the name of this SQL block) or the name of the real data table in bigquery. -The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. -Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. -#} -empty_irus_fulcrum as ( - SELECT - CAST(null as STRING) as ISBN, - CAST(null as DATE) as release_date, - CAST(null as STRING) as book_title, - CAST(null as STRING) as publisher, - CAST(null as INT64) as total_item_investigations, - CAST(null as INT64) as total_item_requests, - CAST(null as INT64) as unique_item_investigations, - CAST(null as INT64) as unique_item_requests, - [STRUCT( - CAST(null as STRING) as name, CAST(null as STRING) as code, CAST(null as INT64) as total_item_investigations, - CAST(null as INT64) as total_item_requests, CAST(null as INT64) as unique_item_investigations, CAST(null as INT64) as unique_item_requests - )] as country, -), - -{# -The purpose of this block of SQL is to create an empty row of data, which comforms to the jstor institution raw data. - -Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'jstor_institution_table_id'. -This will either point to 'empty_jstor_institution' (the name of this SQL block) or the name of the real data table in bigquery. -The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. -Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. -#} -empty_jstor_institution as ( - SELECT - Cast(null as STRING) as eISBN, - Cast(null as STRING) as ISBN, - CAST(null as Date) as release_date, - Cast(null as STRING) as Book_Title, - Cast(null as STRING) as Book_ID, - Cast(null as STRING) as Authors, - Cast(null as INT64) as Copyright_Year, - Cast(null as STRING) as Disciplines, - Cast(null as STRING) as Usage_Type, - Cast(null as STRING) as Institution, - Cast(null as INT64) as Total_Item_Requests -), - -{# -The purpose of this block of SQL is to create an empty row of data, which comforms to the jstor country raw data. - -Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'jstor_country_table_id'. -This will either point to 'empty_jstor_country' (the name of this SQL block) or the name of the real data table in bigquery. -The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. -Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. -#} -empty_jstor_country as ( - SELECT - Cast(null as STRING) as eISBN, - Cast(null as STRING) as ISBN, - CAST(null as Date) as release_date, - Cast(null as STRING) as Book_Title, - Cast(null as STRING) as Book_ID, - Cast(null as STRING) as Authors, - Cast(null as INT64) as Copyright_Year, - Cast(null as STRING) as Disciplines, - Cast(null as STRING) as Usage_Type, - Cast(null as STRING) as Country_Name, - Cast(null as INT64) as Total_Item_Requests -), - -{# -The purpose of this block of SQL is to create an empty row of data, which comforms to the google books traffic raw data. - -Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'google_books_traffic_table_id'. -This will either point to 'empty_google_books_traffic' (the name of this SQL block) or the name of the real data table in bigquery. -The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. -Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. -#} -empty_google_books_traffic as ( - SELECT - CAST(NULL as STRING) as Primary_ISBN, - CAST(NULL as DATE) as release_date, - CAST(NULL as STRING) as title, - CAST(NULL as INT64) as Book_Visits_BV_, - CAST(NULL as INT64) as Non_Unique_Buy_Clicks, - CAST(NULL as INT64) as BV_with_Buy_Clicks, - CAST(NULL as INT64) as Pages_Viewed, - CAST(NULL as INT64) as BV_with_Pages_Viewed, - CAST(NULL as FLOAT64) as Buy_Link_CTR -), - -{# -The purpose of this block of SQL is to create an empty row of data, which comforms to the google books sales raw data. - -Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'google_books_sales_table_id'. -This will either point to 'empty_google_books_sales' (the name of this SQL block) or the name of the real data table in bigquery. -The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. -Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. -#} -empty_google_books_sales as ( - SELECT - CAST(NULL as STRING) as Primary_ISBN, - CAST(NULL as STRING) as Imprint_Name, - CAST(NULL as STRING) as Title, - CAST(NULL as STRING) as Author, - CAST(NULL as DATE) as release_date, - CAST(NULL as INT64) as qty, - CAST(NULL as STRING) as Country_of_Sale -), - -{# -The purpose of this block of SQL is to create an empty row of data, which comforms to the google analytics raw data. - -Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'google_analytics3_table_id'. -This will either point to 'empty_google_analytics3' (the name of this SQL block) or the name of the real data table in bigquery. -The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. -Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. -#} -empty_google_analytics3 as ( - SELECT - CAST(null as STRING) as publication_id, - CAST(null as DATE) as release_date, - CAST(null as FLOAT64) as average_time, - CAST(null as STRING) as publication_whole_or_part, - CAST(null as STRING) as publication_type, - CAST(null as STRING) as publication_format, - STRUCT( - [STRUCT(CAST(NULL as STRING) as name, CAST(null as INT64) as value)] as country - ) as {{ ga3_views_field }} -), - -empty_internet_archive as ( - SELECT - CAST(null as STRING) as ISBN13, - CAST(null as STRING) as title, - CAST(null as INTEGER) as value, - CAST(null as DATE) as month, - CAST(null as DATE) as release_date, -), - -empty_worldreader as ( - SELECT - CAST(null as STRING) as ISBN13, - CAST(null as STRING) as title, - CAST(null as STRING) as country_code, - CAST(null as STRING) as country_name, - CAST(null as INTEGER) as value, - CAST(null as DATE) as month, - CAST(null as DATE) as release_date, -), - -{# -The purpose of this block of SQL is to create an empty row of data, which comforms to the UCL Discovery raw data. - -Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'ucl_discovery_table_id'. -This will either point to 'empty_ucl_discovery' (the name of this SQL block) or the name of the real data table in bigquery. -The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. -Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. -#} -empty_ucl_discovery as ( - SELECT - CAST(NULL as STRING) as ISBN, - CAST(NULL as STRING) as eprint_id, - CAST(NULL as DATE) as release_date, - CAST(NULL as INT64) as total_downloads, - [STRUCT(CAST(NULL as INT64) as count, CAST(NULL as STRING) as value)] as country -), - -{# - -#} -empty_work_ids as ( - SELECT - CAST(NULL as STRING) as isbn13, - CAST(NULL as STRING) as work_id, -), - -{# - -#} -empty_work_family_ids as ( - SELECT - CAST(NULL as STRING) as isbn13, - CAST(NULL as STRING) as work_family_id, -), - {# This block of SQL queries data in an ONIX release, and produces a normalised and focused dataset to pass into the next step of this workflow. @@ -484,419 +109,6 @@ onix_ebook_titles as ( ) ), -# Google Analytics -{# -The purpose of this block of SQL is to organise the metrics from google analytics for easier consumption of downstream queries. - -Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'google_analytics3_table_id'. -This will either point to 'empty_google_analytics3' (defined above as an empty row) or the name of the real data table in bigquery. -The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. -Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. -#} -google_analytics3_grouped_metrics AS( - SELECT - publication_id, - release_date, - publication_whole_or_part, - publication_format, - IF - (publication_format = 'PDF' - AND publication_whole_or_part = 'whole', - group_items_google_analytics3(ARRAY_CONCAT_AGG({{ ga3_views_field }}.country)), - ARRAY_AGG(STRUCT(CAST(NULL as STRING) as name, CAST(null as INT64) as value))) AS pdf_book_country, - IF - (publication_format = 'PDF' - AND publication_whole_or_part = 'part', - group_items_google_analytics3(ARRAY_CONCAT_AGG({{ ga3_views_field }}.country)), - ARRAY_AGG(STRUCT(CAST(NULL as STRING) as name, CAST(null as INT64) as value))) AS pdf_chapter_country, - IF - (publication_format = 'HTML' - AND publication_whole_or_part = 'whole', - group_items_google_analytics3(ARRAY_CONCAT_AGG({{ ga3_views_field }}.country)), - ARRAY_AGG(STRUCT(CAST(NULL as STRING) as name, CAST(null as INT64) as value))) AS html_book_country, - IF - (publication_format = 'HTML' - AND publication_whole_or_part = 'part', - group_items_google_analytics3(ARRAY_CONCAT_AGG({{ ga3_views_field }}.country)), - ARRAY_AGG(STRUCT(CAST(NULL as STRING) as name, CAST(null as INT64) as value))) AS html_chapter_country, - IF - (publication_format = 'EPUB' - AND publication_whole_or_part = 'whole', - group_items_google_analytics3(ARRAY_CONCAT_AGG({{ ga3_views_field }}.country)), - ARRAY_AGG(STRUCT(CAST(NULL as STRING) as name, CAST(null as INT64) as value))) AS epub_book_country, - IF - (publication_format = 'EPUB' - AND publication_whole_or_part = 'part', - group_items_google_analytics3(ARRAY_CONCAT_AGG({{ ga3_views_field }}.country)), - ARRAY_AGG(STRUCT(CAST(NULL as STRING) as name, CAST(null as INT64) as value))) AS epub_chapter_country, - IF - (publication_format = 'MOBI' - AND publication_whole_or_part = 'whole', - group_items_google_analytics3(ARRAY_CONCAT_AGG({{ ga3_views_field }}.country)), - ARRAY_AGG(STRUCT(CAST(NULL as STRING) as name, CAST(null as INT64) as value))) AS mobi_book_country, - IF - (publication_format = 'MOBI' - AND publication_whole_or_part = 'part', - group_items_google_analytics3(ARRAY_CONCAT_AGG({{ ga3_views_field }}.country)), - ARRAY_AGG(STRUCT(CAST(NULL as STRING) as name, CAST(null as INT64) as value))) AS mobi_chapter_country, - IF - (publication_format IN ('PDF','HTML', 'EPUB', 'MOBI') - AND publication_whole_or_part IN ('whole', 'part'), - group_items_google_analytics3(ARRAY_CONCAT_AGG({{ ga3_views_field }}.country)), - ARRAY_AGG(STRUCT(CAST(NULL as STRING) as name, CAST(null as INT64) as value))) AS downloads_total_country, - IF - (publication_whole_or_part = '(citation)', - group_items_google_analytics3(ARRAY_CONCAT_AGG({{ ga3_views_field }}.country)), - ARRAY_AGG(STRUCT(CAST(NULL as STRING) as name, CAST(null as INT64) as value))) AS views_total_country, - FROM - `{{ google_analytics3_table_id }}` - WHERE - publication_type = "book" - GROUP BY - publication_id, - release_date, - publication_whole_or_part, - publication_format ), - -google_analytics3_metrics AS ( - SELECT - publication_id AS ISBN13, - release_date, - STRUCT( ARRAY_CONCAT_AGG(views_total_country) AS views_total_country, - group_items_google_analytics3(ARRAY_CONCAT_AGG(downloads_total_country)) AS downloads_total_country, - ARRAY_CONCAT_AGG(pdf_book_country) AS downloads_pdf_book_country, - ARRAY_CONCAT_AGG(pdf_chapter_country) AS downloads_pdf_chapter_country, - ARRAY_CONCAT_AGG(html_book_country) AS downloads_html_book_country, - ARRAY_CONCAT_AGG(html_chapter_country) AS downloads_html_chapter_country, - ARRAY_CONCAT_AGG(epub_book_country) AS downloads_epub_book_country, - ARRAY_CONCAT_AGG(epub_chapter_country) AS downloads_epub_chapter_country, - ARRAY_CONCAT_AGG(mobi_book_country) AS downloads_mobi_book_country, - ARRAY_CONCAT_AGG(mobi_chapter_country) AS downloads_mobi_chapter_country ) AS metrics - FROM - google_analytics3_grouped_metrics - GROUP BY - publication_id, - release_date ), - -# Google Books Sales Metrics -{# -The purpose of this block of SQL is to organise the metrics from google book sales for easier consumption of downstream queries. - -Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'google_books_sales_table_id'. -This will either point to 'empty_google_books_sales' (defined above as an empty row) or the name of the real data table in bigquery. -The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. -Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. -#} -google_books_sales_metrics as ( - SELECT - Primary_ISBN as ISBN13, - release_date, - STRUCT(SUM(qty) as qty, group_items_google_books_sales(ARRAY_AGG(STRUCT(Country_of_Sale, qty))) as countries) as metrics - FROM `{{ google_books_sales_table_id }}` - GROUP BY Primary_ISBN, release_date -), - -# Google Books Sales Metadata -{# -The purpose of this block of SQL is to organise the Metadata from google book sales for easier consumption of downstream queries. - -Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'google_books_sales_table_id'. -This will either point to 'empty_google_books_sales' (defined above as an empty row) or the name of the real data table in bigquery. -The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. -Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. -#} -google_books_sales_metadata as ( - SELECT - Primary_ISBN as ISBN13, MAX(Imprint_Name) as Imprint_Name, MAX(Title) as Title, MAX(Author) as Author - FROM `{{ google_books_sales_table_id }}` - GROUP BY Primary_ISBN -), - -# Google Books Traffic Metrics -{# -The purpose of this block of SQL is to organise the metrics from google book traffic for easier consumption of downstream queries. - -Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'google_books_traffic_table_id'. -This will either point to 'empty_google_books_traffic' (defined above as an empty row) or the name of the real data table in bigquery. -The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. -Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. -#} -google_books_traffic_metrics as ( - SELECT - Primary_ISBN as ISBN13, - release_date, - STRUCT( - SUM(Book_Visits_BV_) as Book_Visits_BV_, SUM(BV_with_Pages_Viewed) as BV_with_Pages_Viewed, SUM(Non_Unique_Buy_Clicks) as Non_Unique_Buy_Clicks, - SUM(BV_with_Buy_Clicks) as BV_with_Buy_Clicks, SUM(Buy_Link_CTR) as Buy_Link_CTR, SUM(Pages_Viewed) as Pages_Viewed - ) as metrics - FROM `{{ google_books_traffic_table_id }}` - GROUP BY Primary_ISBN, release_date -), - -# Google Books Traffic Metadata -{# -The purpose of this block of SQL is to organise the Metadata from google book traffic for easier consumption of downstream queries. - -Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'google_books_traffic_table_id'. -This will either point to 'empty_google_books_traffic' (defined above as an empty row) or the name of the real data table in bigquery. -The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. -Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. -#} -google_books_traffic_metadata as ( - SELECT - Primary_ISBN as ISBN13, MAX(title) as Title - FROM `{{ google_books_traffic_table_id }}` - GROUP BY Primary_ISBN -), - -# JSTOR Country Metircs -{# -The purpose of this block of SQL is to organise the metrics from JSTOR country for easier consumption of downstream queries. - -Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'jstor_country_table_id'. -This will either point to 'empty_jstor_country' (defined above as an empty row) or the name of the real data table in bigquery. -The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. -Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. -#} -jstor_country_metrics as ( - SELECT - eISBN as ISBN13, release_date, group_items_jstor_country(ARRAY_AGG(STRUCT(Country_name, Total_Item_Requests))) as metrics - FROM `{{ jstor_country_table_id }}` - GROUP BY eISBN, release_date -), - -# JSTOR Country Metadata -{# -The purpose of this block of SQL is to organise the Metadata from JSTOR country for easier consumption of downstream queries. - -Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'jstor_country_table_id'. -This will either point to 'empty_jstor_country' (defined above as an empty row) or the name of the real data table in bigquery. -The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. -Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. -#} -jstor_country_metadata as ( - SELECT - eISBN as ISBN13, MAX(Book_Title) as Book_Title, - MAX(Book_ID) as Book_ID, MAX(Authors) as Authors, MAX(ISBN) as ISBN, eISBN, - MAX(Copyright_Year) as Copyright_Year, MAX(Disciplines) as Disciplines, MAX(Usage_Type) as Usage_Type - FROM `{{ jstor_country_table_id }}` - GROUP BY eISBN -), - -# JSTOR Institutions Metircs -{# -The purpose of this block of SQL is to organise the metrics from JSTOR institution for easier consumption of downstream queries. - -Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'jstor_institution_table_id'. -This will either point to 'empty_jstor_institution' (defined above as an empty row) or the name of the real data table in bigquery. -The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. -Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. -#} -jstor_institution_metrics as ( - SELECT - eISBN as ISBN13, release_date, group_items_jstor_institution(ARRAY_AGG(STRUCT(Institution, Total_Item_Requests))) as metrics - FROM `{{ jstor_institution_table_id }}` - GROUP BY eISBN, release_date -), - -# JSTOR Institutions Metadata -{# -The purpose of this block of SQL is to organise the Metadata from JSTOR institution for easier consumption of downstream queries. - -Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'jstor_institution_table_id'. -This will either point to 'empty_jstor_institution' (defined above as an empty row) or the name of the real data table in bigquery. -The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. -Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. -#} -jstor_institution_metadata as ( - SELECT - eISBN as ISBN13, - MAX(Book_Title) as Book_Title, MAX(Book_ID) as Book_ID, MAX(Authors) as Authors, MAX(ISBN) as ISBN, eISBN, - MAX(Copyright_Year) as Copyright_Year, MAX(Disciplines) as Disciplines, MAX(Usage_Type) as Usage_Type - FROM `{{ jstor_institution_table_id }}` - GROUP BY eISBN -), - -# IRUS OPAEN Metrics -{# -The purpose of this block of SQL is to organise the metrics from IRUS OAPEN for easier consumption of downstream queries. - -Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'oapen_table_id'. -This will either point to 'empty_irus_oapen' (defined above as an empty row) or the name of the real data table in bigquery. -The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. -Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. -#} -irus_oapen_metrics as ( - SELECT - ISBN as ISBN13, - release_date, - STRUCT( - MAX(version) as version, SUM(title_requests) as title_requests, SUM(total_item_investigations) as total_item_investigations, - SUM(total_item_requests) as total_item_requests, SUM(unique_item_investigations) as unique_item_investigations, - SUM(unique_item_requests) as unique_item_requests, group_items_irus_country(ARRAY_CONCAT_AGG(country)) as country, - group_items_irus_location(ARRAY_CONCAT_AGG(locations)) as locations - ) as metrics - FROM `{{ irus_oapen_table_id }}` - GROUP BY ISBN, release_date -), - -# IRUS OAPEN Metadata -{# -The purpose of this block of SQL is to organise the Metadata from IRUS OAPEN for easier consumption of downstream queries. - -Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'irus_oapen_table_id'. -This will either point to 'empty_irus_oapen' (defined above as an empty row) or the name of the real data table in bigquery. -The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. -Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. -#} -irus_oapen_metadata as ( - SELECT - ISBN as ISBN13, MAX(book_title) as book_title, MAX(publisher) as publisher - FROM `{{ irus_oapen_table_id }}` - GROUP BY ISBN -), - -# IRUS Fulcrum Metrics -{# -The purpose of this block of SQL is to organise the metrics from IRUS Fulcrum for easier consumption of downstream queries. - -Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'irus_fulcrum_table_id'. -This will either point to 'empty_irus_fulcrum' (defined above as an empty row) or the name of the real data table in bigquery. -The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. -Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. -#} -irus_fulcrum_metrics as ( - SELECT - ISBN as ISBN13, - release_date, - STRUCT( - SUM(total_item_investigations) as total_item_investigations, - SUM(total_item_requests) as total_item_requests, SUM(unique_item_investigations) as unique_item_investigations, - SUM(unique_item_requests) as unique_item_requests, group_items_irus_fulcrum_country(ARRAY_CONCAT_AGG(country)) as country - ) as metrics - FROM `{{ irus_fulcrum_table_id }}` - GROUP BY ISBN, release_date -), - -# IRUS Fulcrum Metadata -{# -The purpose of this block of SQL is to organise the Metadata from IRUS OAPEN for easier consumption of downstream queries. - -Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'irus_fulcrum_table_id'. -This will either point to 'empty_irus_oapen' (defined above as an empty row) or the name of the real data table in bigquery. -The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. -Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. -#} -irus_fulcrum_metadata as ( - SELECT - ISBN as ISBN13, MAX(book_title) as book_title, MAX(publisher) as publisher - FROM `{{ irus_fulcrum_table_id }}` - GROUP BY ISBN -), - -# UCL Discovery Metrics -{# -The purpose of this block of SQL is to organise the metrics from UCL Discovery for easier consumption of downstream queries. - -Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'ucl_discovery_table_id'. -This will either point to 'empty_ucl_discovery' (defined above as an empty row) or the name of the real data table in bigquery. -The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. -Providng an empty row enables simplicity of the downstream queries and also means the resulting schema across all publishers is the same. -#} -ucl_discovery_metrics AS ( - SELECT - ISBN as ISBN13, - release_date, - STRUCT( - MAX(total_downloads) AS total_downloads, - ARRAY_AGG(STRUCT( - ucl_country.value AS country_code, - countries.iso_name AS country_name, - ucl_country.count AS country_downloads)) AS country - ) AS metrics - FROM - `{{ ucl_discovery_table_id }}`, - UNNEST(country) AS ucl_country - LEFT JOIN - `{{ country_table_id }}` AS countries - ON - ucl_country.value = countries.alpha2 - GROUP BY - ISBN13, release_date -), - -# UCL Discovery Metadata -{# -The purpose of this block of SQL is to organise the Metadata from UCL Discovery for easier consumption of downstream queries. - -Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'irus_fulcrum_table_id'. -This will either point to 'empty_irus_oapen' (defined above as an empty row) or the name of the real data table in bigquery. -The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. -Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. -#} -ucl_discovery_metadata as ( - SELECT - ISBN as ISBN13, - eprint_id - FROM `{{ ucl_discovery_table_id }}` - GROUP BY ISBN13, eprint_id -), - -# Worldreader Metrics -worldreader_metrics as ( - SELECT - ISBN13, - release_date, - STRUCT( - SUM(value) as total_downloads, - ARRAY_AGG(STRUCT( - country_code, - country_name, - value AS downloads - )) AS country - ) AS metrics - #group_items_worldreader_country(ARRAY_CONCAT(country_code, country_name, value)) as country - #ARRAY_AGG(value) as country_downloads, - #ARRAY_AGG(country_name) as country - FROM `{{ worldreader_table_id }}` - GROUP BY ISBN13, release_date -), - -# Worldreader Metadata -worldreader_metadata as ( - SELECT - ISBN13, - MAX(title) as book_title, - FROM `{{ worldreader_table_id }}` - GROUP BY ISBN13 -), - -# Internet Archive Metrics -internet_archive_metrics as ( - SELECT - ISBN13, - release_date, - STRUCT( - SUM(value) as total_downloads - ) as metrics - FROM `{{ internet_archive_table_id }}` - GROUP BY ISBN13, release_date -), - -# Internet Archive Metadata -internet_archive_metadata as ( - SELECT - ISBN13, - MAX(title) as book_title - FROM `{{ internet_archive_table_id }}` - GROUP BY ISBN13 -), - - -# Crossref Events -{# -The purpose of this block of SQL is to -#} crossref_events as ( SELECT public_data.isbn as ISBN13, @@ -958,40 +170,19 @@ metrics as ( ARRAY_AGG(STRUCT( ebook_months.release_date as month, crossref_events.metrics as crossref_events, - google_analytics.metrics as google_analytics, - google_books_sales.metrics as google_books_sales, - google_books_traffic.metrics as google_books_traffic, - jstor_country.metrics as jstor_country, - jstor_institution.metrics as jstor_institution, - irus_oapen.metrics as irus_oapen, - irus_fulcrum.metrics as irus_fulcrum, - ucl_discovery.metrics as ucl_discovery, - worldreader.metrics as worldreader, - internet_archive.metrics as internet_archive + {% for dp in data_partners %} + {{ dp.type_id + ".metrics" }} AS {{ dp.type_id }} + {% endofor %} ) ORDER BY ebook_months.release_date DESC) as months FROM ebook_months - LEFT JOIN google_analytics3_metrics as google_analytics ON ebook_months.ISBN13 = google_analytics.ISBN13 AND ebook_months.release_date = google_analytics.release_date - LEFT JOIN google_books_sales_metrics as google_books_sales ON ebook_months.ISBN13 = google_books_sales.ISBN13 AND ebook_months.release_date = google_books_sales.release_date - LEFT JOIN google_books_traffic_metrics as google_books_traffic ON ebook_months.ISBN13 = google_books_traffic.ISBN13 AND ebook_months.release_date = google_books_traffic.release_date - LEFT JOIN jstor_country_metrics as jstor_country ON ebook_months.ISBN13 = jstor_country.ISBN13 AND ebook_months.release_date = jstor_country.release_date - LEFT JOIN jstor_institution_metrics as jstor_institution ON ebook_months.ISBN13 = jstor_institution.ISBN13 AND ebook_months.release_date = jstor_institution.release_date - LEFT JOIN irus_oapen_metrics as irus_oapen ON ebook_months.ISBN13 = irus_oapen.ISBN13 AND ebook_months.release_date = irus_oapen.release_date - LEFT JOIN irus_fulcrum_metrics as irus_fulcrum ON ebook_months.ISBN13 = irus_fulcrum.ISBN13 AND ebook_months.release_date = irus_fulcrum.release_date - LEFT JOIN ucl_discovery_metrics as ucl_discovery ON ebook_months.ISBN13 = ucl_discovery.ISBN13 AND ebook_months.release_date = ucl_discovery.release_date - LEFT JOIN worldreader_metrics as worldreader ON ebook_months.ISBN13 = worldreader.ISBN13 AND ebook_months.release_date = worldreader.release_date - LEFT JOIN internet_archive_metrics as internet_archive ON ebook_months.ISBN13 = internet_archive.ISBN13 AND ebook_months.release_date = internet_archive.release_date + {% for dp in data_partners %} + LEFT JOIN {{ dp.type_id + "_metrics" }} AS {{ dp.type_id }} ON ebook_months.ISBN13 = {{ dp.type_id + ".ISBN13" }} AND ebook_months.release_date = {{ dp.type_id + ".release_date" }} + {% endfor %} LEFT JOIN crossref_events as crossref_events ON ebook_months.ISBN13 = crossref_events.ISBN13 AND ebook_months.release_date = crossref_events.release_date - WHERE google_analytics.metrics IS NOT NULL - OR google_books_sales.metrics IS NOT NULL - OR google_books_traffic.metrics IS NOT NULL - OR jstor_country.metrics IS NOT NULL - OR jstor_institution.metrics IS NOT NULL - OR irus_oapen.metrics IS NOT NULL - OR irus_fulcrum.metrics IS NOT NULL - OR ucl_discovery.metrics IS NOT NULL - OR worldreader.metrics IS NOT NULL - OR internet_archive.metrics IS NOT NULL - OR crossref_events.metrics IS NOT NULL + WHERE crossref_events.metrics IS NOT NULL + {% for dp in data_partners %} + OR {{ dp.type_id + ".metrics" }} IS NOT NULL + {% endfor %} GROUP BY ebook_months.ISBN13 ) @@ -1007,24 +198,17 @@ SELECT empty_work_ids.work_id, empty_work_family_ids.work_family_id, STRUCT( - crossref_objects, chapters, events.overall as events, google_books_sales_metadata as google_books_sales, - google_books_traffic_metadata as google_books_traffic, jstor_country_metadata as jstor_metadata, - jstor_institution_metadata as jstor_institution_metadata, irus_oapen_metadata as irus_oapen_metadata, - irus_fulcrum_metadata as irus_fulcrum_metadata, ucl_discovery_metadata as ucl_discovery_metadata, - worldreader_metadata as worldreader_metadata, internet_archive_metadata as internet_archive_metadata + crossref_objects, chapters, events.overall as events, + {% for dp in data_partners %} + {{ dp.type_id + "_metadata" }}, + {% endfor %} ) as metadata, metrics.months FROM onix_ebook_titles -LEFT JOIN {% if onix_workflow %} `{{ workid_table_id }}` {% else %} empty_work_ids {% endif %} as empty_work_ids on empty_work_ids.isbn13 = onix_ebook_titles.isbn13 -LEFT JOIN {% if onix_workflow %} `{{ workfamilyid_table_id }}` {% else %} empty_work_family_ids {% endif %} as empty_work_family_ids on empty_work_family_ids.isbn13 = onix_ebook_titles.isbn13 -LEFT JOIN metrics as metrics on metrics.ISBN13 = onix_ebook_titles.ISBN13 -LEFT JOIN google_books_sales_metadata on google_books_sales_metadata.ISBN13 = onix_ebook_titles.ISBN13 -LEFT JOIN google_books_traffic_metadata on google_books_traffic_metadata.ISBN13 = onix_ebook_titles.ISBN13 -LEFT JOIN jstor_country_metadata on jstor_country_metadata.ISBN13 = onix_ebook_titles.ISBN13 -LEFT JOIN jstor_institution_metadata on jstor_institution_metadata.ISBN13 = onix_ebook_titles.ISBN13 -LEFT JOIN irus_oapen_metadata on irus_oapen_metadata.ISBN13 = onix_ebook_titles.ISBN13 -LEFT JOIN irus_fulcrum_metadata on irus_fulcrum_metadata.ISBN13 = onix_ebook_titles.ISBN13 -LEFT JOIN ucl_discovery_metadata on ucl_discovery_metadata.ISBN13 = onix_ebook_titles.ISBN13 -LEFT JOIN worldreader_metadata on worldreader_metadata.ISBN13 = onix_ebook_titles.ISBN13 -LEFT JOIN internet_archive_metadata on internet_archive_metadata.ISBN13 = onix_ebook_titles.ISBN13 +LEFT JOIN `{{ workid_table_id }}` ON `{{ workid_table_id }}`.isbn13 = onix_ebook_titles.ISBN13 +LEFT JOIN `{{ workfamilyid_table_id }}` ON `{{ workfamilyid_table_id }}`.isbn13 = onix_ebook_titles.ISBN13 +LEFT JOIN metrics AS metrics ON metrics.ISBN13 = onix_ebook_titles.ISBN13 +{% for dp in data_partners %} + LEFT JOIN {{ dp.type_id + "_metadata" }} ON {{ dp.type_id + "_metadata.ISBN13" }} = onix_ebook_titles.ISBN13 +{% endfor %} LEFT JOIN `{{ book_table_id }}` as public_data on public_data.isbn = onix_ebook_titles.ISBN13; diff --git a/oaebu_workflows/sql/internet_archive/bp_body_internet_archive.sql.jinja2 b/oaebu_workflows/sql/internet_archive/bp_body_internet_archive.sql.jinja2 new file mode 100644 index 00000000..f4a65abf --- /dev/null +++ b/oaebu_workflows/sql/internet_archive/bp_body_internet_archive.sql.jinja2 @@ -0,0 +1,23 @@ +internet_archive_metrics as ( + SELECT + ISBN13, + release_date, + STRUCT( + SUM(value) as total_downloads + ) as metrics + FROM + `{{ internet_archive_table_id }}` + GROUP BY + ISBN13, + release_date +), +# Internet Archive Metadata +internet_archive_metadata as ( + SELECT + ISBN13, + MAX(title) as book_title + FROM + `{{ internet_archive_table_id }}` + GROUP BY + ISBN13 +), \ No newline at end of file diff --git a/oaebu_workflows/sql/worldreader/bp_body_worldreader.sql.jinja2 b/oaebu_workflows/sql/worldreader/bp_body_worldreader.sql.jinja2 new file mode 100644 index 00000000..b7d1c2ef --- /dev/null +++ b/oaebu_workflows/sql/worldreader/bp_body_worldreader.sql.jinja2 @@ -0,0 +1,32 @@ +worldreader_metrics as ( + SELECT + ISBN13, + release_date, + STRUCT( + SUM(value) as total_downloads, + ARRAY_AGG( + STRUCT( + country_code, + country_name, + value AS downloads + ) + ) AS country + ) AS metrics #group_items_worldreader_country(ARRAY_CONCAT(country_code, country_name, value)) as country + #ARRAY_AGG(value) as country_downloads, + #ARRAY_AGG(country_name) as country + FROM + `{{ worldreader_table_id }}` + GROUP BY + ISBN13, + release_date +), +# Worldreader Metadata +worldreader_metadata as ( + SELECT + ISBN13, + MAX(title) as book_title, + FROM + `{{ worldreader_table_id }}` + GROUP BY + ISBN13 +), \ No newline at end of file diff --git a/oaebu_workflows/sql/worldreader/bp_functions_worldreader.sql b/oaebu_workflows/sql/worldreader/bp_functions_worldreader.sql new file mode 100644 index 00000000..02b0bf6d --- /dev/null +++ b/oaebu_workflows/sql/worldreader/bp_functions_worldreader.sql @@ -0,0 +1,18 @@ +Output Schema: name STRING NULLABLE code STRING NULLABLE name INTEGER NULLABLE downloads INTEGER NULLABLE CREATE TEMP FUNCTION group_items_worldreader_country( + items ARRAY < STRUCT < country_name STRING, + code STRING, + downloads INT64 > > +) as ( + ARRAY( + ( + SELECT + AS STRUCT country_name, + MAX(code) as country_code, + SUM(downloads) as downloads + FROM + UNNEST(items) + GROUP BY + country_name + ) + ) +); \ No newline at end of file diff --git a/oaebu_workflows/ucl_discovery_telescope/sql/bp_body_ucl_discovery.sql.jinja2 b/oaebu_workflows/ucl_discovery_telescope/sql/bp_body_ucl_discovery.sql.jinja2 new file mode 100644 index 00000000..b82d9428 --- /dev/null +++ b/oaebu_workflows/ucl_discovery_telescope/sql/bp_body_ucl_discovery.sql.jinja2 @@ -0,0 +1,42 @@ +-- The purpose of this block of SQL is to organise the metrics from UCL Discovery for easier consumption of downstream queries. +-- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'ucl_discovery_table_id'. +-- This will either point to 'empty_ucl_discovery' (defined above as an empty row) or the name of the real data table in bigquery. +-- The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. +-- Providng an empty row enables simplicity of the downstream queries and also means the resulting schema across all publishers is the same. +ucl_discovery_metrics AS ( + SELECT + ISBN as ISBN13, + release_date, + STRUCT( + MAX(total_downloads) AS total_downloads, + ARRAY_AGG( + STRUCT( + ucl_country.value AS country_code, + countries.iso_name AS country_name, + ucl_country.count AS country_downloads + ) + ) AS country + ) AS metrics + FROM + `{{ ucl_discovery_table_id }}`, + UNNEST(country) AS ucl_country + LEFT JOIN `{{ country_table_id }}` AS countries ON ucl_country.value = countries.alpha2 + GROUP BY + ISBN13, + release_date +), +-- The purpose of this block of SQL is to organise the Metadata from UCL Discovery for easier consumption of downstream queries. +-- Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'irus_fulcrum_table_id'. +-- This will either point to 'empty_irus_oapen' (defined above as an empty row) or the name of the real data table in bigquery. +-- The reason for the choice of selecting an empty row, is that some partners will not have corresponding data to query. +-- Providng an empty row enable simplicity of the downstream queries and also means the resulting schema across all publishers is the same. +ucl_discovery_metadata as ( + SELECT + ISBN as ISBN13, + eprint_id + FROM + `{{ ucl_discovery_table_id }}` + GROUP BY + ISBN13, + eprint_id +), \ No newline at end of file