From 026055f9e6225362e704568e0190801bf484cf07 Mon Sep 17 00:00:00 2001 From: Keegan Smith Date: Fri, 24 May 2024 14:02:32 +0800 Subject: [PATCH] Feature/bespoke data partners (#231) Co-authored-by: keegansmith21 --- dags/oaebu_workflows/oaebu_partners.py | 61 ++++++-- .../onix_workflow/onix_workflow.py | 143 +++++++++++------- .../book_metrics_altmetrics_pilot.json | 14 ++ .../book_metrics_amazon_ltd_pilot.json | 14 ++ .../shmp_pilot/book_metrics_amazon_pilot.json | 14 ++ ...ok_metrics_author_jstor_country_pilot.json | 15 ++ ...ook_metrics_author_muse_country_pilot.json | 14 ++ ...k_metrics_country_jstor_country_pilot.json | 15 ++ ...ok_metrics_country_muse_country_pilot.json | 15 ++ .../book_metrics_internet_archive_pilot.json | 14 ++ .../book_metrics_jstor_country_pilot.json | 15 ++ .../book_metrics_muse_country_pilot.json | 15 ++ .../book_metrics_muse_metrics_pilot.json | 14 ++ .../shmp_pilot/book_metrics_oapen_pilot.json | 14 ++ ...book_metrics_scholarcommons_ltd_pilot.json | 14 ++ ..._metrics_scholarspace_downloads_pilot.json | 14 ++ ...book_metrics_scholarspace_views_pilot.json | 14 ++ .../book_metrics_sci_open_pilot.json | 14 ++ ...k_metrics_subject_jstor_country_pilot.json | 15 ++ ..._product_metadata_jstor_country_pilot.json | 62 ++++++++ ...duct_metadata_jstor_institution_pilot.json | 62 ++++++++ ...k_product_metadata_muse_country_pilot.json | 39 +++++ ...book_product_metrics_altmetrics_pilot.json | 14 ++ ...book_product_metrics_amazon_ltd_pilot.json | 14 ++ .../book_product_metrics_amazon_pilot.json | 14 ++ ...roduct_metrics_internet_archive_pilot.json | 14 ++ ...k_product_metrics_jstor_country_pilot.json | 20 +++ ...oduct_metrics_jstor_institution_pilot.json | 20 +++ ...ok_product_metrics_muse_country_pilot.json | 21 +++ ...ok_product_metrics_muse_metrics_pilot.json | 14 ++ .../book_product_metrics_oapen_pilot.json | 14 ++ ...duct_metrics_scholarcommons_ltd_pilot.json | 14 ++ ..._metrics_scholarspace_downloads_pilot.json | 14 ++ ...duct_metrics_scholarspace_views_pilot.json | 14 ++ .../book_product_metrics_sci_open_pilot.json | 14 ++ .../schema/shmp_pilot/generic.json | 26 ++++ .../schema/shmp_pilot/muse_country.json | 122 +++++++++++++++ .../book_metrics_altmetrics_pilot.sql | 1 + .../book_metrics_amazon_ltd_pilot.sql | 1 + .../shmp_pilot/book_metrics_amazon_pilot.sql | 1 + ...ook_metrics_author_jstor_country_pilot.sql | 1 + ...book_metrics_author_muse_country_pilot.sql | 1 + ...ountry_body_jstor_country_pilot.sql.jinja2 | 11 ++ ...country_body_muse_country_pilot.sql.jinja2 | 10 ++ ...trics_country_join_jstor_country_pilot.sql | 6 + ...etrics_country_join_muse_country_pilot.sql | 4 + ...trics_country_null_jstor_country_pilot.sql | 1 + ...etrics_country_null_muse_country_pilot.sql | 1 + ...ics_country_struct_jstor_country_pilot.sql | 1 + ...rics_country_struct_muse_country_pilot.sql | 1 + .../book_metrics_internet_archive_pilot.sql | 1 + .../book_metrics_jstor_country_pilot.sql | 8 + .../book_metrics_muse_country_pilot.sql | 8 + .../book_metrics_muse_metrics_pilot.sql | 1 + .../shmp_pilot/book_metrics_oapen_pilot.sql | 1 + .../book_metrics_scholarcommons_ltd_pilot.sql | 1 + ...k_metrics_scholarspace_downloads_pilot.sql | 1 + .../book_metrics_scholarspace_views_pilot.sql | 1 + .../book_metrics_sci_open_pilot.sql | 1 + ...k_product_body_altmetrics_pilot.sql.jinja2 | 11 ++ ...k_product_body_amazon_ltd_pilot.sql.jinja2 | 11 ++ .../book_product_body_amazon_pilot.sql.jinja2 | 11 ++ ...uct_body_internet_archive_pilot.sql.jinja2 | 11 ++ ...roduct_body_jstor_country_pilot.sql.jinja2 | 34 +++++ ...ct_body_jstor_institution_pilot.sql.jinja2 | 34 +++++ ...product_body_muse_country_pilot.sql.jinja2 | 25 +++ ...product_body_muse_metrics_pilot.sql.jinja2 | 11 ++ .../book_product_body_oapen_pilot.sql.jinja2 | 11 ++ ...t_body_scholarcommons_ltd_pilot.sql.jinja2 | 11 ++ ...dy_scholarspace_downloads_pilot.sql.jinja2 | 11 ++ ...t_body_scholarspace_views_pilot.sql.jinja2 | 11 ++ ...ook_product_body_sci_open_pilot.sql.jinja2 | 11 ++ ..._product_functions_jstor_country_pilot.sql | 19 +++ ...duct_functions_jstor_institution_pilot.sql | 19 +++ ...k_product_functions_muse_country_pilot.sql | 19 +++ .../month_metrics_sum_jstor_country_pilot.sql | 3 + .../month_metrics_sum_muse_country_pilot.sql | 3 + .../month_null_altmetrics_pilot.sql | 1 + .../month_null_amazon_ltd_pilot.sql | 1 + .../shmp_pilot/month_null_amazon_pilot.sql | 1 + .../month_null_internet_archive_pilot.sql | 1 + .../month_null_jstor_country_pilot.sql | 1 + .../month_null_muse_country_pilot.sql | 1 + .../month_null_muse_metrics_pilot.sql | 1 + .../sql/shmp_pilot/month_null_oapen_pilot.sql | 1 + .../month_null_scholarcommons_ltd_pilot.sql | 1 + ...onth_null_scholarspace_downloads_pilot.sql | 1 + .../month_null_scholarspace_views_pilot.sql | 1 + .../shmp_pilot/month_null_sci_open_pilot.sql | 1 + tests/test_oaebu_partners.py | 63 +++++++- 90 files changed, 1297 insertions(+), 67 deletions(-) create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/book_metrics_altmetrics_pilot.json create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/book_metrics_amazon_ltd_pilot.json create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/book_metrics_amazon_pilot.json create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/book_metrics_author_jstor_country_pilot.json create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/book_metrics_author_muse_country_pilot.json create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/book_metrics_country_jstor_country_pilot.json create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/book_metrics_country_muse_country_pilot.json create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/book_metrics_internet_archive_pilot.json create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/book_metrics_jstor_country_pilot.json create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/book_metrics_muse_country_pilot.json create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/book_metrics_muse_metrics_pilot.json create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/book_metrics_oapen_pilot.json create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/book_metrics_scholarcommons_ltd_pilot.json create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/book_metrics_scholarspace_downloads_pilot.json create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/book_metrics_scholarspace_views_pilot.json create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/book_metrics_sci_open_pilot.json create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/book_metrics_subject_jstor_country_pilot.json create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/book_product_metadata_jstor_country_pilot.json create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/book_product_metadata_jstor_institution_pilot.json create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/book_product_metadata_muse_country_pilot.json create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_altmetrics_pilot.json create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_amazon_ltd_pilot.json create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_amazon_pilot.json create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_internet_archive_pilot.json create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_jstor_country_pilot.json create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_jstor_institution_pilot.json create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_muse_country_pilot.json create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_muse_metrics_pilot.json create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_oapen_pilot.json create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_scholarcommons_ltd_pilot.json create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_scholarspace_downloads_pilot.json create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_scholarspace_views_pilot.json create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_sci_open_pilot.json create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/generic.json create mode 100644 dags/oaebu_workflows/schema/shmp_pilot/muse_country.json create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_metrics_altmetrics_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_metrics_amazon_ltd_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_metrics_amazon_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_metrics_author_jstor_country_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_metrics_author_muse_country_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_metrics_country_body_jstor_country_pilot.sql.jinja2 create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_metrics_country_body_muse_country_pilot.sql.jinja2 create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_metrics_country_join_jstor_country_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_metrics_country_join_muse_country_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_metrics_country_null_jstor_country_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_metrics_country_null_muse_country_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_metrics_country_struct_jstor_country_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_metrics_country_struct_muse_country_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_metrics_internet_archive_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_metrics_jstor_country_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_metrics_muse_country_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_metrics_muse_metrics_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_metrics_oapen_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_metrics_scholarcommons_ltd_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_metrics_scholarspace_downloads_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_metrics_scholarspace_views_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_metrics_sci_open_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_product_body_altmetrics_pilot.sql.jinja2 create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_product_body_amazon_ltd_pilot.sql.jinja2 create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_product_body_amazon_pilot.sql.jinja2 create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_product_body_internet_archive_pilot.sql.jinja2 create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_product_body_jstor_country_pilot.sql.jinja2 create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_product_body_jstor_institution_pilot.sql.jinja2 create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_product_body_muse_country_pilot.sql.jinja2 create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_product_body_muse_metrics_pilot.sql.jinja2 create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_product_body_oapen_pilot.sql.jinja2 create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_product_body_scholarcommons_ltd_pilot.sql.jinja2 create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_product_body_scholarspace_downloads_pilot.sql.jinja2 create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_product_body_scholarspace_views_pilot.sql.jinja2 create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_product_body_sci_open_pilot.sql.jinja2 create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_product_functions_jstor_country_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_product_functions_jstor_institution_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/book_product_functions_muse_country_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/month_metrics_sum_jstor_country_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/month_metrics_sum_muse_country_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/month_null_altmetrics_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/month_null_amazon_ltd_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/month_null_amazon_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/month_null_internet_archive_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/month_null_jstor_country_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/month_null_muse_country_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/month_null_muse_metrics_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/month_null_oapen_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/month_null_scholarcommons_ltd_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/month_null_scholarspace_downloads_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/month_null_scholarspace_views_pilot.sql create mode 100644 dags/oaebu_workflows/sql/shmp_pilot/month_null_sci_open_pilot.sql diff --git a/dags/oaebu_workflows/oaebu_partners.py b/dags/oaebu_workflows/oaebu_partners.py index 4d8e61b6..c35abcb0 100644 --- a/dags/oaebu_workflows/oaebu_partners.py +++ b/dags/oaebu_workflows/oaebu_partners.py @@ -1,4 +1,3 @@ -# Copyright 2020-2024 Curtin University # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,7 +15,8 @@ # Author: Tuan Chien, Keegan Smith import os -from typing import Union +import inspect +from typing import Union, List, Dict from dataclasses import dataclass from oaebu_workflows.config import schema_folder, sql_folder @@ -113,11 +113,11 @@ def __init__( schema_directory: str, sql_directory: str, book_product_functions: str, - export_author: bool, - export_book_metrics: bool, - export_country: bool, - export_subject: bool, - has_metdata: bool = True, + export_author: bool = False, + export_book_metrics: bool = False, + export_country: bool = False, + export_subject: bool = False, + has_metadata: bool = True, ): """ Initialises the class. Also uses the DataPartnerFiles class to set up the file names. @@ -129,7 +129,7 @@ def __init__( :param export_book_metrics: Indicates if the partner will use the book metrics export table. :param export_country: Indicates if the partner will use the country export table. :param export_subject: Indicates if the partner will use the subject export tables (bic, bisac, thema). - :param has_metdata: Whether the partner has book metadata records + :param has_metadata: Whether the partner has book metadata records """ super().__init__( type_id=type_id, @@ -147,7 +147,7 @@ def __init__( self.export_book_metrics = export_book_metrics self.export_country = export_country self.export_subject = export_subject - self.has_metadata = has_metdata + self.has_metadata = has_metadata self.files = DataPartnerFiles(partner_name=self.type_id) @@ -206,7 +206,7 @@ def __init__( export_book_metrics=True, export_country=True, export_subject=True, - has_metdata=False, + has_metadata=False, ), google_books_sales=DataPartner( type_id="google_books_sales", @@ -407,3 +407,44 @@ def partner_from_str(partner: Union[str, OaebuPartner], metadata_partner: bool = raise KeyError(f"Partner not found: {partner}").with_traceback(e.__traceback__) return partner + + +def create_bespoke_data_partners(partners: List[Dict]) -> List[DataPartner]: + return [create_bespoke_data_partner(p) for p in partners] + + +def create_bespoke_data_partner(partner_dict: dict) -> DataPartner: + """Converts a dictionary description of a partner to a DataPartner object + + :param partner: The dictionary description of the partner. Should have all required fields as per the DataPartner + object description + """ + + # Get the required parameters of the DataPartner class + # https://docs.python.org/3/library/inspect.html#inspect.Parameter.kind + required_params = [] + dp_params = inspect.signature(DataPartner.__init__).parameters + for k, v in dp_params.items(): + if k == "self": # Ignore the "self" parameter + continue + if v.default is v.empty: + required_params.append(k) + + present_params = list(partner_dict.keys()) + missing_params = [] + for p in required_params: + if p not in present_params: + missing_params.append(p) + if missing_params: + raise NameError(f"Missing required parameters for DataPartner class: {missing_params}") + + undefined_params = [] + all_params = list(k for k in dp_params.keys() if k != "self") + print(all_params) + for p in partner_dict.keys(): + if p not in all_params: + undefined_params.append(p) + if undefined_params: + raise NameError(f"Unrecognised arguments supplied to DataPartner class: {undefined_params}") + + return DataPartner(**partner_dict) diff --git a/dags/oaebu_workflows/onix_workflow/onix_workflow.py b/dags/oaebu_workflows/onix_workflow/onix_workflow.py index 30d51826..4be43daa 100644 --- a/dags/oaebu_workflows/onix_workflow/onix_workflow.py +++ b/dags/oaebu_workflows/onix_workflow/onix_workflow.py @@ -19,7 +19,7 @@ import logging import os import re -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Tuple, Union, Dict import jsonlines import pendulum @@ -35,7 +35,7 @@ from oaebu_workflows.airflow_pools import CrossrefEventsPool from oaebu_workflows.config import oaebu_user_agent_header, schema_folder as default_schema_folder, sql_folder -from oaebu_workflows.oaebu_partners import DataPartner, OaebuPartner, partner_from_str +from oaebu_workflows.oaebu_partners import DataPartner, OaebuPartner, partner_from_str, create_bespoke_data_partners from oaebu_workflows.onix_workflow.onix_work_aggregation import BookWorkAggregator, BookWorkFamilyAggregator from observatory_platform.airflow.airflow import on_failure_callback from observatory_platform.airflow.release import make_snapshot_date, set_task_state, SnapshotRelease @@ -233,10 +233,10 @@ def create_dag( bq_worksid_table_name: str = "onix_workid_isbn", bq_worksid_error_table_name: str = "onix_workid_isbn_errors", bq_workfamilyid_table_name: str = "onix_workfamilyid_isbn", - oaebu_intermediate_match_suffix: str = "_matched", skip_downloading_crossref_events: bool = True, # Run parameters data_partners: List[Union[str, OaebuPartner]] = None, + bespoke_data_partners: List[Dict] = None, ga3_views_field="page_views", schema_folder: str = default_schema_folder(workflow_module="onix_workflow"), mailto: str = "agent@observatory.academy", @@ -278,11 +278,11 @@ def create_dag( :param bq_worksid_table_name: table ID of the worksid table :param bq_worksid_error_table_name: table ID of the worksid error table :param bq_workfamilyid_table_name: table ID of the workfamilyid table - :param oaebu_intermediate_match_suffix: Suffix to append to intermediate tables :param skip_downloading_crossref_events: skip fetching new data for Crossref Events, when True, falls back to using a previous version of the table. :param data_partners: OAEBU data sources. + :param bespoke_data_partners: Dictionary-defined OAEBU Data Partners :param ga3_views_field: The name of the GA3 views field - should be either 'page_views' or 'unique_views' :param schema_folder: the SQL schema path. :param mailto: email address used to identify the user when sending requests to an API. @@ -306,6 +306,8 @@ def create_dag( metadata_partner = partner_from_str(metadata_partner, metadata_partner=True) data_partners = [partner_from_str(p) for p in data_partners] + if bespoke_data_partners: + data_partners.extend(create_bespoke_data_partners(bespoke_data_partners)) # Create pool for crossref API calls (if they don't exist) # Pools are necessary to throttle the maxiumum number of requests we can make per second and avoid 429 errors @@ -590,39 +592,26 @@ def create_book_table(release: dict, **context) -> None: set_task_state(status, context["ti"].task_id, release=release) @task_group(group_id="intermediate_tables") - def create_tasks_intermediate_tables(release: dict): + def _create_tasks_intermediate_tables(release: dict): tasks = [] for data_partner in data_partners: - task = create_intermediate_table.override(task_id=f"intermediate_{data_partner.bq_table_name}")( - release, - orig_project_id=cloud_workspace.project_id, - orig_dataset=data_partner.bq_dataset_id, - orig_table=data_partner.bq_table_name, - orig_isbn=data_partner.isbn_field_name, - sharded=data_partner.sharded, + task = _create_intermediate_table.override(task_id=f"intermediate_{data_partner.type_id}")( + release, data_partner=data_partner ) tasks.append(task) chain(tasks) @task() - def create_intermediate_table( + def _create_intermediate_table( release: dict, *, - orig_project_id: str, - orig_dataset: str, - orig_table: str, - orig_isbn: str, - sharded: bool, + data_partner: DataPartner, **context, ) -> None: - """Create an intermediate oaebu table. They are of the form datasource_matched + """Create an intermediate oaebu table. They are of the form type_id_matched :param release: Onix workflow release information. - :param orig_project_id: Project ID for the partner data. - :param orig_dataset: Dataset ID for the partner data. - :param orig_table: Table ID for the partner data. - :param orig_isbn: Name of the ISBN field in the partner data table. - :param sharded: Whether the data partner table is sharded + :param data_partner: The data partner object for the data source. """ release = OnixWorkflowRelease.from_dict(release) @@ -632,44 +621,27 @@ def create_intermediate_table( location=cloud_workspace.data_location, description="Intermediate OAEBU Tables", ) - orig_table_id = ( - bq_sharded_table_id(orig_project_id, orig_dataset, orig_table, release.snapshot_date) - if sharded - else bq_table_id(orig_project_id, orig_dataset, orig_table) - ) - output_table_name = f"{orig_table}{oaebu_intermediate_match_suffix}" - template_path = os.path.join( - sql_folder(workflow_module="onix_workflow"), "assign_workid_workfamilyid.sql.jinja2" - ) - output_table_id = bq_sharded_table_id( - cloud_workspace.project_id, - bq_oaebu_intermediate_dataset, - output_table_name, - release.snapshot_date, - ) - wid_table_id = bq_sharded_table_id( + bq_worksid_table_id = bq_sharded_table_id( cloud_workspace.project_id, bq_onix_workflow_dataset, bq_worksid_table_name, release.snapshot_date, ) - wfam_table_id = bq_sharded_table_id( + bq_workfamily_table_id = bq_sharded_table_id( cloud_workspace.project_id, bq_onix_workflow_dataset, bq_workfamilyid_table_name, release.snapshot_date, ) - # Make the table from SQL query - client = Client(project=cloud_workspace.project_id) - sql = render_template( - template_path, - orig_table_id=orig_table_id, - orig_isbn=orig_isbn, - wid_table_id=wid_table_id, - wfam_table_id=wfam_table_id, + status = create_intermediate_table( + data_partner=data_partner, + project_id=cloud_workspace.project_id, + bq_output_dataset=bq_oaebu_intermediate_dataset, + bq_worksid_table_id=bq_worksid_table_id, + bq_workfamily_table_id=bq_workfamily_table_id, + snapshot_date=release.snapshot_date, ) - status = bq_create_table_from_query(sql=sql, table_id=output_table_id, client=client) set_task_state(status, context["ti"].task_id, release=release) @task() @@ -686,11 +658,11 @@ def create_book_product_table(release: dict, **context) -> None: # Data partner table names dp_tables = { - f"{dp.type_id}_table_id": bq_sharded_table_id( - cloud_workspace.project_id, - bq_oaebu_intermediate_dataset, - f"{dp.type_id}_matched", - release.snapshot_date, + f"{dp.type_id}_table_id": bq_intermediate_table_id( + project_id=cloud_workspace.project_id, + bq_intermediate_dataset=bq_oaebu_intermediate_dataset, + dp_type_id=dp.type_id, + snapshot_date=release.snapshot_date, ) for dp in data_partners } @@ -1056,7 +1028,7 @@ def cleanup_workflow(release: dict, **context): task_create_crossref_metadata_table = create_crossref_metadata_table(xcom_release) task_create_crossref_events_table = create_crossref_events_table(xcom_release) task_create_book_table = create_book_table(xcom_release) - task_group_create_intermediate_tables = create_tasks_intermediate_tables(xcom_release) + task_group_create_intermediate_tables = _create_tasks_intermediate_tables(xcom_release) task_create_book_product_table = create_book_product_table(xcom_release) task_group_create_export_tables = create_tasks_export_tables(xcom_release) task_update_latest_export_tables = update_latest_export_tables(xcom_release) @@ -1082,6 +1054,65 @@ def cleanup_workflow(release: dict, **context): return onix_workflow() +def create_intermediate_table( + data_partner: DataPartner, + project_id: str, + bq_output_dataset: str, + bq_worksid_table_id: str, + bq_workfamily_table_id: str, + snapshot_date: str, +) -> bool: + """Create an intermediate oaebu table from a data partner. + + :param data_partner: The data partner object for the data source. + :param project_id: Project ID for the partner data. + :param bq_worksid_table_id: The ID of the worksid table + :param bq_workfamily_table_id: The ID of the workfamily table + :param snapshot_date: The snapshot date to use + :param bq_output_dataset: The name of the dataset to output the resulting table + :return: Whether the operation was successful + """ + + if data_partner.sharded: + orig_table_id = bq_sharded_table_id( + project_id, data_partner.bq_dataset_id, data_partner.bq_table_name, snapshot_date + ) + else: + orig_table_id = bq_table_id(project_id, data_partner.bq_dataset_id, data_partner.bq_table_name) + output_table_id = bq_intermediate_table_id( + project_id=project_id, + bq_intermediate_dataset=bq_output_dataset, + dp_type_id=data_partner.type_id, + snapshot_date=snapshot_date, + ) + template_path = os.path.join(sql_folder(workflow_module="onix_workflow"), "assign_workid_workfamilyid.sql.jinja2") + + # Make the table from SQL query + client = Client(project=project_id) + sql = render_template( + template_path, + orig_table_id=orig_table_id, + orig_isbn=data_partner.isbn_field_name, + wid_table_id=bq_worksid_table_id, + wfam_table_id=bq_workfamily_table_id, + ) + status = bq_create_table_from_query(sql=sql, table_id=output_table_id, client=client) + return status + + +def bq_intermediate_table_id(project_id: str, bq_intermediate_dataset: str, dp_type_id: str, snapshot_date: str) -> str: + """ + Generates a table ID for a given data partner type ID + + :param project_id: The project ID + :param dp_type_id: The type ID of the data partner + :param snapshot_date: The stringified snapshot date + :param bq_intermediate_dataset: The 'intermediate' dateset name + :return: The data partner's intermediate table ID + """ + return bq_sharded_table_id(project_id, bq_intermediate_dataset, f"{dp_type_id}_matched", snapshot_date) + + def get_doi_prefixes(dois: Iterable[str]) -> set[str]: """Convert DOIs to a set of unique DOI prefixes. diff --git a/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_altmetrics_pilot.json b/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_altmetrics_pilot.json new file mode 100644 index 00000000..df9b2505 --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_altmetrics_pilot.json @@ -0,0 +1,14 @@ +{ + "fields": [ + { + "mode": "NULLABLE", + "name": "value", + "type": "INTEGER", + "description": "Metric value" + } + ], + "mode": "NULLABLE", + "name": "altmetrics_pilot", + "type": "RECORD", + "description": "Metrics derived from Altmetrics" +} diff --git a/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_amazon_ltd_pilot.json b/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_amazon_ltd_pilot.json new file mode 100644 index 00000000..09b1ec98 --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_amazon_ltd_pilot.json @@ -0,0 +1,14 @@ +{ + "fields": [ + { + "mode": "NULLABLE", + "name": "value", + "type": "INTEGER", + "description": "Metric value" + } + ], + "mode": "NULLABLE", + "name": "amazon_ltd_pilot", + "type": "RECORD", + "description": "Metrics derived from Amazon LTD" +} diff --git a/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_amazon_pilot.json b/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_amazon_pilot.json new file mode 100644 index 00000000..8c12ae2a --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_amazon_pilot.json @@ -0,0 +1,14 @@ +{ + "fields": [ + { + "mode": "NULLABLE", + "name": "value", + "type": "INTEGER", + "description": "Metric value" + } + ], + "mode": "NULLABLE", + "name": "amazon_pilot", + "type": "RECORD", + "description": "Metrics derived from Amazon" +} diff --git a/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_author_jstor_country_pilot.json b/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_author_jstor_country_pilot.json new file mode 100644 index 00000000..91deb4cc --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_author_jstor_country_pilot.json @@ -0,0 +1,15 @@ +{ + "fields": [ + { + "mode": "NULLABLE", + "name": "Total_Item_Requests", + "type": "INTEGER", + "description": "The total number of item requests" + } + ], + "mode": "NULLABLE", + "name": "jstor_country_pilot", + "type": "RECORD", + "description": "Metrics from JSTOR" +} + diff --git a/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_author_muse_country_pilot.json b/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_author_muse_country_pilot.json new file mode 100644 index 00000000..4a3a1a9e --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_author_muse_country_pilot.json @@ -0,0 +1,14 @@ +{ + "fields": [ + { + "mode": "NULLABLE", + "name": "Total_Item_Requests", + "type": "INTEGER", + "description": "The total number of item requests" + } + ], + "mode": "NULLABLE", + "name": "muse_country_pilot", + "type": "RECORD", + "description": "Metrics from Muse Country" +} diff --git a/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_country_jstor_country_pilot.json b/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_country_jstor_country_pilot.json new file mode 100644 index 00000000..0a2771c4 --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_country_jstor_country_pilot.json @@ -0,0 +1,15 @@ +{ + "fields": [ + { + "mode": "NULLABLE", + "name": "Total_Item_Requests", + "type": "INTEGER", + "description": "Total number of request made" + } + ], + "mode": "NULLABLE", + "name": "jstor_country_pilot", + "type": "RECORD", + "description": "Metrics from JSTOR" +} + diff --git a/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_country_muse_country_pilot.json b/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_country_muse_country_pilot.json new file mode 100644 index 00000000..a4b80ef4 --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_country_muse_country_pilot.json @@ -0,0 +1,15 @@ +{ + "fields": [ + { + "mode": "NULLABLE", + "name": "Total_Item_Requests", + "type": "INTEGER", + "description": "Total number of request made" + } + ], + "mode": "NULLABLE", + "name": "muse_country_pilot", + "type": "RECORD", + "description": "Metrics from Muse Country" +} + diff --git a/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_internet_archive_pilot.json b/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_internet_archive_pilot.json new file mode 100644 index 00000000..4373349c --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_internet_archive_pilot.json @@ -0,0 +1,14 @@ +{ + "fields": [ + { + "mode": "NULLABLE", + "name": "value", + "type": "INTEGER", + "description": "Metric value" + } + ], + "mode": "NULLABLE", + "name": "internet_archive_pilot", + "type": "RECORD", + "description": "Metrics derived from Internet Archive" +} diff --git a/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_jstor_country_pilot.json b/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_jstor_country_pilot.json new file mode 100644 index 00000000..df912d9c --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_jstor_country_pilot.json @@ -0,0 +1,15 @@ +{ + "fields": [ + { + "mode": "NULLABLE", + "name": "Total_Item_Requests", + "type": "INTEGER", + "description": "Total number of item requests" + } + ], + "mode": "NULLABLE", + "name": "jstor_country_pilot", + "type": "RECORD", + "description": "Metrics from JSTOR" +} + diff --git a/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_muse_country_pilot.json b/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_muse_country_pilot.json new file mode 100644 index 00000000..7554193c --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_muse_country_pilot.json @@ -0,0 +1,15 @@ +{ + "fields": [ + { + "mode": "NULLABLE", + "name": "Total_Item_Requests", + "type": "INTEGER", + "description": "Total number of item requests" + } + ], + "mode": "NULLABLE", + "name": "muse", + "type": "RECORD", + "description": "Metrics from Muse" +} + diff --git a/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_muse_metrics_pilot.json b/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_muse_metrics_pilot.json new file mode 100644 index 00000000..39a6ff28 --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_muse_metrics_pilot.json @@ -0,0 +1,14 @@ +{ + "fields": [ + { + "mode": "NULLABLE", + "name": "value", + "type": "INTEGER", + "description": "Metric value" + } + ], + "mode": "NULLABLE", + "name": "muse_metrics_pilot", + "type": "RECORD", + "description": "Metrics derived from Muse" +} diff --git a/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_oapen_pilot.json b/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_oapen_pilot.json new file mode 100644 index 00000000..84be5161 --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_oapen_pilot.json @@ -0,0 +1,14 @@ +{ + "fields": [ + { + "mode": "NULLABLE", + "name": "value", + "type": "INTEGER", + "description": "Metric value" + } + ], + "mode": "NULLABLE", + "name": "oapen_pilot", + "type": "RECORD", + "description": "Metrics derived from OAPEN" +} diff --git a/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_scholarcommons_ltd_pilot.json b/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_scholarcommons_ltd_pilot.json new file mode 100644 index 00000000..b5648369 --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_scholarcommons_ltd_pilot.json @@ -0,0 +1,14 @@ +{ + "fields": [ + { + "mode": "NULLABLE", + "name": "value", + "type": "INTEGER", + "description": "Metric value" + } + ], + "mode": "NULLABLE", + "name": "scholarcommons_ltd_pilot", + "type": "RECORD", + "description": "Metrics derived from Scholarcommons LTD" +} diff --git a/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_scholarspace_downloads_pilot.json b/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_scholarspace_downloads_pilot.json new file mode 100644 index 00000000..1c5e564f --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_scholarspace_downloads_pilot.json @@ -0,0 +1,14 @@ +{ + "fields": [ + { + "mode": "NULLABLE", + "name": "value", + "type": "INTEGER", + "description": "Metric value" + } + ], + "mode": "NULLABLE", + "name": "scholarspace_downloads_pilot", + "type": "RECORD", + "description": "Metrics derived from Scholarspace Downloads" +} diff --git a/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_scholarspace_views_pilot.json b/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_scholarspace_views_pilot.json new file mode 100644 index 00000000..e0802f3e --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_scholarspace_views_pilot.json @@ -0,0 +1,14 @@ +{ + "fields": [ + { + "mode": "NULLABLE", + "name": "value", + "type": "INTEGER", + "description": "Metric value" + } + ], + "mode": "NULLABLE", + "name": "scholarspace_views_pilot", + "type": "RECORD", + "description": "Metrics derived from Scholarspace Views" +} diff --git a/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_sci_open_pilot.json b/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_sci_open_pilot.json new file mode 100644 index 00000000..21d495b1 --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_sci_open_pilot.json @@ -0,0 +1,14 @@ +{ + "fields": [ + { + "mode": "NULLABLE", + "name": "value", + "type": "INTEGER", + "description": "Metric value" + } + ], + "mode": "NULLABLE", + "name": "sci_open_pilot", + "type": "RECORD", + "description": "Metrics derived from Sci Open" +} diff --git a/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_subject_jstor_country_pilot.json b/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_subject_jstor_country_pilot.json new file mode 100644 index 00000000..0a2771c4 --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/book_metrics_subject_jstor_country_pilot.json @@ -0,0 +1,15 @@ +{ + "fields": [ + { + "mode": "NULLABLE", + "name": "Total_Item_Requests", + "type": "INTEGER", + "description": "Total number of request made" + } + ], + "mode": "NULLABLE", + "name": "jstor_country_pilot", + "type": "RECORD", + "description": "Metrics from JSTOR" +} + diff --git a/dags/oaebu_workflows/schema/shmp_pilot/book_product_metadata_jstor_country_pilot.json b/dags/oaebu_workflows/schema/shmp_pilot/book_product_metadata_jstor_country_pilot.json new file mode 100644 index 00000000..d6ecf5dc --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/book_product_metadata_jstor_country_pilot.json @@ -0,0 +1,62 @@ +{ + "fields": [ + { + "mode": "NULLABLE", + "name": "ISBN13", + "type": "STRING", + "description": "ISBN of the book (13 digits)" + }, + { + "mode": "NULLABLE", + "name": "Book_Title", + "type": "STRING", + "description": "Title of the book" + }, + { + "mode": "NULLABLE", + "name": "Book_ID", + "type": "STRING", + "description": "DOI of the book on JSTOR" + }, + { + "mode": "NULLABLE", + "name": "Authors", + "type": "STRING", + "description": "Author of the book" + }, + { + "mode": "NULLABLE", + "name": "ISBN", + "type": "STRING", + "description": "ISBN of the book" + }, + { + "mode": "NULLABLE", + "name": "eISBN", + "type": "STRING", + "description": "ISBN of the digital version of the book (13 digits)" + }, + { + "mode": "NULLABLE", + "name": "Copyright_Year", + "type": "INTEGER", + "description": "Publication year" + }, + { + "mode": "NULLABLE", + "name": "Disciplines", + "type": "STRING", + "description": "Subject category of the book" + }, + { + "mode": "NULLABLE", + "name": "Usage_Type", + "type": "STRING", + "description": "For our case it is Open Access" + } + ], + "mode": "NULLABLE", + "name": "jstor_country_pilot", + "type": "RECORD", + "description": "Metadata derived from JSTOR country" +} \ No newline at end of file diff --git a/dags/oaebu_workflows/schema/shmp_pilot/book_product_metadata_jstor_institution_pilot.json b/dags/oaebu_workflows/schema/shmp_pilot/book_product_metadata_jstor_institution_pilot.json new file mode 100644 index 00000000..c0d2c205 --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/book_product_metadata_jstor_institution_pilot.json @@ -0,0 +1,62 @@ +{ + "fields": [ + { + "mode": "NULLABLE", + "name": "ISBN13", + "type": "STRING", + "description": "ISBN of the book (13 digits)" + }, + { + "mode": "NULLABLE", + "name": "Book_Title", + "type": "STRING", + "description": "Title of the book" + }, + { + "mode": "NULLABLE", + "name": "Book_ID", + "type": "STRING", + "description": "DOI of the book on JSTOR" + }, + { + "mode": "NULLABLE", + "name": "Authors", + "type": "STRING", + "description": "" + }, + { + "mode": "NULLABLE", + "name": "ISBN", + "type": "STRING", + "description": "ISBN of the book (13 digits)" + }, + { + "mode": "NULLABLE", + "name": "eISBN", + "type": "STRING", + "description": "ISBN of the digital version of the book (13 digits)" + }, + { + "mode": "NULLABLE", + "name": "Copyright_Year", + "type": "INTEGER", + "description": "Publication year" + }, + { + "mode": "NULLABLE", + "name": "Disciplines", + "type": "STRING", + "description": "Subject category of the book" + }, + { + "mode": "NULLABLE", + "name": "Usage_Type", + "type": "STRING", + "description": "For our case it is Open Access" + } + ], + "mode": "NULLABLE", + "name": "jstor_institution_pilot", + "type": "RECORD", + "description": "Metadata derived from JSTOR Institutions" +} \ No newline at end of file diff --git a/dags/oaebu_workflows/schema/shmp_pilot/book_product_metadata_muse_country_pilot.json b/dags/oaebu_workflows/schema/shmp_pilot/book_product_metadata_muse_country_pilot.json new file mode 100644 index 00000000..4b74cd2b --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/book_product_metadata_muse_country_pilot.json @@ -0,0 +1,39 @@ +{ + "fields": [ + { + "mode": "NULLABLE", + "name": "ISBN13", + "type": "STRING", + "description": "ISBN of the book (13 digits)" + }, + { + "mode": "NULLABLE", + "name": "Book_Title", + "type": "STRING", + "description": "Title of the book" + }, + { + "mode": "NULLABLE", + "name": "Book_ID", + "type": "STRING", + "description": "DOI of the book on JSTOR" + }, + { + "mode": "NULLABLE", + "name": "Author", + "type": "STRING", + "description": "Author of the book" + }, + { + "mode": "NULLABLE", + "name": "Access_Type", + "type": "STRING", + "description": "Type of access" + } + ], + "mode": "NULLABLE", + "name": "muse_country_pilot", + "type": "RECORD", + "description": "Metadata derived from Muse" +} + diff --git a/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_altmetrics_pilot.json b/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_altmetrics_pilot.json new file mode 100644 index 00000000..ad5fee01 --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_altmetrics_pilot.json @@ -0,0 +1,14 @@ +{ + "fields": [ + { + "mode": "NULLABLE", + "name": "value", + "type": "INTEGER", + "description": "Metric value" + } + ], + "mode": "NULLABLE", + "name": "altmetrics_pilot", + "type": "RECORD", + "description": "Metrics derived from altmetrics" +} diff --git a/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_amazon_ltd_pilot.json b/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_amazon_ltd_pilot.json new file mode 100644 index 00000000..ea089c67 --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_amazon_ltd_pilot.json @@ -0,0 +1,14 @@ +{ + "fields": [ + { + "mode": "NULLABLE", + "name": "value", + "type": "INTEGER", + "description": "Metric value" + } + ], + "mode": "NULLABLE", + "name": "amazon_ltd_pilot", + "type": "RECORD", + "description": "Metrics derived from amazon_ltd" +} diff --git a/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_amazon_pilot.json b/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_amazon_pilot.json new file mode 100644 index 00000000..6dc1481f --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_amazon_pilot.json @@ -0,0 +1,14 @@ +{ + "fields": [ + { + "mode": "NULLABLE", + "name": "value", + "type": "INTEGER", + "description": "Metric value" + } + ], + "mode": "NULLABLE", + "name": "amazon_pilot", + "type": "RECORD", + "description": "Metrics derived from amazon" +} diff --git a/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_internet_archive_pilot.json b/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_internet_archive_pilot.json new file mode 100644 index 00000000..81a73a1a --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_internet_archive_pilot.json @@ -0,0 +1,14 @@ +{ + "fields": [ + { + "mode": "NULLABLE", + "name": "value", + "type": "INTEGER", + "description": "Metric value" + } + ], + "mode": "NULLABLE", + "name": "internet_archive_pilot", + "type": "RECORD", + "description": "Metrics derived from internet_archive" +} diff --git a/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_jstor_country_pilot.json b/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_jstor_country_pilot.json new file mode 100644 index 00000000..62a2da64 --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_jstor_country_pilot.json @@ -0,0 +1,20 @@ +{ + "fields": [ + { + "mode": "NULLABLE", + "name": "Country_name", + "type": "STRING", + "description": "Country Name" + }, + { + "mode": "NULLABLE", + "name": "Total_Item_Requests", + "type": "INTEGER", + "description": "Total number of request made from that specific country" + } + ], + "mode": "REPEATED", + "name": "jstor_country_pilot", + "type": "RECORD", + "description": "Metrics derived from JSTOR Country" +} \ No newline at end of file diff --git a/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_jstor_institution_pilot.json b/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_jstor_institution_pilot.json new file mode 100644 index 00000000..16cbc6ed --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_jstor_institution_pilot.json @@ -0,0 +1,20 @@ +{ + "fields": [ + { + "mode": "NULLABLE", + "name": "Institution", + "type": "STRING", + "description": "Institution name" + }, + { + "mode": "NULLABLE", + "name": "Total_Item_Requests", + "type": "INTEGER", + "description": "Total number of request made from that specific institution" + } + ], + "mode": "REPEATED", + "name": "jstor_institution_pilot", + "type": "RECORD", + "description": "Metrics derived from JSTOR Institutions" +} \ No newline at end of file diff --git a/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_muse_country_pilot.json b/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_muse_country_pilot.json new file mode 100644 index 00000000..ebe6dc7e --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_muse_country_pilot.json @@ -0,0 +1,21 @@ +{ + "fields": [ + { + "mode": "NULLABLE", + "name": "Country_name", + "type": "STRING", + "description": "Country Name" + }, + { + "mode": "NULLABLE", + "name": "Total_Item_Requests", + "type": "INTEGER", + "description": "Total number of request made from that specific country" + } + ], + "mode": "REPEATED", + "name": "muse_country_pilot", + "type": "RECORD", + "description": "Metrics derived from Muse" +} + diff --git a/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_muse_metrics_pilot.json b/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_muse_metrics_pilot.json new file mode 100644 index 00000000..72b3b0bf --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_muse_metrics_pilot.json @@ -0,0 +1,14 @@ +{ + "fields": [ + { + "mode": "NULLABLE", + "name": "value", + "type": "INTEGER", + "description": "Metric value" + } + ], + "mode": "NULLABLE", + "name": "muse_metrics_pilot", + "type": "RECORD", + "description": "Metrics derived from muse" +} diff --git a/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_oapen_pilot.json b/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_oapen_pilot.json new file mode 100644 index 00000000..972e73f4 --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_oapen_pilot.json @@ -0,0 +1,14 @@ +{ + "fields": [ + { + "mode": "NULLABLE", + "name": "value", + "type": "INTEGER", + "description": "Metric value" + } + ], + "mode": "NULLABLE", + "name": "oapen_pilot", + "type": "RECORD", + "description": "Metrics derived from oapen" +} diff --git a/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_scholarcommons_ltd_pilot.json b/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_scholarcommons_ltd_pilot.json new file mode 100644 index 00000000..1fbadd74 --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_scholarcommons_ltd_pilot.json @@ -0,0 +1,14 @@ +{ + "fields": [ + { + "mode": "NULLABLE", + "name": "value", + "type": "INTEGER", + "description": "Metric value" + } + ], + "mode": "NULLABLE", + "name": "scholarcommons_ltd_pilot", + "type": "RECORD", + "description": "Metrics derived from scholarcommons_ltd" +} diff --git a/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_scholarspace_downloads_pilot.json b/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_scholarspace_downloads_pilot.json new file mode 100644 index 00000000..2aeeeac4 --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_scholarspace_downloads_pilot.json @@ -0,0 +1,14 @@ +{ + "fields": [ + { + "mode": "NULLABLE", + "name": "value", + "type": "INTEGER", + "description": "Metric value" + } + ], + "mode": "NULLABLE", + "name": "scholarspace_downloads_pilot", + "type": "RECORD", + "description": "Metrics derived from scholarspace_downloads" +} diff --git a/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_scholarspace_views_pilot.json b/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_scholarspace_views_pilot.json new file mode 100644 index 00000000..3308b57c --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_scholarspace_views_pilot.json @@ -0,0 +1,14 @@ +{ + "fields": [ + { + "mode": "NULLABLE", + "name": "value", + "type": "INTEGER", + "description": "Metric value" + } + ], + "mode": "NULLABLE", + "name": "scholarspace_views_pilot", + "type": "RECORD", + "description": "Metrics derived from scholarspace_views" +} diff --git a/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_sci_open_pilot.json b/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_sci_open_pilot.json new file mode 100644 index 00000000..21d495b1 --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/book_product_metrics_sci_open_pilot.json @@ -0,0 +1,14 @@ +{ + "fields": [ + { + "mode": "NULLABLE", + "name": "value", + "type": "INTEGER", + "description": "Metric value" + } + ], + "mode": "NULLABLE", + "name": "sci_open_pilot", + "type": "RECORD", + "description": "Metrics derived from Sci Open" +} diff --git a/dags/oaebu_workflows/schema/shmp_pilot/generic.json b/dags/oaebu_workflows/schema/shmp_pilot/generic.json new file mode 100644 index 00000000..6a873c97 --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/generic.json @@ -0,0 +1,26 @@ +[ + { + "mode": "REQUIRED", + "name": "ISBN13", + "type": "STRING", + "description": "13 Digit ISBN of the book." + }, + { + "mode": "REQUIRED", + "name": "month", + "type": "DATE", + "description": "Start date for period of analytics info." + }, + { + "mode": "REQUIRED", + "name": "value", + "type": "INTEGER", + "description": "Measured value." + }, + { + "mode": "REQUIRED", + "name": "release_date", + "type": "DATE", + "description": "The end date of the release month" + } +] diff --git a/dags/oaebu_workflows/schema/shmp_pilot/muse_country.json b/dags/oaebu_workflows/schema/shmp_pilot/muse_country.json new file mode 100644 index 00000000..7a6e3268 --- /dev/null +++ b/dags/oaebu_workflows/schema/shmp_pilot/muse_country.json @@ -0,0 +1,122 @@ +[ + { + "description": "The id of the book.", + "mode": "NULLABLE", + "name": "ID", + "type": "STRING" + }, + { + "description": "Publication year.", + "mode": "NULLABLE", + "name": "YEAR", + "type": "INTEGER" + }, + { + "description": "Publication month.", + "mode": "NULLABLE", + "name": "MONTH", + "type": "INTEGER" + }, + { + "description": "Resource type.", + "mode": "NULLABLE", + "name": "RESOURCE_TYPE", + "type": "STRING" + }, + { + "description": "Resource id.", + "mode": "NULLABLE", + "name": "RESOURCE_ID", + "type": "STRING" + }, + { + "description": "ISBN of the book on MUSE.", + "mode": "NULLABLE", + "name": "ISBN", + "type": "STRING" + }, + { + "description": "Title of the book.", + "mode": "NULLABLE", + "name": "RESOURCE", + "type": "STRING" + }, + { + "description": "URL of the book.", + "mode": "NULLABLE", + "name": "RESOURCE_URL", + "type": "STRING" + }, + { + "description": "Date of launch.", + "mode": "NULLABLE", + "name": "RESOURCE_LAUNCH", + "type": "DATE" + }, + { + "description": "Author of the book.", + "mode": "NULLABLE", + "name": "AUTHOR", + "type": "STRING" + }, + { + "description": "Title of the chapter.", + "mode": "NULLABLE", + "name": "FULLTEXT_TITLE", + "type": "STRING" + }, + { + "description": "URL of the chapter.", + "mode": "NULLABLE", + "name": "FULLTEXT_URL", + "type": "STRING" + }, + { + "description": "Date of fulltext launch.", + "mode": "NULLABLE", + "name": "FULLTEXT_LAUNCH", + "type": "DATE" + }, + { + "description": "Issue.", + "mode": "NULLABLE", + "name": "ISSUE", + "type": "STRING" + }, + { + "description": "Format.", + "mode": "NULLABLE", + "name": "FORMAT", + "type": "STRING" + }, + { + "description": "Access type.", + "mode": "NULLABLE", + "name": "ACCESS", + "type": "STRING" + }, + { + "description": "Country Name.", + "mode": "NULLABLE", + "name": "COUNTRY", + "type": "STRING" + }, + { + "description": "Institution name.", + "mode": "NULLABLE", + "name": "INSTITUTION", + "type": "STRING" + }, + { + "description": "Number of requests.", + "mode": "NULLABLE", + "name": "REQUESTS", + "type": "INTEGER" + }, + { + "mode": "REQUIRED", + "name": "release_date", + "type": "DATE", + "description": "Last day of the release month. Table is partitioned on this column." + } +] diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_altmetrics_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_altmetrics_pilot.sql new file mode 100644 index 00000000..9157c26c --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_altmetrics_pilot.sql @@ -0,0 +1 @@ +STRUCT(month.altmetrics_pilot.value) AS altmetrics_pilot diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_amazon_ltd_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_amazon_ltd_pilot.sql new file mode 100644 index 00000000..94747caa --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_amazon_ltd_pilot.sql @@ -0,0 +1 @@ +STRUCT(month.amazon_ltd_pilot.value) AS amazon_ltd_pilot diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_amazon_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_amazon_pilot.sql new file mode 100644 index 00000000..79ec1fd4 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_amazon_pilot.sql @@ -0,0 +1 @@ +STRUCT(month.amazon_pilot.value) AS amazon_pilot diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_author_jstor_country_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_author_jstor_country_pilot.sql new file mode 100644 index 00000000..541da542 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_author_jstor_country_pilot.sql @@ -0,0 +1 @@ +STRUCT(group_counts(ARRAY_CONCAT_AGG(month.jstor_country_pilot)) AS Total_Item_Requests) AS jstor_country_pilot diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_author_muse_country_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_author_muse_country_pilot.sql new file mode 100644 index 00000000..3c913b1c --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_author_muse_country_pilot.sql @@ -0,0 +1 @@ +STRUCT(group_counts(ARRAY_CONCAT_AGG(month.muse_country_pilot)) AS Total_Item_Requests) AS muse_country_pilot diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_country_body_jstor_country_pilot.sql.jinja2 b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_country_body_jstor_country_pilot.sql.jinja2 new file mode 100644 index 00000000..8a61dcdb --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_country_body_jstor_country_pilot.sql.jinja2 @@ -0,0 +1,11 @@ +jstor_country_pilot_month_country as ( + SELECT + ISBN13, + month, + Country_name as alpha2, + -- for jstor collections, country_name is the full name not the alpha2 code. This is accounted for in the join to month_country.country_jstor_name instead of month_country.alpha2 + total_item_requests + FROM + months, + UNNEST(jstor_country_pilot) +) diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_country_body_muse_country_pilot.sql.jinja2 b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_country_body_muse_country_pilot.sql.jinja2 new file mode 100644 index 00000000..679571a2 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_country_body_muse_country_pilot.sql.jinja2 @@ -0,0 +1,10 @@ +muse_month_country as ( + SELECT + ISBN13, + month, + Country_name as alpha2, + total_item_requests + FROM + months, + UNNEST(muse_country_pilot) +) diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_country_join_jstor_country_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_country_join_jstor_country_pilot.sql new file mode 100644 index 00000000..18d33a0f --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_country_join_jstor_country_pilot.sql @@ -0,0 +1,6 @@ +LEFT JOIN jstor_country_pilot_month_country as jstor_country_pilot ON month_country.ISBN13 = jstor_country_pilot.ISBN13 +AND month_country.month = jstor_country_pilot.month +AND ( + month_country.alpha2 = jstor_country_pilot.alpha2 + OR month_country.country_jstor_name = jstor_country_pilot.alpha2 +) diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_country_join_muse_country_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_country_join_muse_country_pilot.sql new file mode 100644 index 00000000..752448b0 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_country_join_muse_country_pilot.sql @@ -0,0 +1,4 @@ +-- for muse collections, the alpha2 value is the full country name and needs to be matched on country_muse_name +LEFT JOIN muse_month_country as muse_country_pilot ON month_country.ISBN13 = muse_country_pilot.ISBN13 +AND month_country.month = muse_country_pilot.month +AND month_country.alpha2 = muse_country_pilot.alpha2 diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_country_null_jstor_country_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_country_null_jstor_country_pilot.sql new file mode 100644 index 00000000..db0f4aa0 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_country_null_jstor_country_pilot.sql @@ -0,0 +1 @@ +jstor_country_pilot.total_item_requests IS NOT NULL diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_country_null_muse_country_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_country_null_muse_country_pilot.sql new file mode 100644 index 00000000..63e37f23 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_country_null_muse_country_pilot.sql @@ -0,0 +1 @@ +muse_country_pilot.total_item_requests IS NOT NULL diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_country_struct_jstor_country_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_country_struct_jstor_country_pilot.sql new file mode 100644 index 00000000..7e2e2af8 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_country_struct_jstor_country_pilot.sql @@ -0,0 +1 @@ +STRUCT(jstor_country_pilot.Total_Item_Requests) as jstor_country_pilot diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_country_struct_muse_country_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_country_struct_muse_country_pilot.sql new file mode 100644 index 00000000..03242974 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_country_struct_muse_country_pilot.sql @@ -0,0 +1 @@ +STRUCT(muse_country_pilot.Total_Item_Requests) as muse_country_pilot diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_internet_archive_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_internet_archive_pilot.sql new file mode 100644 index 00000000..99ed8571 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_internet_archive_pilot.sql @@ -0,0 +1 @@ +STRUCT(month.internet_archive_pilot.value) AS internet_archive_pilot diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_jstor_country_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_jstor_country_pilot.sql new file mode 100644 index 00000000..767b540d --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_jstor_country_pilot.sql @@ -0,0 +1,8 @@ +STRUCT( + ( + SELECT + SUM(Total_Item_Requests) + FROM + UNNEST(month.jstor_country_pilot) + ) AS Total_Item_Requests +) AS jstor_country_pilot diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_muse_country_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_muse_country_pilot.sql new file mode 100644 index 00000000..1a21aa3d --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_muse_country_pilot.sql @@ -0,0 +1,8 @@ +STRUCT( + ( + SELECT + SUM(Total_Item_Requests) + FROM + UNNEST(month.muse_country_pilot) + ) AS Total_Item_Requests +) AS muse \ No newline at end of file diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_muse_metrics_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_muse_metrics_pilot.sql new file mode 100644 index 00000000..4dd3db40 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_muse_metrics_pilot.sql @@ -0,0 +1 @@ +STRUCT(month.muse_metrics_pilot.value) AS muse_metrics_pilot diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_oapen_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_oapen_pilot.sql new file mode 100644 index 00000000..c4b53162 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_oapen_pilot.sql @@ -0,0 +1 @@ +STRUCT(month.oapen_pilot.value) AS oapen_pilot diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_scholarcommons_ltd_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_scholarcommons_ltd_pilot.sql new file mode 100644 index 00000000..1e0def88 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_scholarcommons_ltd_pilot.sql @@ -0,0 +1 @@ +STRUCT(month.scholarcommons_ltd_pilot.value) AS scholarcommons_ltd_pilot diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_scholarspace_downloads_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_scholarspace_downloads_pilot.sql new file mode 100644 index 00000000..a536857f --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_scholarspace_downloads_pilot.sql @@ -0,0 +1 @@ +STRUCT(month.scholarspace_downloads_pilot.value) AS scholarspace_downloads_pilot diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_scholarspace_views_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_scholarspace_views_pilot.sql new file mode 100644 index 00000000..5e9b75aa --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_scholarspace_views_pilot.sql @@ -0,0 +1 @@ +STRUCT(month.scholarspace_views_pilot.value) AS scholarspace_views_pilot diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_sci_open_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_sci_open_pilot.sql new file mode 100644 index 00000000..c0af11c3 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_metrics_sci_open_pilot.sql @@ -0,0 +1 @@ +STRUCT(month.sci_open_pilot.value) AS sci_open_pilot diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_altmetrics_pilot.sql.jinja2 b/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_altmetrics_pilot.sql.jinja2 new file mode 100644 index 00000000..ab437492 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_altmetrics_pilot.sql.jinja2 @@ -0,0 +1,11 @@ +altmetrics_pilot_metrics as ( + SELECT + ISBN13, + release_date, + STRUCT(SUM(value) as value) as metrics, + FROM + `{{ altmetrics_pilot_table_id }}` + GROUP BY + ISBN13, + release_date +) diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_amazon_ltd_pilot.sql.jinja2 b/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_amazon_ltd_pilot.sql.jinja2 new file mode 100644 index 00000000..4191f603 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_amazon_ltd_pilot.sql.jinja2 @@ -0,0 +1,11 @@ +amazon_ltd_pilot_metrics as ( + SELECT + ISBN13, + release_date, + STRUCT(SUM(value) as value) as metrics, + FROM + `{{ amazon_ltd_pilot_table_id }}` + GROUP BY + ISBN13, + release_date +) diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_amazon_pilot.sql.jinja2 b/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_amazon_pilot.sql.jinja2 new file mode 100644 index 00000000..3505976f --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_amazon_pilot.sql.jinja2 @@ -0,0 +1,11 @@ +amazon_pilot_metrics as ( + SELECT + ISBN13, + release_date, + STRUCT(SUM(value) as value) as metrics, + FROM + `{{ amazon_pilot_table_id }}` + GROUP BY + ISBN13, + release_date +) diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_internet_archive_pilot.sql.jinja2 b/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_internet_archive_pilot.sql.jinja2 new file mode 100644 index 00000000..3a034207 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_internet_archive_pilot.sql.jinja2 @@ -0,0 +1,11 @@ +internet_archive_pilot_metrics as ( + SELECT + ISBN13, + release_date, + STRUCT(SUM(value) as value) as metrics, + FROM + `{{ internet_archive_pilot_table_id }}` + GROUP BY + ISBN13, + release_date +) diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_jstor_country_pilot.sql.jinja2 b/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_jstor_country_pilot.sql.jinja2 new file mode 100644 index 00000000..c0ce74f3 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_jstor_country_pilot.sql.jinja2 @@ -0,0 +1,34 @@ +# The purpose of this block of SQL is to organise the metrics from JSTOR country for easier consumption of downstream queries. +# Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'jstor_country_pilot_table_id'. +jstor_country_pilot_metrics as ( + SELECT + eISBN as ISBN13, + release_date, + group_items_jstor_country_pilot( + ARRAY_AGG(STRUCT(Country_name, Total_Item_Requests)) + ) as metrics + FROM + `{{ jstor_country_pilot_table_id }}` + GROUP BY + eISBN, + release_date +), + +# The purpose of this block of SQL is to organise the Metadata from JSTOR country for easier consumption of downstream queries. +# Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'jstor_country_pilot_table_id'. +jstor_country_pilot_metadata as ( + SELECT + eISBN as ISBN13, + MAX(Book_Title) as Book_Title, + MAX(Book_ID) as Book_ID, + MAX(Authors) as Authors, + MAX(ISBN) as ISBN, + eISBN, + MAX(Copyright_Year) as Copyright_Year, + MAX(Disciplines) as Disciplines, + MAX(Usage_Type) as Usage_Type + FROM + `{{ jstor_country_pilot_table_id }}` + GROUP BY + eISBN +) \ No newline at end of file diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_jstor_institution_pilot.sql.jinja2 b/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_jstor_institution_pilot.sql.jinja2 new file mode 100644 index 00000000..ad54c742 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_jstor_institution_pilot.sql.jinja2 @@ -0,0 +1,34 @@ +# The purpose of this block of SQL is to organise the metrics from JSTOR institution for easier consumption of downstream queries. +# Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'jstor_institution_pilot_table_id'. +jstor_institution_pilot_metrics as ( + SELECT + eISBN as ISBN13, + release_date, + group_items_jstor_institution_pilot( + ARRAY_AGG(STRUCT(Institution, Total_Item_Requests)) + ) as metrics + FROM + `{{ jstor_institution_pilot_table_id }}` + GROUP BY + eISBN, + release_date +), + +# The purpose of this block of SQL is to organise the Metadata from JSTOR institution for easier consumption of downstream queries. +# Defined in the create_oaebu_book_product_table method, in onix_workflow.py, is the value of 'jstor_institution_pilot_table_id'. +jstor_institution_pilot_metadata as ( + SELECT + eISBN as ISBN13, + MAX(Book_Title) as Book_Title, + MAX(Book_ID) as Book_ID, + MAX(Authors) as Authors, + MAX(ISBN) as ISBN, + eISBN, + MAX(Copyright_Year) as Copyright_Year, + MAX(Disciplines) as Disciplines, + MAX(Usage_Type) as Usage_Type + FROM + `{{ jstor_institution_pilot_table_id }}` + GROUP BY + eISBN +) \ No newline at end of file diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_muse_country_pilot.sql.jinja2 b/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_muse_country_pilot.sql.jinja2 new file mode 100644 index 00000000..9692f774 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_muse_country_pilot.sql.jinja2 @@ -0,0 +1,25 @@ +muse_country_pilot_metrics as ( + SELECT + ISBN as ISBN13, + release_date, + group_items_muse_country_pilot( + ARRAY_AGG(STRUCT(COUNTRY, REQUESTS)) + ) as metrics + FROM + `{{ muse_country_pilot_table_id }}` + GROUP BY + ISBN13, + release_date +), + +muse_country_pilot_metadata as ( + SELECT + ISBN as ISBN13, + MAX(RESOURCE) as Book_Title, + MAX(AUTHOR) as Author, + MAX(Access) as Access_Type + FROM + `{{ muse_country_pilot_table_id }}` + GROUP BY + ISBN13 +) diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_muse_metrics_pilot.sql.jinja2 b/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_muse_metrics_pilot.sql.jinja2 new file mode 100644 index 00000000..97833598 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_muse_metrics_pilot.sql.jinja2 @@ -0,0 +1,11 @@ +muse_metrics_pilot_metrics as ( + SELECT + ISBN13, + release_date, + STRUCT(SUM(value) as value) as metrics, + FROM + `{{ muse_metrics_pilot_table_id }}` + GROUP BY + ISBN13, + release_date +) diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_oapen_pilot.sql.jinja2 b/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_oapen_pilot.sql.jinja2 new file mode 100644 index 00000000..5c139ade --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_oapen_pilot.sql.jinja2 @@ -0,0 +1,11 @@ +oapen_pilot_metrics as ( + SELECT + ISBN13, + release_date, + STRUCT(SUM(value) as value) as metrics, + FROM + `{{ oapen_pilot_table_id }}` + GROUP BY + ISBN13, + release_date +) diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_scholarcommons_ltd_pilot.sql.jinja2 b/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_scholarcommons_ltd_pilot.sql.jinja2 new file mode 100644 index 00000000..94197d22 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_scholarcommons_ltd_pilot.sql.jinja2 @@ -0,0 +1,11 @@ +scholarcommons_ltd_pilot_metrics as ( + SELECT + ISBN13, + release_date, + STRUCT(SUM(value) as value) as metrics, + FROM + `{{ scholarcommons_ltd_pilot_table_id }}` + GROUP BY + ISBN13, + release_date +) diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_scholarspace_downloads_pilot.sql.jinja2 b/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_scholarspace_downloads_pilot.sql.jinja2 new file mode 100644 index 00000000..c9b5a122 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_scholarspace_downloads_pilot.sql.jinja2 @@ -0,0 +1,11 @@ +scholarspace_downloads_pilot_metrics as ( + SELECT + ISBN13, + release_date, + STRUCT(SUM(value) as value) as metrics, + FROM + `{{ scholarspace_downloads_pilot_table_id }}` + GROUP BY + ISBN13, + release_date +) diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_scholarspace_views_pilot.sql.jinja2 b/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_scholarspace_views_pilot.sql.jinja2 new file mode 100644 index 00000000..59e6d303 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_scholarspace_views_pilot.sql.jinja2 @@ -0,0 +1,11 @@ +scholarspace_views_pilot_metrics as ( + SELECT + ISBN13, + release_date, + STRUCT(SUM(value) as value) as metrics, + FROM + `{{ scholarspace_views_pilot_table_id }}` + GROUP BY + ISBN13, + release_date +) diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_sci_open_pilot.sql.jinja2 b/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_sci_open_pilot.sql.jinja2 new file mode 100644 index 00000000..96023ae5 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_product_body_sci_open_pilot.sql.jinja2 @@ -0,0 +1,11 @@ +sci_open_pilot_metrics as ( + SELECT + ISBN13, + release_date, + STRUCT(SUM(value) as value) as metrics, + FROM + `{{ sci_open_pilot_table_id }}` + GROUP BY + ISBN13, + release_date +) diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_product_functions_jstor_country_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/book_product_functions_jstor_country_pilot.sql new file mode 100644 index 00000000..f395e448 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_product_functions_jstor_country_pilot.sql @@ -0,0 +1,19 @@ +-- Output Schema: +-- Country_name STRING NULLABLE +-- Total_Item_Requests INTEGER NULLABLE +CREATE TEMP FUNCTION group_items_jstor_country_pilot( + items ARRAY < STRUCT < Country_name STRING, + Total_Item_Requests INT64 > > +) as ( + ARRAY( + ( + SELECT + AS STRUCT Country_name, + SUM(Total_Item_Requests) as Total_Item_Requests, + FROM + UNNEST(items) + GROUP BY + Country_name + ) + ) +); \ No newline at end of file diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_product_functions_jstor_institution_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/book_product_functions_jstor_institution_pilot.sql new file mode 100644 index 00000000..c84cd036 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_product_functions_jstor_institution_pilot.sql @@ -0,0 +1,19 @@ +-- Output Schema: +-- Institution STRING NULLABLE +-- Total_Item_Requests INTEGER NULLABLE +CREATE TEMP FUNCTION group_items_jstor_institution_pilot( + items ARRAY < STRUCT < Institution STRING, + Total_Item_Requests INT64 > > +) as ( + ARRAY( + ( + SELECT + AS STRUCT Institution, + SUM(Total_Item_Requests) as Total_Item_Requests, + FROM + UNNEST(items) + GROUP BY + Institution + ) + ) +); \ No newline at end of file diff --git a/dags/oaebu_workflows/sql/shmp_pilot/book_product_functions_muse_country_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/book_product_functions_muse_country_pilot.sql new file mode 100644 index 00000000..f3821eb5 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/book_product_functions_muse_country_pilot.sql @@ -0,0 +1,19 @@ +-- Output Schema: +-- Country_name STRING NULLABLE +-- Total_Item_Requests INTEGER NULLABLE +CREATE TEMP FUNCTION group_items_muse_country_pilot( + items ARRAY < STRUCT < Country_name STRING, + Total_Item_Requests INT64 > > +) as ( + ARRAY( + ( + SELECT + AS STRUCT Country_name, + SUM(Total_Item_Requests) as Total_Item_Requests, + FROM + UNNEST(items) + GROUP BY + Country_name + ) + ) +); diff --git a/dags/oaebu_workflows/sql/shmp_pilot/month_metrics_sum_jstor_country_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/month_metrics_sum_jstor_country_pilot.sql new file mode 100644 index 00000000..3cd66481 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/month_metrics_sum_jstor_country_pilot.sql @@ -0,0 +1,3 @@ +STRUCT( + group_counts(ARRAY_CONCAT_AGG(month.jstor_country_pilot)) as Total_Item_Requests +) as jstor_country_pilot diff --git a/dags/oaebu_workflows/sql/shmp_pilot/month_metrics_sum_muse_country_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/month_metrics_sum_muse_country_pilot.sql new file mode 100644 index 00000000..499d0776 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/month_metrics_sum_muse_country_pilot.sql @@ -0,0 +1,3 @@ +STRUCT( + group_counts(ARRAY_CONCAT_AGG(month.muse_country_pilot)) as Total_Item_Requests +) as muse_country_pilot diff --git a/dags/oaebu_workflows/sql/shmp_pilot/month_null_altmetrics_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/month_null_altmetrics_pilot.sql new file mode 100644 index 00000000..7bab3c74 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/month_null_altmetrics_pilot.sql @@ -0,0 +1 @@ +month.altmetrics_pilot IS NOT NULL diff --git a/dags/oaebu_workflows/sql/shmp_pilot/month_null_amazon_ltd_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/month_null_amazon_ltd_pilot.sql new file mode 100644 index 00000000..8ad2b649 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/month_null_amazon_ltd_pilot.sql @@ -0,0 +1 @@ +month.amazon_ltd_pilot IS NOT NULL diff --git a/dags/oaebu_workflows/sql/shmp_pilot/month_null_amazon_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/month_null_amazon_pilot.sql new file mode 100644 index 00000000..fd1281c7 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/month_null_amazon_pilot.sql @@ -0,0 +1 @@ +month.amazon_pilot IS NOT NULL diff --git a/dags/oaebu_workflows/sql/shmp_pilot/month_null_internet_archive_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/month_null_internet_archive_pilot.sql new file mode 100644 index 00000000..60bb9571 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/month_null_internet_archive_pilot.sql @@ -0,0 +1 @@ +month.internet_archive_pilot IS NOT NULL diff --git a/dags/oaebu_workflows/sql/shmp_pilot/month_null_jstor_country_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/month_null_jstor_country_pilot.sql new file mode 100644 index 00000000..7a856197 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/month_null_jstor_country_pilot.sql @@ -0,0 +1 @@ +ARRAY_LENGTH(month.jstor_country_pilot) > 0 \ No newline at end of file diff --git a/dags/oaebu_workflows/sql/shmp_pilot/month_null_muse_country_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/month_null_muse_country_pilot.sql new file mode 100644 index 00000000..8c0520fd --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/month_null_muse_country_pilot.sql @@ -0,0 +1 @@ +ARRAY_LENGTH(month.muse_country_pilot) > 0 \ No newline at end of file diff --git a/dags/oaebu_workflows/sql/shmp_pilot/month_null_muse_metrics_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/month_null_muse_metrics_pilot.sql new file mode 100644 index 00000000..15cdf7df --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/month_null_muse_metrics_pilot.sql @@ -0,0 +1 @@ +month.muse_metrics_pilot IS NOT NULL diff --git a/dags/oaebu_workflows/sql/shmp_pilot/month_null_oapen_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/month_null_oapen_pilot.sql new file mode 100644 index 00000000..54922195 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/month_null_oapen_pilot.sql @@ -0,0 +1 @@ +month.oapen_pilot IS NOT NULL diff --git a/dags/oaebu_workflows/sql/shmp_pilot/month_null_scholarcommons_ltd_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/month_null_scholarcommons_ltd_pilot.sql new file mode 100644 index 00000000..1e2e618e --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/month_null_scholarcommons_ltd_pilot.sql @@ -0,0 +1 @@ +month.scholarcommons_ltd_pilot is not NULL diff --git a/dags/oaebu_workflows/sql/shmp_pilot/month_null_scholarspace_downloads_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/month_null_scholarspace_downloads_pilot.sql new file mode 100644 index 00000000..958b2c61 --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/month_null_scholarspace_downloads_pilot.sql @@ -0,0 +1 @@ +month.scholarspace_downloads_pilot IS NOT NULL diff --git a/dags/oaebu_workflows/sql/shmp_pilot/month_null_scholarspace_views_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/month_null_scholarspace_views_pilot.sql new file mode 100644 index 00000000..f441d1ea --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/month_null_scholarspace_views_pilot.sql @@ -0,0 +1 @@ +month.scholarspace_views_pilot IS NOT NULL diff --git a/dags/oaebu_workflows/sql/shmp_pilot/month_null_sci_open_pilot.sql b/dags/oaebu_workflows/sql/shmp_pilot/month_null_sci_open_pilot.sql new file mode 100644 index 00000000..0c2a8d6f --- /dev/null +++ b/dags/oaebu_workflows/sql/shmp_pilot/month_null_sci_open_pilot.sql @@ -0,0 +1 @@ +month.sci_open_pilot IS NOT NULL diff --git a/tests/test_oaebu_partners.py b/tests/test_oaebu_partners.py index 2850812f..8798b790 100644 --- a/tests/test_oaebu_partners.py +++ b/tests/test_oaebu_partners.py @@ -22,6 +22,7 @@ OaebuPartner, DataPartner, partner_from_str, + create_bespoke_data_partner, OAEBU_DATA_PARTNERS, OAEBU_METADATA_PARTNERS, ) @@ -42,7 +43,7 @@ export_book_metrics=True, export_country=True, export_subject=True, - has_metdata=True, + has_metadata=True, ) } MOCK_METADATA_PARTNERS = { @@ -83,3 +84,63 @@ def test_invalid_partner_name_string(self): # Call the function with a valid metadata partner but without the flag with self.assertRaisesRegex(KeyError, "Partner not found: md_partner"): partner_from_str("md_partner", metadata_partner=False) + + +class TestBespokePartner(unittest.TestCase): + def test_valid_case(self): + """Test that no errors are raised when a valid input is supplied""" + input = { + "type_id": "type_id", + "bq_dataset_id": "bq_dataset_id", + "bq_table_name": "bq_table_name", + "isbn_field_name": "isbn_field_name", + "title_field_name": "title_field_name", + "sharded": True, + "schema_path": "schema_path", + "schema_directory": "schema_directory", + "sql_directory": "sql_directory", + "book_product_functions": "book_product_functions", + "export_author": False, + "export_book_metrics": False, + "export_country": False, + "export_subject": False, + "has_metadata": True, + } + expected_output = DataPartner(**input) + actual_output = create_bespoke_data_partner(input) + self.assertEqual(expected_output, actual_output) + + def test_missing_required(self): + """Test that an error is raised when an input is missing a required parameter""" + input = { + "export_author": False, + "export_book_metrics": False, + "export_country": False, + "export_subject": False, + "has_metadata": True, + } + with self.assertRaisesRegex(NameError, "Missing required parameters"): + create_bespoke_data_partner(input) + + def test_unrecognised_input(self): + """Test that an error is raised when an unrecognised parameter is supplied""" + input = { + "type_id": "type_id", + "bq_dataset_id": "bq_dataset_id", + "bq_table_name": "bq_table_name", + "isbn_field_name": "isbn_field_name", + "title_field_name": "title_field_name", + "sharded": True, + "schema_path": "schema_path", + "schema_directory": "schema_directory", + "sql_directory": "sql_directory", + "book_product_functions": "book_product_functions", + "export_author": False, + "export_book_metrics": False, + "export_country": False, + "export_subject": False, + "has_metadata": True, + "unrecognised_argument": "foo", + } + with self.assertRaisesRegex(NameError, "Unrecognised arguments supplied"): + create_bespoke_data_partner(input)