diff --git a/processes/ingest/publisher_backlist.py b/processes/ingest/publisher_backlist.py
index 5cff6522967..713dcb78506 100644
--- a/processes/ingest/publisher_backlist.py
+++ b/processes/ingest/publisher_backlist.py
@@ -1,5 +1,5 @@
 import os
-from ..util import airtable_integration
+from services import PublisherBacklistService
 
 from ..core import CoreProcess
 from logger import create_log
@@ -10,24 +10,39 @@ class PublisherBacklistProcess(CoreProcess):
     def __init__(self, *args):
         super(PublisherBacklistProcess, self).__init__(*args[:4])
 
-        self.ingest_offset = int(args[5] or 0)
-        self.ingest_limit = (int(args[4]) + self.ingestOffset) if args[4] else 5000
-        self.full_import = self.process == 'complete'
-
-        self.generateEngine()
-        self.createSession()
+        self.limit = int(args[4]) if len(args) >= 5 and args[4] and int(args[4]) <= 100 else None
+        self.offset = int(args[5]) if len(args) >= 6 and args[5] else None
 
         self.s3_bucket = os.environ['FILE_BUCKET']
         self.createS3Client()
 
+        self.publisher_backlist_service = PublisherBacklistService()
+
     def runProcess(self):
         try:
-
-            response = airtable_integration.create_airtable_request()
+            self.generateEngine()
+            self.createSession()
+
+            if self.process == 'daily':
+                records = self.publisher_backlist_service.get_records(offset=self.offset, limit=self.limit)
+            elif self.process == 'complete':
+                records = self.publisher_backlist_service.get_records(full_import=True)
+            elif self.process == 'custom':
+                records = self.publisher_backlist_service.get_records(start_timestamp=self.ingestPeriod, offset=self.offset, limit=self.limit)
+            else:
+                logger.warning(f'Unknown Publisher Backlist ingestion process type {self.process}')
+                return
+
+            for record in records:
+                self.addDCDWToUpdateList(record)
 
-            print(response)
+            self.saveRecords()
+            self.commitChanges()
+
+            logger.info(f'Ingested {len(self.records)} Publisher Backlist records')
 
         except Exception as e:
-            logger.exception('Failed to run Pub Backlist process')
-            raise e
-        
\ No newline at end of file
+            logger.exception('Failed to run Publisher Backlist process')
+            raise e
+        finally:
+            self.close_connection()
diff --git a/processes/util/airtable_integration.py b/processes/util/airtable_integration.py
deleted file mode 100644
index b3ceedebc9f..00000000000
--- a/processes/util/airtable_integration.py
+++ /dev/null
@@ -1,17 +0,0 @@
-import os
-import requests
-
-def create_authorization_header():
-    bearer_token = os.environ.get("AIRTABLE_KEY")
-
-    headers = {"Authorization": f"Bearer {bearer_token}"}
-
-    return headers
-
-def create_airtable_request():
-
-    headers = create_authorization_header()
-
-    response = requests.get('https://api.airtable.com/v0/appBoLf4lMofecGPU/Publisher%20Backlists%20%26%20Collections%20%F0%9F%93%96?view=UofMichigan%20Press&maxRecords=3', headers=headers)
-
-    return response.json()
diff --git a/services/__init__.py b/services/__init__.py
index 9fdc81fc14b..4b3e239bce6 100644
--- a/services/__init__.py
+++ b/services/__init__.py
@@ -1 +1,2 @@
 from .sources.nypl_bib_service import NYPLBibService
+from .sources.publisher_backlist_service import PublisherBacklistService
diff --git a/services/sources/publisher_backlist_service.py b/services/sources/publisher_backlist_service.py
new file mode 100644
index 00000000000..74e5c82326f
--- /dev/null
+++ b/services/sources/publisher_backlist_service.py
@@ -0,0 +1,89 @@
+from datetime import datetime, timedelta, timezone
+import os
+import requests
+import json
+import urllib.parse
+from typing import Optional, Dict
+
+from logger import create_log
+from mappings.UofM import UofMMapping
+from .source_service import SourceService
+
+logger = create_log(__name__)
+
+BASE_URL = "https://api.airtable.com/v0/appBoLf4lMofecGPU/Publisher%20Backlists%20%26%20Collections%20%F0%9F%93%96?view=All%20Lists"
+
+class PublisherBacklistService(SourceService):
+    def __init__(self):
+        self.airtable_auth_token = os.environ.get('AIRTABLE_KEY', None)
+
+    def get_records(
+        self,
+        full_import: bool=False,
+        start_timestamp: Optional[datetime]=None,
+        offset: Optional[int]=None,
+        limit: Optional[int]=None
+    ) -> list[UofMMapping]:
+        array_json_records = self.get_records_json(full_import, start_timestamp, offset, limit)
+        mapped_records = []
+
+        for json_dict in array_json_records:
+            for record_metadata in json_dict['records']:
+                try:
+                    record = UofMMapping(record_metadata)
+                    record.applyMapping()
+                    mapped_records.append(record)
+                except Exception:
+                    logger.exception('Failed to process Publisher Backlist record')
+
+        return mapped_records
+
+    def get_records_json(self,
+        full_import: bool=False,
+        start_timestamp: Optional[datetime]=None,
+        offset: Optional[int]=None,
+        limit: Optional[int]=None
+    ) -> list[dict]:
+        if limit is None:
+            limit = 100
+
+        if not full_import:
+            filter_by_formula = self.build_filter_by_formula_parameter(start_timestamp)
+
+            return self.get_records_array(limit, filter_by_formula)
+
+        return self.get_records_array(limit, filter_by_formula=None)
+
+    def build_filter_by_formula_parameter(self, start_timestamp: Optional[datetime]=None) -> str:
+        if not start_timestamp:
+            start_timestamp = datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(hours=24)
+
+        start_date_time_str = start_timestamp.strftime("%Y-%m-%d %H:%M:%S.%f")
+        start_date_time_encoded = urllib.parse.quote(start_date_time_str)
+        is_same_date_time_filter = f"IS_SAME(%7BLast%20Modified%7D,%20%22{start_date_time_encoded}%22)"
+        is_after_date_time_filter = f"IS_AFTER(%7BLast%20Modified%7D,%20%22{start_date_time_encoded}%22)"
+        filter_by_formula = f"OR({is_same_date_time_filter},%20{is_after_date_time_filter})"
+
+        return filter_by_formula
+
+    def get_records_array(self,
+        limit: Optional[int]=None,
+        filter_by_formula: Optional[str]=None,
+    ) -> list[dict]:
+        url = f'{BASE_URL}&pageSize={limit}'
+        headers = {"Authorization": f"Bearer {self.airtable_auth_token}"}
+
+        if filter_by_formula:
+            url += f'&filterByFormula={filter_by_formula}'
+
+        pub_backlist_records_response = requests.get(url, headers=headers)
+        pub_backlist_records_response_json = pub_backlist_records_response.json()
+        array_json = [pub_backlist_records_response_json]
+
+        while 'offset' in pub_backlist_records_response_json:
+            next_page_url = url + f"&offset={pub_backlist_records_response_json['offset']}"
+            pub_backlist_records_response = requests.get(next_page_url, headers=headers)
+            pub_backlist_records_response_json = pub_backlist_records_response.json()
+            array_json.append(pub_backlist_records_response_json)
+
+        return array_json
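For context, the filterByFormula built above decodes to OR(IS_SAME({Last Modified}, "<timestamp>"), IS_AFTER({Last Modified}, "<timestamp>")), and the paging loop relies on Airtable returning an offset token alongside each page of records. A minimal sketch of exercising the new service directly is shown below; it assumes only that AIRTABLE_KEY is exported and that the repository root is on the import path.

```python
# Illustrative sketch only; not part of the change set above.
# Assumes AIRTABLE_KEY is set and the repo root is on PYTHONPATH.
from datetime import datetime, timedelta, timezone

from services import PublisherBacklistService

service = PublisherBacklistService()

# Pull records whose Airtable "Last Modified" field falls within the last week.
# get_records returns UofMMapping objects with applyMapping() already applied.
records = service.get_records(
    start_timestamp=datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(days=7),
    limit=100,  # Airtable's pageSize maximum
)

print(f'Fetched {len(records)} mapped Publisher Backlist records')
```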
diff --git a/tests/integration/services/sources/test_publisher_backlist_service.py b/tests/integration/services/sources/test_publisher_backlist_service.py
new file mode 100644
index 00000000000..b4938324175
--- /dev/null
+++ b/tests/integration/services/sources/test_publisher_backlist_service.py
@@ -0,0 +1,19 @@
+from datetime import datetime, timezone, timedelta
+import pytest
+
+from load_env import load_env_file
+from services import PublisherBacklistService
+
+class TestPublisherBacklistService:
+    @pytest.fixture
+    def test_instance(self):
+        load_env_file('local', file_string='config/local.yaml')
+        return PublisherBacklistService()
+
+    def test_get_records(self, test_instance: PublisherBacklistService):
+        records = test_instance.get_records(
+            start_timestamp=datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(days=7),
+            limit=100
+        )
+
+        assert records is not None
diff --git a/tests/unit/processes/ingest/test_pub_backlist_process.py b/tests/unit/processes/ingest/test_pub_backlist_process.py
new file mode 100644
index 00000000000..999f83905ec
--- /dev/null
+++ b/tests/unit/processes/ingest/test_pub_backlist_process.py
@@ -0,0 +1,106 @@
+import pytest
+
+from tests.helper import TestHelpers
+from processes import PublisherBacklistProcess
+
+
+class TestPublisherBacklistProcess:
+    @classmethod
+    def setup_class(cls):
+        TestHelpers.setEnvVars()
+
+    @classmethod
+    def teardown_class(cls):
+        TestHelpers.clearEnvVars()
+
+    @pytest.fixture
+    def test_instance(self, mocker) -> PublisherBacklistProcess:
+        class TestPublisherBacklistProcess(PublisherBacklistProcess):
+            def __init__(self, *args):
+                self.publisher_backlist_service = mocker.MagicMock()
+                self.offset = None
+                self.limit = None
+                self.ingestPeriod = None
+                self.records = []
+
+        return TestPublisherBacklistProcess('TestProcess', 'testFile', 'testDate')
+
+    @pytest.fixture
+    def mocks(self, mocker):
+        return {
+            'generate_engine': mocker.patch.object(PublisherBacklistProcess, 'generateEngine'),
+            'create_session': mocker.patch.object(PublisherBacklistProcess, 'createSession'),
+            'save': mocker.patch.object(PublisherBacklistProcess, 'saveRecords'),
+            'commit': mocker.patch.object(PublisherBacklistProcess, 'commitChanges'),
+            'close': mocker.patch.object(PublisherBacklistProcess, 'close_connection'),
+            'add_record': mocker.patch.object(PublisherBacklistProcess, 'addDCDWToUpdateList')
+        }
+
+    @pytest.fixture
+    def record_mappings(self, mocker):
+        return [mocker.MagicMock()]
+
+    def assert_common_expectations(self, mocks):
+        mocks['generate_engine'].assert_called_once()
+        mocks['create_session'].assert_called_once()
+        mocks['save'].assert_called_once()
+        mocks['commit'].assert_called_once()
+        mocks['close'].assert_called_once()
+
+    def test_runProcess_daily(self, test_instance: PublisherBacklistProcess, record_mappings, mocks):
+        test_instance.publisher_backlist_service.get_records.return_value = record_mappings
+
+        test_instance.process = 'daily'
+        test_instance.runProcess()
+
+        test_instance.publisher_backlist_service.get_records.assert_called_once_with(offset=None, limit=None)
+        self.assert_common_expectations(mocks)
+        assert mocks['add_record'].call_count == len(record_mappings)
+
+    def test_runProcess_complete(self, test_instance: PublisherBacklistProcess, record_mappings, mocks):
+        test_instance.publisher_backlist_service.get_records.return_value = record_mappings
+
+        test_instance.process = 'complete'
+        test_instance.runProcess()
+
+        test_instance.publisher_backlist_service.get_records.assert_called_once_with(full_import=True)
+        self.assert_common_expectations(mocks)
+        assert mocks['add_record'].call_count == len(record_mappings)
+
+    def test_runProcess_custom(self, test_instance: PublisherBacklistProcess, record_mappings, mocks):
+        test_instance.publisher_backlist_service.get_records.return_value = record_mappings
+
+        test_instance.process = 'custom'
+        test_instance.runProcess()
+
+        test_instance.publisher_backlist_service.get_records.assert_called_once_with(start_timestamp=None, offset=None, limit=None)
+        self.assert_common_expectations(mocks)
+        assert mocks['add_record'].call_count == len(record_mappings)
+
+    def test_runProcess_unknown(self, test_instance: PublisherBacklistProcess, mocks):
+        test_instance.process = 'unknown'
+        test_instance.runProcess()
+
+        test_instance.publisher_backlist_service.get_records.assert_not_called()
+
+        mocks['generate_engine'].assert_called_once()
+        mocks['create_session'].assert_called_once()
+        mocks['commit'].assert_not_called()
+        mocks['save'].assert_not_called()
+        mocks['add_record'].assert_not_called()
+        mocks['close'].assert_called_once()
+
+    def test_runProcess_error(self, test_instance: PublisherBacklistProcess, mocks):
+        test_instance.publisher_backlist_service.get_records.side_effect = Exception()
+
+        test_instance.process = 'daily'
+
+        with pytest.raises(Exception):
+            test_instance.runProcess()
+
+        mocks['generate_engine'].assert_called_once()
+        mocks['create_session'].assert_called_once()
+        mocks['commit'].assert_not_called()
+        mocks['save'].assert_not_called()
+        mocks['add_record'].assert_not_called()
+        mocks['close'].assert_called_once()
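Finally, a rough sketch of how the reworked process might be driven end to end. The meaning of the first four positional arguments is defined by CoreProcess, which is not part of this diff, so the placeholder values below are assumptions; the run also presupposes that FILE_BUCKET and the database/S3 environment expected by generateEngine, createSession, and createS3Client are configured.

```python
# Hypothetical invocation; the real entry point that constructs ingest
# processes is outside this diff. The first four args pass through to
# CoreProcess unchanged; args[4] is the optional limit (used as the
# Airtable pageSize, max 100) and args[5] is the optional offset.
from processes import PublisherBacklistProcess

# Daily ingest: records whose "Last Modified" falls in the last 24 hours,
# fetched in pages of 50.
daily = PublisherBacklistProcess('daily', None, None, None, 50, None)
daily.runProcess()

# Complete ingest: the full "All Lists" view, with no date filter applied.
complete = PublisherBacklistProcess('complete', None, None, None, None, None)
complete.runProcess()
```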