diff --git a/managers/s3.py b/managers/s3.py index 0715907e2d..075a6163bf 100644 --- a/managers/s3.py +++ b/managers/s3.py @@ -36,6 +36,12 @@ def createS3Bucket(self, bucketName, bucketPermissions): raise S3Error('Unable to create bucket in s3') + + def createManifestInS3(self, manifestPath, manifestJSON): + self.putObjectInBucket( + manifestJSON.encode('utf-8'), manifestPath, self.s3Bucket + ) + def putObjectInBucket( self, obj, objKey, bucket, bucketPermissions='public-read' ): diff --git a/mappings/chicagoISAC.py b/mappings/chicagoISAC.py index adf2cfe0fc..83abe33261 100644 --- a/mappings/chicagoISAC.py +++ b/mappings/chicagoISAC.py @@ -23,7 +23,6 @@ def createMapping(self): def applyFormatting(self): self.record.source = 'isac' - #Formatting for multiple isbns if ',' in self.record.identifiers[0]: identifierArray = self.record.identifiers[0].split(', ') updatedIdentifierArray = [] diff --git a/processes/ingest/chicago_isac.py b/processes/ingest/chicago_isac.py index f26f996b03..f8583cc71f 100644 --- a/processes/ingest/chicago_isac.py +++ b/processes/ingest/chicago_isac.py @@ -1,9 +1,7 @@ import json import os -from requests.exceptions import HTTPError, ConnectionError from ..core import CoreProcess -from mappings.core import MappingError from mappings.chicagoISAC import ChicagoISACMapping from managers import WebpubManifest from logger import createLog @@ -15,95 +13,89 @@ class ChicagoISACProcess(CoreProcess): def __init__(self, *args): super(ChicagoISACProcess, self).__init__(*args[:4]) - self.ingestOffset = int(args[5] or 0) - self.ingestLimit = (int(args[4]) + self.ingestOffset) if args[4] else 5000 - self.fullImport = self.process == 'complete' - - # Connect to database self.generateEngine() self.createSession() - # S3 Configuration self.s3Bucket = os.environ['FILE_BUCKET'] self.createS3Client() - def runProcess(self): + def runProcess(self): with open('ingestJSONFiles/chicagoISAC_metadata.json') as f: - chicagoISACData = json.load(f) + chicago_isac_data = json.load(f) - for metaDict in chicagoISACData: - self.processChicagoISACRecord(metaDict) + for meta_dict in chicago_isac_data: + self.process_chicago_isac_record(meta_dict) self.saveRecords() self.commitChanges() - def processChicagoISACRecord(self, record): + logger.info(f'Ingested {len(self.records)} ISAC records') + + def process_chicago_isac_record(self, record): try: - chicagoISACRec = ChicagoISACMapping(record) - chicagoISACRec.applyMapping() - self.storePDFManifest(chicagoISACRec.record) - self.addDCDWToUpdateList(chicagoISACRec) + chicago_isac_rec = ChicagoISACMapping(record) + chicago_isac_rec.applyMapping() + self.store_pdf_manifest(chicago_isac_rec.record) + self.addDCDWToUpdateList(chicago_isac_rec) - except (MappingError, HTTPError, ConnectionError, IndexError, TypeError) as e: - logger.exception(e) - logger.warn(ChicagoISACError('Unable to process ISAC record')) + except Exception: + logger.exception(ChicagoISACError('Unable to process ISAC record')) - def storePDFManifest(self, record): + def store_pdf_manifest(self, record): for link in record.has_part: - itemNo, uri, source, mediaType, flags = link[0].split('|') + item_no, uri, source, media_type, flags = link[0].split('|') - if mediaType == 'application/pdf': - recordID = record.identifiers[0].split('|')[0] + if media_type == 'application/pdf': + record_id = record.identifiers[0].split('|')[0] - manifestPath = 'manifests/{}/{}.json'.format(source, recordID) - manifestURI = 'https://{}.s3.amazonaws.com/{}'.format( - self.s3Bucket, manifestPath + manifest_path = 'manifests/{}/{}.json'.format(source, record_id) + manifest_url = 'https://{}.s3.amazonaws.com/{}'.format( + self.s3Bucket, manifest_path ) - manifestJSON = self.generateManifest(record, uri, manifestURI) + manifest_json = self.generate_manifest(record, uri, manifest_url) - self.createManifestInS3(manifestPath, manifestJSON) + self.createManifestInS3(manifest_path, manifest_json) - linkString = '|'.join([ - itemNo, - manifestURI, + link_string = '|'.join([ + item_no, + manifest_url, source, 'application/webpub+json', flags ]) - record.has_part.insert(0, linkString) - #The second element of the has_part array will always be an array of pdfURLS from the json file - #Those elements of the array are popped out and appended back into the has_part array as strings instead of an array - for i in range(1, len(record.has_part)): - if len(record.has_part[i]) > 1: - urlArray = record.has_part[i] - record.has_part.pop(i) - for elem in urlArray: - record.has_part.append(elem) - else: - record.has_part[i] = ''.join(record.has_part[i]) - break - def createManifestInS3(self, manifestPath, manifestJSON): - self.putObjectInBucket( - manifestJSON.encode('utf-8'), manifestPath, self.s3Bucket - ) + record.has_part.insert(0, link_string) + self.change_has_part_url_array_to_string(record) + + break + + @staticmethod + def change_has_part_url_array_to_string(record): + for i in range(1, len(record.has_part)): + if len(record.has_part[i]) > 1: + url_array = record.has_part[i] + record.has_part.pop(i) + for elem in url_array: + record.has_part.append(elem) + else: + record.has_part[i] = ''.join(record.has_part[i]) @staticmethod - def generateManifest(record, sourceURI, manifestURI): - manifest = WebpubManifest(sourceURI, 'application/pdf') + def generate_manifest(record, source_url, manifest_url): + manifest = WebpubManifest(source_url, 'application/pdf') manifest.addMetadata( record, conformsTo=os.environ['WEBPUB_PDF_PROFILE'] ) - manifest.addChapter(sourceURI, record.title) + manifest.addChapter(source_url, record.title) manifest.links.append({ 'rel': 'self', - 'href': manifestURI, + 'href': manifest_url, 'type': 'application/webpub+json' }) diff --git a/processes/ingest/doab.py b/processes/ingest/doab.py index a492a5641e..a9b70de459 100644 --- a/processes/ingest/doab.py +++ b/processes/ingest/doab.py @@ -158,9 +158,6 @@ def downloadOAIRecords(self, fullOrPartial, startTimestamp, resumptionToken=None raise DOABError(f'Received {doabResponse.status_code} status code from {doabURL}') - def createManifestInS3(self, manifestPath, manifestJSON): - self.putObjectInBucket(manifestJSON.encode('utf-8'), manifestPath, self.s3Bucket) - def sendFileToProcessingQueue(self, fileURL, s3Location): s3Message = { 'fileData': { diff --git a/processes/ingest/loc.py b/processes/ingest/loc.py index 746cf6aa19..cba0f7f9dc 100644 --- a/processes/ingest/loc.py +++ b/processes/ingest/loc.py @@ -254,11 +254,6 @@ def storeEpubsInS3(self, record): self.sendFileToProcessingQueue(uri, bucketLocation) break - def createManifestInS3(self, manifestPath, manifestJSON): - self.putObjectInBucket( - manifestJSON.encode('utf-8'), manifestPath, self.s3Bucket - ) - def addEPUBManifest(self, record, itemNo, source, flagStr, mediaType, location): s3URL = 'https://{}.s3.amazonaws.com/{}'.format(self.s3Bucket, location) diff --git a/processes/ingest/met.py b/processes/ingest/met.py index 345e86420f..440d8ab6b3 100644 --- a/processes/ingest/met.py +++ b/processes/ingest/met.py @@ -195,11 +195,6 @@ def storePDFManifest(self, record): break - def createManifestInS3(self, manifestPath, manifestJSON): - self.putObjectInBucket( - manifestJSON.encode('utf-8'), manifestPath, self.s3Bucket - ) - @staticmethod def generateManifest(record, sourceURI, manifestURI): manifest = WebpubManifest(sourceURI, 'application/pdf') diff --git a/processes/ingest/u_of_m.py b/processes/ingest/u_of_m.py index 16eb11aad0..9cf93ba16e 100644 --- a/processes/ingest/u_of_m.py +++ b/processes/ingest/u_of_m.py @@ -166,11 +166,6 @@ def storePDFManifest(self, record): record.has_part.insert(0, linkString) break - def createManifestInS3(self, manifestPath, manifestJSON): - self.putObjectInBucket( - manifestJSON.encode('utf-8'), manifestPath, self.s3Bucket - ) - @staticmethod def generateManifest(record, sourceURI, manifestURI): manifest = WebpubManifest(sourceURI, 'application/pdf') diff --git a/processes/ingest/u_of_sc.py b/processes/ingest/u_of_sc.py index b08fdb8897..86938e431c 100644 --- a/processes/ingest/u_of_sc.py +++ b/processes/ingest/u_of_sc.py @@ -76,11 +76,6 @@ def storePDFManifest(self, record): record.has_part.insert(0, linkString) break - def createManifestInS3(self, manifestPath, manifestJSON): - self.putObjectInBucket( - manifestJSON.encode('utf-8'), manifestPath, self.s3Bucket - ) - @staticmethod def generateManifest(record, sourceURI, manifestURI): manifest = WebpubManifest(sourceURI, 'application/pdf') diff --git a/tests/unit/processes/ingest/test_chicago_isac_process.py b/tests/unit/processes/ingest/test_chicago_isac_process.py index d3d930e8ec..8a768c0620 100644 --- a/tests/unit/processes/ingest/test_chicago_isac_process.py +++ b/tests/unit/processes/ingest/test_chicago_isac_process.py @@ -14,91 +14,84 @@ def teardown_class(cls): TestHelpers.clearEnvVars() @pytest.fixture - def testProcess(self, mocker): + def test_process(self, mocker): class TestISAC(ChicagoISACProcess): def __init__(self): self.s3Bucket = 'test_aws_bucket' - self.s3Client = mocker.MagicMock(s3Client='testS3Client') - self.session = mocker.MagicMock(session='testSession') - self.records = mocker.MagicMock(record='testRecord') - self.batchSize = mocker.MagicMock(batchSize='testBatchSize') + self.s3_client = mocker.MagicMock(s3_client='test_s3_client') + self.session = mocker.MagicMock(session='test_session') + self.records = mocker.MagicMock(record='test_record') + self.batch_size = mocker.MagicMock(batch_size='test_batch_size') return TestISAC() - def test_runProcess(self, testProcess, mocker): - runMocks = mocker.patch.multiple( + def test_run_process(self, test_process, mocker): + run_mocks = mocker.patch.multiple( ChicagoISACProcess, saveRecords=mocker.DEFAULT, commitChanges=mocker.DEFAULT ) - testProcess.runProcess() + test_process.runProcess() - runMocks['saveRecords'].assert_called_once() - runMocks['commitChanges'].assert_called_once() + run_mocks['saveRecords'].assert_called_once() + run_mocks['commitChanges'].assert_called_once() - def test_processChicagoISACRecord_success(self, testProcess, mocker): + def test_process_chicago_isac_record_success(self, test_process, mocker): processMocks = mocker.patch.multiple(ChicagoISACProcess, - storePDFManifest=mocker.DEFAULT, + store_pdf_manifest=mocker.DEFAULT, addDCDWToUpdateList=mocker.DEFAULT ) - mockMapping = mocker.MagicMock(record='testRecord') - mockMapper = mocker.patch('processes.ingest.chicago_isac.ChicagoISACMapping') - mockMapper.return_value = mockMapping + mock_mapping = mocker.MagicMock(record='test_record') + mock_mapper = mocker.patch('processes.ingest.chicago_isac.ChicagoISACMapping') + mock_mapper.return_value = mock_mapping - testProcess.processChicagoISACRecord(mockMapping) + test_process.process_chicago_isac_record(mock_mapping) - mockMapping.applyMapping.assert_called_once() + mock_mapping.applyMapping.assert_called_once() - processMocks['storePDFManifest'].assert_called_once_with('testRecord') - processMocks['addDCDWToUpdateList'].assert_called_once_with(mockMapping) + processMocks['store_pdf_manifest'].assert_called_once_with('test_record') + processMocks['addDCDWToUpdateList'].assert_called_once_with(mock_mapping) - def test_processChicagoISACRecord_error(self, mocker): + def test_process_chicago_isac_record_error(self, mocker): - mockMapper = mocker.patch('processes.ingest.chicago_isac.ChicagoISACMapping') - mockMapper.side_effect = MappingError('testError') + mock_mapper = mocker.patch('processes.ingest.chicago_isac.ChicagoISACMapping') + mock_mapper.side_effect = MappingError('testError') assert pytest.raises(MappingError) - def test_storePDFManifest(self, testProcess, mocker): - mockRecord = mocker.MagicMock(identifiers=['1|isac']) - mockRecord.has_part = [ - ['1|testURI|isac|application/pdf|{}', - '2|testURIOther|isac|application/pdf|{}'], + def test_store_pdf_manifest(self, test_process, mocker): + mock_record = mocker.MagicMock(identifiers=['1|isac']) + mock_record.has_part = [ + ['1|test_url|isac|application/pdf|{}', + '2|test_url_other|isac|application/pdf|{}'], ] - mockGenerateMan = mocker.patch.object(ChicagoISACProcess, 'generateManifest') - mockGenerateMan.return_value = 'testJSON' - mockCreateMan = mocker.patch.object(ChicagoISACProcess, 'createManifestInS3') + mock_generate_man = mocker.patch.object(ChicagoISACProcess, 'generate_manifest') + mock_generate_man.return_value = 'test_json' + mock_create_man = mocker.patch.object(ChicagoISACProcess, 'createManifestInS3') - testProcess.storePDFManifest(mockRecord) + test_process.store_pdf_manifest(mock_record) - testManifestURI = 'https://test_aws_bucket.s3.amazonaws.com/manifests/isac/1.json' - assert mockRecord.has_part[0] == '1|{}|isac|application/webpub+json|{{}}'.format(testManifestURI) + test_manifest_url = 'https://test_aws_bucket.s3.amazonaws.com/manifests/isac/1.json' + assert mock_record.has_part[0] == '1|{}|isac|application/webpub+json|{{}}'.format(test_manifest_url) - mockGenerateMan.assert_called_once_with(mockRecord, 'testURI', testManifestURI) - mockCreateMan.assert_called_once_with('manifests/isac/1.json', 'testJSON') + mock_generate_man.assert_called_once_with(mock_record, 'test_url', test_manifest_url) + mock_create_man.assert_called_once_with('manifests/isac/1.json', 'test_json') - def test_createManifestInS3(self, testProcess, mocker): - mockPut = mocker.patch.object(ChicagoISACProcess, 'putObjectInBucket') - - testProcess.createManifestInS3('testPath', '{"data": "testJSON"}') - - mockPut.assert_called_once_with(b'{"data": "testJSON"}', 'testPath', 'test_aws_bucket') - - def test_generateManifest(self, mocker): - mockManifest = mocker.MagicMock(links=[]) - mockManifest.toJson.return_value = 'testJSON' - mockManifestConstructor = mocker.patch('processes.ingest.chicago_isac.WebpubManifest') - mockManifestConstructor.return_value = mockManifest + def test_generate_manifest(self, mocker): + mock_manifest = mocker.MagicMock(links=[]) + mock_manifest.toJson.return_value = 'test_json' + mock_manifest_constructor = mocker.patch('processes.ingest.chicago_isac.WebpubManifest') + mock_manifest_constructor.return_value = mock_manifest - mockRecord = mocker.MagicMock(title='testTitle') - testManifest = ChicagoISACProcess.generateManifest(mockRecord, 'sourceURI', 'manifestURI') + mock_record = mocker.MagicMock(title='test_title') + test_manifest = ChicagoISACProcess.generate_manifest(mock_record, 'source_url', 'manifest_url') - assert testManifest == 'testJSON' - assert mockManifest.links[0] == {'rel': 'self', 'href': 'manifestURI', 'type': 'application/webpub+json'} + assert test_manifest == 'test_json' + assert mock_manifest.links[0] == {'rel': 'self', 'href': 'manifest_url', 'type': 'application/webpub+json'} - mockManifest.addMetadata.assert_called_once_with(mockRecord, conformsTo='test_profile_uri') - mockManifest.addChapter.assert_called_once_with('sourceURI', 'testTitle') \ No newline at end of file + mock_manifest.addMetadata.assert_called_once_with(mock_record, conformsTo='test_profile_uri') + mock_manifest.addChapter.assert_called_once_with('source_url', 'test_title')