Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SFR-2265: Refactored Chicago ISAC mapping and processes #411

Merged
merged 5 commits into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions managers/s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@ def createS3Bucket(self, bucketName, bucketPermissions):

raise S3Error('Unable to create bucket in s3')


def createManifestInS3(self, manifestPath, manifestJSON):
    """Upload a serialized manifest to the bucket named by ``self.s3Bucket``.

    Args:
        manifestPath: Object key under which the manifest is stored.
        manifestJSON: Manifest document as a ``str``; it is encoded to
            UTF-8 bytes before being handed to ``putObjectInBucket``.

    Returns:
        None. The result of ``putObjectInBucket`` is not propagated.
    """
    encoded_manifest = manifestJSON.encode('utf-8')
    self.putObjectInBucket(encoded_manifest, manifestPath, self.s3Bucket)

def putObjectInBucket(
self, obj, objKey, bucket, bucketPermissions='public-read'
):
Expand Down
1 change: 0 additions & 1 deletion mappings/chicagoISAC.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ def createMapping(self):
def applyFormatting(self):
self.record.source = 'isac'

#Formatting for multiple isbns
mitri-slory marked this conversation as resolved.
Show resolved Hide resolved
if ',' in self.record.identifiers[0]:
identifierArray = self.record.identifiers[0].split(', ')
updatedIdentifierArray = []
Expand Down
96 changes: 44 additions & 52 deletions processes/ingest/chicago_isac.py
mitri-slory marked this conversation as resolved.
Show resolved Hide resolved
mitri-slory marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
import json
import os
from requests.exceptions import HTTPError, ConnectionError

from ..core import CoreProcess
from mappings.core import MappingError
from mappings.chicagoISAC import ChicagoISACMapping
from managers import WebpubManifest
from logger import createLog
Expand All @@ -15,95 +13,89 @@ class ChicagoISACProcess(CoreProcess):
def __init__(self, *args):
    """Set up ingest bounds, the database session and the S3 client.

    Args:
        *args: Positional process arguments. The first four are forwarded
            to the parent initializer; ``args[4]`` is the optional ingest
            limit and ``args[5]`` the optional ingest offset.

    Raises:
        IndexError: If fewer than six positional arguments are supplied.
        KeyError: If the ``FILE_BUCKET`` environment variable is unset.
    """
    super(ChicagoISACProcess, self).__init__(*args[:4])

    raw_limit, raw_offset = args[4], args[5]

    # A falsy offset (None, '', 0) means "start from the beginning".
    self.ingestOffset = int(raw_offset or 0)

    # The stored limit is an end position: requested limit plus offset.
    # Without a requested limit it falls back to a flat 5000.
    if raw_limit:
        self.ingestLimit = int(raw_limit) + self.ingestOffset
    else:
        self.ingestLimit = 5000

    # NOTE(review): self.process is presumably set by the parent
    # initializer above -- confirm against CoreProcess.
    self.fullImport = (self.process == 'complete')

    # Database connection
    self.generateEngine()
    self.createSession()

    # S3 configuration
    self.s3Bucket = os.environ['FILE_BUCKET']
    self.createS3Client()

def runProcess(self):
def runProcess(self):
with open('ingestJSONFiles/chicagoISAC_metadata.json') as f:
chicagoISACData = json.load(f)
chicago_isac_data = json.load(f)

for metaDict in chicagoISACData:
self.processChicagoISACRecord(metaDict)
for meta_dict in chicago_isac_data:
self.process_chicago_isac_record(meta_dict)

self.saveRecords()
self.commitChanges()

def processChicagoISACRecord(self, record):
logger.info(f'Ingested {len(self.records)} ISAC records')

def process_chicago_isac_record(self, record):
try:
chicagoISACRec = ChicagoISACMapping(record)
chicagoISACRec.applyMapping()
self.storePDFManifest(chicagoISACRec.record)
self.addDCDWToUpdateList(chicagoISACRec)
chicago_isac_rec = ChicagoISACMapping(record)
chicago_isac_rec.applyMapping()
self.store_pdf_manifest(chicago_isac_rec.record)
self.addDCDWToUpdateList(chicago_isac_rec)

except (MappingError, HTTPError, ConnectionError, IndexError, TypeError) as e:
logger.exception(e)
logger.warn(ChicagoISACError('Unable to process ISAC record'))
except Exception:
logger.exception(ChicagoISACError('Unable to process ISAC record'))


def storePDFManifest(self, record):
def store_pdf_manifest(self, record):
for link in record.has_part:
itemNo, uri, source, mediaType, flags = link[0].split('|')
item_no, uri, source, media_type, flags = link[0].split('|')

if mediaType == 'application/pdf':
recordID = record.identifiers[0].split('|')[0]
if media_type == 'application/pdf':
record_id = record.identifiers[0].split('|')[0]

manifestPath = 'manifests/{}/{}.json'.format(source, recordID)
manifestURI = 'https://{}.s3.amazonaws.com/{}'.format(
self.s3Bucket, manifestPath
manifest_path = 'manifests/{}/{}.json'.format(source, record_id)
manifest_url = 'https://{}.s3.amazonaws.com/{}'.format(
self.s3Bucket, manifest_path
)

manifestJSON = self.generateManifest(record, uri, manifestURI)
manifest_json = self.generate_manifest(record, uri, manifest_url)

self.createManifestInS3(manifestPath, manifestJSON)
self.createManifestInS3(manifest_path, manifest_json)

linkString = '|'.join([
itemNo,
manifestURI,
link_string = '|'.join([
item_no,
manifest_url,
source,
'application/webpub+json',
flags
])
record.has_part.insert(0, linkString)
#The second element of the has_part array will always be an array of pdfURLS from the json file
#Those elements of the array are popped out and appended back into the has_part array as strings instead of an array
for i in range(1, len(record.has_part)):
if len(record.has_part[i]) > 1:
urlArray = record.has_part[i]
record.has_part.pop(i)
for elem in urlArray:
record.has_part.append(elem)
else:
record.has_part[i] = ''.join(record.has_part[i])
break

def createManifestInS3(self, manifestPath, manifestJSON):
self.putObjectInBucket(
manifestJSON.encode('utf-8'), manifestPath, self.s3Bucket
)
record.has_part.insert(0, link_string)
self.change_has_part_url_array_to_string(record)

break

@staticmethod
mitri-slory marked this conversation as resolved.
Show resolved Hide resolved
def change_has_part_url_array_to_string(record):
    """Flatten nested URL arrays in ``record.has_part`` into plain strings.

    The entry at index 0 is left untouched. Every later entry that is a
    list or tuple is replaced, in position, by its own elements; entries
    that are already strings are kept whole. The list object itself is
    updated in place, so existing references to ``record.has_part`` see
    the flattened result.

    The earlier implementation used ``len(entry) > 1`` to recognise an
    array, which is also true for any ordinary link string, and it popped
    and appended on the list while indexing over its original length.
    With more than one nested array (or a plain string after index 0)
    that broke link strings into single characters and skipped entries.

    Note: an empty nested array now contributes nothing, where it used to
    leave an empty string behind.

    Args:
        record: Object exposing a mutable ``has_part`` list.

    Returns:
        None. ``record.has_part`` is modified in place.
    """
    flattened = []
    for part in record.has_part[1:]:
        if isinstance(part, (list, tuple)):
            flattened.extend(part)
        else:
            flattened.append(part)

    # Slice assignment keeps the same list object the caller holds.
    record.has_part[1:] = flattened

@staticmethod
def generateManifest(record, sourceURI, manifestURI):
manifest = WebpubManifest(sourceURI, 'application/pdf')
def generate_manifest(record, source_url, manifest_url):
manifest = WebpubManifest(source_url, 'application/pdf')

manifest.addMetadata(
record,
conformsTo=os.environ['WEBPUB_PDF_PROFILE']
)

manifest.addChapter(sourceURI, record.title)
manifest.addChapter(source_url, record.title)

manifest.links.append({
'rel': 'self',
'href': manifestURI,
'href': manifest_url,
'type': 'application/webpub+json'
})

Expand Down
3 changes: 0 additions & 3 deletions processes/ingest/doab.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,9 +158,6 @@ def downloadOAIRecords(self, fullOrPartial, startTimestamp, resumptionToken=None

raise DOABError(f'Received {doabResponse.status_code} status code from {doabURL}')

def createManifestInS3(self, manifestPath, manifestJSON):
self.putObjectInBucket(manifestJSON.encode('utf-8'), manifestPath, self.s3Bucket)

def sendFileToProcessingQueue(self, fileURL, s3Location):
s3Message = {
'fileData': {
Expand Down
5 changes: 0 additions & 5 deletions processes/ingest/loc.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,11 +254,6 @@ def storeEpubsInS3(self, record):
self.sendFileToProcessingQueue(uri, bucketLocation)
break

def createManifestInS3(self, manifestPath, manifestJSON):
self.putObjectInBucket(
manifestJSON.encode('utf-8'), manifestPath, self.s3Bucket
)

def addEPUBManifest(self, record, itemNo, source, flagStr, mediaType, location):
s3URL = 'https://{}.s3.amazonaws.com/{}'.format(self.s3Bucket, location)

Expand Down
5 changes: 0 additions & 5 deletions processes/ingest/met.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,11 +195,6 @@ def storePDFManifest(self, record):

break

def createManifestInS3(self, manifestPath, manifestJSON):
self.putObjectInBucket(
manifestJSON.encode('utf-8'), manifestPath, self.s3Bucket
)

@staticmethod
def generateManifest(record, sourceURI, manifestURI):
manifest = WebpubManifest(sourceURI, 'application/pdf')
Expand Down
5 changes: 0 additions & 5 deletions processes/ingest/u_of_m.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,11 +166,6 @@ def storePDFManifest(self, record):
record.has_part.insert(0, linkString)
break

def createManifestInS3(self, manifestPath, manifestJSON):
self.putObjectInBucket(
manifestJSON.encode('utf-8'), manifestPath, self.s3Bucket
)

mitri-slory marked this conversation as resolved.
Show resolved Hide resolved
@staticmethod
def generateManifest(record, sourceURI, manifestURI):
manifest = WebpubManifest(sourceURI, 'application/pdf')
Expand Down
5 changes: 0 additions & 5 deletions processes/ingest/u_of_sc.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,6 @@ def storePDFManifest(self, record):
record.has_part.insert(0, linkString)
break

def createManifestInS3(self, manifestPath, manifestJSON):
self.putObjectInBucket(
manifestJSON.encode('utf-8'), manifestPath, self.s3Bucket
)

mitri-slory marked this conversation as resolved.
Show resolved Hide resolved
@staticmethod
def generateManifest(record, sourceURI, manifestURI):
manifest = WebpubManifest(sourceURI, 'application/pdf')
Expand Down
99 changes: 46 additions & 53 deletions tests/unit/processes/ingest/test_chicago_isac_process.py
mitri-slory marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -14,91 +14,84 @@ def teardown_class(cls):
TestHelpers.clearEnvVars()

@pytest.fixture
def testProcess(self, mocker):
def test_process(self, mocker):
class TestISAC(ChicagoISACProcess):
def __init__(self):
self.s3Bucket = 'test_aws_bucket'
self.s3Client = mocker.MagicMock(s3Client='testS3Client')
self.session = mocker.MagicMock(session='testSession')
self.records = mocker.MagicMock(record='testRecord')
self.batchSize = mocker.MagicMock(batchSize='testBatchSize')
self.s3_client = mocker.MagicMock(s3_client='test_s3_client')
self.session = mocker.MagicMock(session='test_session')
self.records = mocker.MagicMock(record='test_record')
self.batch_size = mocker.MagicMock(batch_size='test_batch_size')

return TestISAC()

def test_runProcess(self, testProcess, mocker):
runMocks = mocker.patch.multiple(
def test_run_process(self, test_process, mocker):
run_mocks = mocker.patch.multiple(
ChicagoISACProcess,
saveRecords=mocker.DEFAULT,
commitChanges=mocker.DEFAULT
)

testProcess.runProcess()
test_process.runProcess()

runMocks['saveRecords'].assert_called_once()
runMocks['commitChanges'].assert_called_once()
run_mocks['saveRecords'].assert_called_once()
run_mocks['commitChanges'].assert_called_once()


def test_processChicagoISACRecord_success(self, testProcess, mocker):
def test_process_chicago_isac_record_success(self, test_process, mocker):
processMocks = mocker.patch.multiple(ChicagoISACProcess,
storePDFManifest=mocker.DEFAULT,
store_pdf_manifest=mocker.DEFAULT,
addDCDWToUpdateList=mocker.DEFAULT
)

mockMapping = mocker.MagicMock(record='testRecord')
mockMapper = mocker.patch('processes.ingest.chicago_isac.ChicagoISACMapping')
mockMapper.return_value = mockMapping
mock_mapping = mocker.MagicMock(record='test_record')
mock_mapper = mocker.patch('processes.ingest.chicago_isac.ChicagoISACMapping')
mock_mapper.return_value = mock_mapping

testProcess.processChicagoISACRecord(mockMapping)
test_process.process_chicago_isac_record(mock_mapping)

mockMapping.applyMapping.assert_called_once()
mock_mapping.applyMapping.assert_called_once()

processMocks['storePDFManifest'].assert_called_once_with('testRecord')
processMocks['addDCDWToUpdateList'].assert_called_once_with(mockMapping)
processMocks['store_pdf_manifest'].assert_called_once_with('test_record')
processMocks['addDCDWToUpdateList'].assert_called_once_with(mock_mapping)

def test_processChicagoISACRecord_error(self, mocker):
def test_process_chicago_isac_record_error(self, mocker):

mockMapper = mocker.patch('processes.ingest.chicago_isac.ChicagoISACMapping')
mockMapper.side_effect = MappingError('testError')
mock_mapper = mocker.patch('processes.ingest.chicago_isac.ChicagoISACMapping')
mock_mapper.side_effect = MappingError('testError')

assert pytest.raises(MappingError)

def test_storePDFManifest(self, testProcess, mocker):
mockRecord = mocker.MagicMock(identifiers=['1|isac'])
mockRecord.has_part = [
['1|testURI|isac|application/pdf|{}',
'2|testURIOther|isac|application/pdf|{}'],
def test_store_pdf_manifest(self, test_process, mocker):
mock_record = mocker.MagicMock(identifiers=['1|isac'])
mock_record.has_part = [
['1|test_url|isac|application/pdf|{}',
'2|test_url_other|isac|application/pdf|{}'],
]

mockGenerateMan = mocker.patch.object(ChicagoISACProcess, 'generateManifest')
mockGenerateMan.return_value = 'testJSON'
mockCreateMan = mocker.patch.object(ChicagoISACProcess, 'createManifestInS3')
mock_generate_man = mocker.patch.object(ChicagoISACProcess, 'generate_manifest')
mock_generate_man.return_value = 'test_json'
mock_create_man = mocker.patch.object(ChicagoISACProcess, 'createManifestInS3')

testProcess.storePDFManifest(mockRecord)
test_process.store_pdf_manifest(mock_record)

testManifestURI = 'https://test_aws_bucket.s3.amazonaws.com/manifests/isac/1.json'
assert mockRecord.has_part[0] == '1|{}|isac|application/webpub+json|{{}}'.format(testManifestURI)
test_manifest_url = 'https://test_aws_bucket.s3.amazonaws.com/manifests/isac/1.json'
assert mock_record.has_part[0] == '1|{}|isac|application/webpub+json|{{}}'.format(test_manifest_url)

mockGenerateMan.assert_called_once_with(mockRecord, 'testURI', testManifestURI)
mockCreateMan.assert_called_once_with('manifests/isac/1.json', 'testJSON')
mock_generate_man.assert_called_once_with(mock_record, 'test_url', test_manifest_url)
mock_create_man.assert_called_once_with('manifests/isac/1.json', 'test_json')

def test_createManifestInS3(self, testProcess, mocker):
mockPut = mocker.patch.object(ChicagoISACProcess, 'putObjectInBucket')

testProcess.createManifestInS3('testPath', '{"data": "testJSON"}')

mockPut.assert_called_once_with(b'{"data": "testJSON"}', 'testPath', 'test_aws_bucket')

def test_generateManifest(self, mocker):
mockManifest = mocker.MagicMock(links=[])
mockManifest.toJson.return_value = 'testJSON'
mockManifestConstructor = mocker.patch('processes.ingest.chicago_isac.WebpubManifest')
mockManifestConstructor.return_value = mockManifest
def test_generate_manifest(self, mocker):
mock_manifest = mocker.MagicMock(links=[])
mock_manifest.toJson.return_value = 'test_json'
mock_manifest_constructor = mocker.patch('processes.ingest.chicago_isac.WebpubManifest')
mock_manifest_constructor.return_value = mock_manifest

mockRecord = mocker.MagicMock(title='testTitle')
testManifest = ChicagoISACProcess.generateManifest(mockRecord, 'sourceURI', 'manifestURI')
mock_record = mocker.MagicMock(title='test_title')
test_manifest = ChicagoISACProcess.generate_manifest(mock_record, 'source_url', 'manifest_url')

assert testManifest == 'testJSON'
assert mockManifest.links[0] == {'rel': 'self', 'href': 'manifestURI', 'type': 'application/webpub+json'}
assert test_manifest == 'test_json'
assert mock_manifest.links[0] == {'rel': 'self', 'href': 'manifest_url', 'type': 'application/webpub+json'}

mockManifest.addMetadata.assert_called_once_with(mockRecord, conformsTo='test_profile_uri')
mockManifest.addChapter.assert_called_once_with('sourceURI', 'testTitle')
mock_manifest.addMetadata.assert_called_once_with(mock_record, conformsTo='test_profile_uri')
mock_manifest.addChapter.assert_called_once_with('source_url', 'test_title')
Loading