Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SFR-2265: Refactored Chicago ISAC mapping and processes #411

Merged
merged 5 commits into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions managers/s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@ def createS3Bucket(self, bucketName, bucketPermissions):

raise S3Error('Unable to create bucket in s3')


def createManifestInS3(self, manifestPath, manifestJSON):
    """Store a serialized manifest in this manager's S3 bucket.

    The manifest text is encoded as UTF-8 and handed to
    ``putObjectInBucket`` with ``manifestPath`` as the object key and
    ``self.s3Bucket`` as the destination bucket.

    Args:
        manifestPath: Object key under which the manifest is written.
        manifestJSON: Manifest content as a ``str``.
    """
    encodedManifest = manifestJSON.encode('utf-8')
    self.putObjectInBucket(encodedManifest, manifestPath, self.s3Bucket)

def putObjectInBucket(
self, obj, objKey, bucket, bucketPermissions='public-read'
):
Expand Down
1 change: 0 additions & 1 deletion mappings/chicagoISAC.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ def createMapping(self):
def applyFormatting(self):
self.record.source = 'isac'

#Formatting for multiple isbns
mitri-slory marked this conversation as resolved.
Show resolved Hide resolved
if ',' in self.record.identifiers[0]:
identifierArray = self.record.identifiers[0].split(', ')
updatedIdentifierArray = []
Expand Down
58 changes: 26 additions & 32 deletions processes/ingest/chicago_isac.py
mitri-slory marked this conversation as resolved.
Show resolved Hide resolved
mitri-slory marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
import json
import os
from requests.exceptions import HTTPError, ConnectionError

from ..core import CoreProcess
from mappings.core import MappingError
from mappings.chicagoISAC import ChicagoISACMapping
from managers import WebpubManifest
from logger import createLog
Expand All @@ -15,41 +13,38 @@ class ChicagoISACProcess(CoreProcess):
def __init__(self, *args):
    """Set up ingest bounds, the database session and the S3 client.

    The first four positional arguments are forwarded to the parent
    initializer. ``args[4]`` is an optional record limit and ``args[5]``
    an optional offset; both are converted with ``int`` when truthy.
    """
    super(ChicagoISACProcess, self).__init__(*args[:4])

    # The offset is resolved first because the limit is stored as
    # "offset + requested count" rather than as a bare count.
    self.ingestOffset = int(args[5] or 0)
    if args[4]:
        self.ingestLimit = int(args[4]) + self.ingestOffset
    else:
        self.ingestLimit = 5000

    # NOTE(review): self.process is presumably set by the parent
    # initializer -- confirm against CoreProcess.
    self.fullImport = (self.process == 'complete')

    # Database connection
    self.generateEngine()
    self.createSession()

    # S3 configuration; raises KeyError when FILE_BUCKET is not set
    self.s3Bucket = os.environ['FILE_BUCKET']
    self.createS3Client()

def runProcess(self):
def run_process(self):
mitri-slory marked this conversation as resolved.
Show resolved Hide resolved
with open('ingestJSONFiles/chicagoISAC_metadata.json') as f:
chicagoISACData = json.load(f)
chicagoISACData = json.load(f)

for metaDict in chicagoISACData:
self.processChicagoISACRecord(metaDict)
self.process_chicago_isac_record(metaDict)

self.saveRecords()
self.commitChanges()

def processChicagoISACRecord(self, record):
logger.info(f'Ingested {len(self.records)} ISAC records')

def process_chicago_isac_record(self, record):
try:
chicagoISACRec = ChicagoISACMapping(record)
chicagoISACRec.applyMapping()
self.storePDFManifest(chicagoISACRec.record)
self.store_pdf_manifest(chicagoISACRec.record)
self.addDCDWToUpdateList(chicagoISACRec)

except (MappingError, HTTPError, ConnectionError, IndexError, TypeError) as e:
logger.exception(e)
logger.warn(ChicagoISACError('Unable to process ISAC record'))
except Exception:
logger.exception(ChicagoISACError('Unable to process ISAC record'))


def storePDFManifest(self, record):
def store_pdf_manifest(self, record):
for link in record.has_part:
itemNo, uri, source, mediaType, flags = link[0].split('|')

Expand All @@ -61,7 +56,7 @@ def storePDFManifest(self, record):
self.s3Bucket, manifestPath
)

manifestJSON = self.generateManifest(record, uri, manifestURI)
manifestJSON = self.generate_manifest(record, uri, manifestURI)

self.createManifestInS3(manifestPath, manifestJSON)

Expand All @@ -72,26 +67,25 @@ def storePDFManifest(self, record):
'application/webpub+json',
flags
])

record.has_part.insert(0, linkString)
#The second element of the has_part array will always be an array of pdfURLS from the json file
#Those elements of the array are popped out and appended back into the has_part array as strings instead of an array
for i in range(1, len(record.has_part)):
if len(record.has_part[i]) > 1:
urlArray = record.has_part[i]
record.has_part.pop(i)
for elem in urlArray:
record.has_part.append(elem)
else:
record.has_part[i] = ''.join(record.has_part[i])
break
self.change_has_part_url_array_to_string(record)

def createManifestInS3(self, manifestPath, manifestJSON):
self.putObjectInBucket(
manifestJSON.encode('utf-8'), manifestPath, self.s3Bucket
)
break

@staticmethod
mitri-slory marked this conversation as resolved.
Show resolved Hide resolved
def change_has_part_url_array_to_string(record):
    """Flatten nested URL arrays in ``record.has_part`` into plain entries.

    The first element of ``record.has_part`` is left untouched. Every
    later element that is a list or tuple is replaced, in place and in
    order, by its own members; elements that are already strings are
    kept unchanged. Empty arrays contribute nothing.

    The list is rebuilt in a single pass and written back with a slice
    assignment. The previous implementation popped and appended while
    looping over indices computed from the original length, which skipped
    entries when more than one array was present and split already
    flattened strings into single characters.

    Args:
        record: Object exposing a mutable ``has_part`` list.

    Returns:
        None. ``record.has_part`` is modified in place (same list object).
    """
    flattened = []
    for entry in record.has_part[1:]:
        if isinstance(entry, (list, tuple)):
            # Nested array of link strings: splice its members in order.
            flattened.extend(entry)
        else:
            # Already a single link string; must not be iterated char-wise.
            flattened.append(entry)

    # Slice assignment keeps the identity of the has_part list intact for
    # any caller holding a reference to it.
    record.has_part[1:] = flattened

@staticmethod
def generateManifest(record, sourceURI, manifestURI):
def generate_manifest(record, sourceURI, manifestURI):
manifest = WebpubManifest(sourceURI, 'application/pdf')

manifest.addMetadata(
Expand Down
3 changes: 0 additions & 3 deletions processes/ingest/doab.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,9 +158,6 @@ def downloadOAIRecords(self, fullOrPartial, startTimestamp, resumptionToken=None

raise DOABError(f'Received {doabResponse.status_code} status code from {doabURL}')

def createManifestInS3(self, manifestPath, manifestJSON):
self.putObjectInBucket(manifestJSON.encode('utf-8'), manifestPath, self.s3Bucket)

def sendFileToProcessingQueue(self, fileURL, s3Location):
s3Message = {
'fileData': {
Expand Down
5 changes: 0 additions & 5 deletions processes/ingest/loc.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,11 +254,6 @@ def storeEpubsInS3(self, record):
self.sendFileToProcessingQueue(uri, bucketLocation)
break

def createManifestInS3(self, manifestPath, manifestJSON):
self.putObjectInBucket(
manifestJSON.encode('utf-8'), manifestPath, self.s3Bucket
)

def addEPUBManifest(self, record, itemNo, source, flagStr, mediaType, location):
s3URL = 'https://{}.s3.amazonaws.com/{}'.format(self.s3Bucket, location)

Expand Down
5 changes: 0 additions & 5 deletions processes/ingest/met.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,11 +195,6 @@ def storePDFManifest(self, record):

break

def createManifestInS3(self, manifestPath, manifestJSON):
self.putObjectInBucket(
manifestJSON.encode('utf-8'), manifestPath, self.s3Bucket
)

@staticmethod
def generateManifest(record, sourceURI, manifestURI):
manifest = WebpubManifest(sourceURI, 'application/pdf')
Expand Down
5 changes: 0 additions & 5 deletions processes/ingest/u_of_m.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,11 +166,6 @@ def storePDFManifest(self, record):
record.has_part.insert(0, linkString)
break

def createManifestInS3(self, manifestPath, manifestJSON):
self.putObjectInBucket(
manifestJSON.encode('utf-8'), manifestPath, self.s3Bucket
)

mitri-slory marked this conversation as resolved.
Show resolved Hide resolved
@staticmethod
def generateManifest(record, sourceURI, manifestURI):
manifest = WebpubManifest(sourceURI, 'application/pdf')
Expand Down
5 changes: 0 additions & 5 deletions processes/ingest/u_of_sc.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,6 @@ def storePDFManifest(self, record):
record.has_part.insert(0, linkString)
break

def createManifestInS3(self, manifestPath, manifestJSON):
self.putObjectInBucket(
manifestJSON.encode('utf-8'), manifestPath, self.s3Bucket
)

mitri-slory marked this conversation as resolved.
Show resolved Hide resolved
@staticmethod
def generateManifest(record, sourceURI, manifestURI):
manifest = WebpubManifest(sourceURI, 'application/pdf')
Expand Down
16 changes: 8 additions & 8 deletions tests/unit/processes/ingest/test_chicago_isac_process.py
mitri-slory marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -32,27 +32,27 @@ def test_runProcess(self, testProcess, mocker):
commitChanges=mocker.DEFAULT
)

testProcess.runProcess()
testProcess.run_process()

runMocks['saveRecords'].assert_called_once()
runMocks['commitChanges'].assert_called_once()


def test_processChicagoISACRecord_success(self, testProcess, mocker):
processMocks = mocker.patch.multiple(ChicagoISACProcess,
storePDFManifest=mocker.DEFAULT,
store_pdf_manifest=mocker.DEFAULT,
addDCDWToUpdateList=mocker.DEFAULT
)

mockMapping = mocker.MagicMock(record='testRecord')
mockMapper = mocker.patch('processes.ingest.chicago_isac.ChicagoISACMapping')
mockMapper.return_value = mockMapping

testProcess.processChicagoISACRecord(mockMapping)
testProcess.process_chicago_isac_record(mockMapping)

mockMapping.applyMapping.assert_called_once()

processMocks['storePDFManifest'].assert_called_once_with('testRecord')
processMocks['store_pdf_manifest'].assert_called_once_with('testRecord')
processMocks['addDCDWToUpdateList'].assert_called_once_with(mockMapping)

def test_processChicagoISACRecord_error(self, mocker):
Expand All @@ -62,18 +62,18 @@ def test_processChicagoISACRecord_error(self, mocker):

assert pytest.raises(MappingError)

def test_storePDFManifest(self, testProcess, mocker):
def test_store_pdf_manifest(self, testProcess, mocker):
mockRecord = mocker.MagicMock(identifiers=['1|isac'])
mockRecord.has_part = [
['1|testURI|isac|application/pdf|{}',
'2|testURIOther|isac|application/pdf|{}'],
]

mockGenerateMan = mocker.patch.object(ChicagoISACProcess, 'generateManifest')
mockGenerateMan = mocker.patch.object(ChicagoISACProcess, 'generate_manifest')
mockGenerateMan.return_value = 'testJSON'
mockCreateMan = mocker.patch.object(ChicagoISACProcess, 'createManifestInS3')

testProcess.storePDFManifest(mockRecord)
testProcess.store_pdf_manifest(mockRecord)

testManifestURI = 'https://test_aws_bucket.s3.amazonaws.com/manifests/isac/1.json'
assert mockRecord.has_part[0] == '1|{}|isac|application/webpub+json|{{}}'.format(testManifestURI)
Expand All @@ -95,7 +95,7 @@ def test_generateManifest(self, mocker):
mockManifestConstructor.return_value = mockManifest

mockRecord = mocker.MagicMock(title='testTitle')
testManifest = ChicagoISACProcess.generateManifest(mockRecord, 'sourceURI', 'manifestURI')
testManifest = ChicagoISACProcess.generate_manifest(mockRecord, 'sourceURI', 'manifestURI')

assert testManifest == 'testJSON'
assert mockManifest.links[0] == {'rel': 'self', 'href': 'manifestURI', 'type': 'application/webpub+json'}
Expand Down
Loading