Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SFR-2216: Fixing LOC process ingestion #392

Merged
merged 2 commits into from
Oct 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions mappings/loc.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,9 @@ def applyFormatting(self):
self.record.source = 'loc'
if self.record.medium:
self.record.medium = self.record.medium[0]
if len(self.record.is_part_of) == 0:
if self.record.is_part_of and len(self.record.is_part_of) == 0:
self.record.is_part_of = None
if len(self.record.abstract) == 0:
if self.record.abstract and len(self.record.abstract) == 0:
self.record.abstract = None

#Convert string repr of list to actual list
Expand All @@ -62,6 +62,9 @@ def formatIdentifierSourceID(self, itemList):
lccnNumber = self.record.identifiers[0][0] #lccnNumber comes in as an array and we need the string inside the array
sourceID = lccnNumber
if 'call_number' in newIdentifier.keys():
if not isinstance(newIdentifier['call_number'], list):
newIdentifier['call_number'] = list(newIdentifier['call_number'])

newIdentifier['call_number'][0] = f'{newIdentifier["call_number"][0]}|call_number'
callNumber = newIdentifier['call_number'][0].strip(' ')
else:
Expand All @@ -77,7 +80,7 @@ def formatPubSpatial(self, itemList):
if ':' not in elem:
createdPublishedList = elem.split(',', 1)
pubLocation = createdPublishedList[0].strip(' ')
if ',' in createdPublishedList[1]:
if len(createdPublishedList) >= 2 and ',' in createdPublishedList[1]:
pubOnly = createdPublishedList[1].split(',')[0].strip(' ')
pubArray.append(pubOnly)
spatialString = pubLocation
Expand Down
19 changes: 15 additions & 4 deletions processes/loc.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ def runProcess(self):

self.saveRecords()
self.commitChanges()

logger.info(f'Ingested {len(self.records)} LOC records')


def importLOCRecords(self, startTimeStamp=None):
Expand Down Expand Up @@ -83,6 +85,9 @@ def importOpenAccessRecords(self, count, customTimeStamp):
# An HTTP error will occur when the sp parameter value
# passes the last page number of the collection search results
while sp < 100000:
if self.ingestLimit and count >= self.ingestLimit:
break

openAccessURL = '{}&sp={}'.format(LOC_ROOT_OPEN_ACCESS, sp)
jsonData = self.fetchPageJSON(openAccessURL)
LOCData = jsonData.json()
Expand Down Expand Up @@ -129,6 +134,9 @@ def importDigitizedRecords(self, count, customTimeStamp):
# An HTTP error will occur when the sp parameter value
# passes the last page number of the collection search results
while sp < 100000:
if self.ingestLimit and count >= self.ingestLimit:
break

digitizedURL = '{}&sp={}'.format(LOC_ROOT_DIGIT, sp)
jsonData = self.fetchPageJSON(digitizedURL)
LOCData = jsonData.json()
Expand Down Expand Up @@ -170,14 +178,17 @@ def processLOCRecord(self, record):
try:
LOCRec = LOCMapping(record)
LOCRec.applyMapping()

if LOCRec.record.authors is None:
logger.warning(f'Unable to map author in LOC record {LOCRec.record} ')
return

self.addHasPartMapping(record, LOCRec.record)
self.storePDFManifest(LOCRec.record)
self.storeEpubsInS3(LOCRec.record)
self.addDCDWToUpdateList(LOCRec)

except (MappingError, HTTPError, ConnectionError, IndexError, TypeError) as e:
logger.exception(e)
logger.warn(LOCError('Unable to process LOC record'))
except Exception:
logger.exception(f'Unable to process LOC record')

def addHasPartMapping(self, resultsRecord, record):
if 'pdf' in resultsRecord['resources'][0].keys():
Expand Down
10 changes: 7 additions & 3 deletions tests/unit/test_loc_process.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pytest

from mappings.core import MappingError
from model import Record
from processes.loc import LOCProcess
from tests.helper import TestHelpers

Expand Down Expand Up @@ -49,16 +50,19 @@ def test_processLOCRecord_success(self, testProcess, mocker):
addDCDWToUpdateList=mocker.DEFAULT
)

mockMapping = mocker.MagicMock(record='testRecord')
test_record = Record(authors=[])

mockMapping = mocker.MagicMock()
mockMapping.record = test_record
mockMapper = mocker.patch('processes.loc.LOCMapping')
mockMapper.return_value = mockMapping

testProcess.processLOCRecord(mockMapping)

mockMapping.applyMapping.assert_called_once()

processMocks['addHasPartMapping'].assert_called_once_with(mockMapping, 'testRecord')
processMocks['storePDFManifest'].assert_called_once_with('testRecord')
processMocks['addHasPartMapping'].assert_called_once_with(mockMapping, test_record)
processMocks['storePDFManifest'].assert_called_once_with(test_record)
processMocks['addDCDWToUpdateList'].assert_called_once_with(mockMapping)

def test_processlocRecord_error(self, mocker):
Expand Down
Loading