Skip to content

Commit

Permalink
SFR-2216: Fixing LOC process ingestion (#392)
Browse files Browse the repository at this point in the history
  • Loading branch information
kylevillegas93 authored Oct 7, 2024
1 parent 596efd4 commit 4d7a0aa
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 10 deletions.
9 changes: 6 additions & 3 deletions mappings/loc.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,9 @@ def applyFormatting(self):
self.record.source = 'loc'
if self.record.medium:
self.record.medium = self.record.medium[0]
if len(self.record.is_part_of) == 0:
if self.record.is_part_of and len(self.record.is_part_of) == 0:
self.record.is_part_of = None
if len(self.record.abstract) == 0:
if self.record.abstract and len(self.record.abstract) == 0:
self.record.abstract = None

#Convert string repr of list to actual list
Expand All @@ -62,6 +62,9 @@ def formatIdentifierSourceID(self, itemList):
lccnNumber = self.record.identifiers[0][0] #lccnNumber comes in as an array and we need the string inside the array
sourceID = lccnNumber
if 'call_number' in newIdentifier.keys():
if not isinstance(newIdentifier['call_number'], list):
newIdentifier['call_number'] = list(newIdentifier['call_number'])

newIdentifier['call_number'][0] = f'{newIdentifier["call_number"][0]}|call_number'
callNumber = newIdentifier['call_number'][0].strip(' ')
else:
Expand All @@ -77,7 +80,7 @@ def formatPubSpatial(self, itemList):
if ':' not in elem:
createdPublishedList = elem.split(',', 1)
pubLocation = createdPublishedList[0].strip(' ')
if ',' in createdPublishedList[1]:
if len(createdPublishedList) >= 2 and ',' in createdPublishedList[1]:
pubOnly = createdPublishedList[1].split(',')[0].strip(' ')
pubArray.append(pubOnly)
spatialString = pubLocation
Expand Down
19 changes: 15 additions & 4 deletions processes/loc.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ def runProcess(self):

self.saveRecords()
self.commitChanges()

logger.info(f'Ingested {len(self.records)} LOC records')


def importLOCRecords(self, startTimeStamp=None):
Expand Down Expand Up @@ -83,6 +85,9 @@ def importOpenAccessRecords(self, count, customTimeStamp):
# An HTTP error will occur when the sp parameter value
# passes the last page number of the collection search results
while sp < 100000:
if self.ingestLimit and count >= self.ingestLimit:
break

openAccessURL = '{}&sp={}'.format(LOC_ROOT_OPEN_ACCESS, sp)
jsonData = self.fetchPageJSON(openAccessURL)
LOCData = jsonData.json()
Expand Down Expand Up @@ -129,6 +134,9 @@ def importDigitizedRecords(self, count, customTimeStamp):
# An HTTP error will occur when the sp parameter value
# passes the last page number of the collection search results
while sp < 100000:
if self.ingestLimit and count >= self.ingestLimit:
break

digitizedURL = '{}&sp={}'.format(LOC_ROOT_DIGIT, sp)
jsonData = self.fetchPageJSON(digitizedURL)
LOCData = jsonData.json()
Expand Down Expand Up @@ -170,14 +178,17 @@ def processLOCRecord(self, record):
try:
LOCRec = LOCMapping(record)
LOCRec.applyMapping()

if LOCRec.record.authors is None:
logger.warning(f'Unable to map author in LOC record {LOCRec.record} ')
return

self.addHasPartMapping(record, LOCRec.record)
self.storePDFManifest(LOCRec.record)
self.storeEpubsInS3(LOCRec.record)
self.addDCDWToUpdateList(LOCRec)

except (MappingError, HTTPError, ConnectionError, IndexError, TypeError) as e:
logger.exception(e)
logger.warn(LOCError('Unable to process LOC record'))
except Exception:
logger.exception(f'Unable to process LOC record')

def addHasPartMapping(self, resultsRecord, record):
if 'pdf' in resultsRecord['resources'][0].keys():
Expand Down
10 changes: 7 additions & 3 deletions tests/unit/test_loc_process.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pytest

from mappings.core import MappingError
from model import Record
from processes.loc import LOCProcess
from tests.helper import TestHelpers

Expand Down Expand Up @@ -49,16 +50,19 @@ def test_processLOCRecord_success(self, testProcess, mocker):
addDCDWToUpdateList=mocker.DEFAULT
)

mockMapping = mocker.MagicMock(record='testRecord')
test_record = Record(authors=[])

mockMapping = mocker.MagicMock()
mockMapping.record = test_record
mockMapper = mocker.patch('processes.loc.LOCMapping')
mockMapper.return_value = mockMapping

testProcess.processLOCRecord(mockMapping)

mockMapping.applyMapping.assert_called_once()

processMocks['addHasPartMapping'].assert_called_once_with(mockMapping, 'testRecord')
processMocks['storePDFManifest'].assert_called_once_with('testRecord')
processMocks['addHasPartMapping'].assert_called_once_with(mockMapping, test_record)
processMocks['storePDFManifest'].assert_called_once_with(test_record)
processMocks['addDCDWToUpdateList'].assert_called_once_with(mockMapping)

def test_processlocRecord_error(self, mocker):
Expand Down

0 comments on commit 4d7a0aa

Please sign in to comment.