diff --git a/mappings/loc.py b/mappings/loc.py index 739ddcfb2f..df6bdfc681 100644 --- a/mappings/loc.py +++ b/mappings/loc.py @@ -33,9 +33,9 @@ def applyFormatting(self): self.record.source = 'loc' if self.record.medium: self.record.medium = self.record.medium[0] - if len(self.record.is_part_of) == 0: + if self.record.is_part_of and len(self.record.is_part_of) == 0: self.record.is_part_of = None - if len(self.record.abstract) == 0: + if self.record.abstract and len(self.record.abstract) == 0: self.record.abstract = None #Convert string repr of list to actual list @@ -62,6 +62,9 @@ def formatIdentifierSourceID(self, itemList): lccnNumber = self.record.identifiers[0][0] #lccnNumber comes in as an array and we need the string inside the array sourceID = lccnNumber if 'call_number' in newIdentifier.keys(): + if not isinstance(newIdentifier['call_number'], list): + newIdentifier['call_number'] = list(newIdentifier['call_number']) + newIdentifier['call_number'][0] = f'{newIdentifier["call_number"][0]}|call_number' callNumber = newIdentifier['call_number'][0].strip(' ') else: @@ -77,7 +80,7 @@ def formatPubSpatial(self, itemList): if ':' not in elem: createdPublishedList = elem.split(',', 1) pubLocation = createdPublishedList[0].strip(' ') - if ',' in createdPublishedList[1]: + if len(createdPublishedList) >= 2 and ',' in createdPublishedList[1]: pubOnly = createdPublishedList[1].split(',')[0].strip(' ') pubArray.append(pubOnly) spatialString = pubLocation diff --git a/processes/loc.py b/processes/loc.py index 59a4b22307..8fb5c3be10 100644 --- a/processes/loc.py +++ b/processes/loc.py @@ -51,6 +51,8 @@ def runProcess(self): self.saveRecords() self.commitChanges() + + logger.info(f'Ingested {len(self.records)} LOC records') def importLOCRecords(self, startTimeStamp=None): @@ -83,6 +85,9 @@ def importOpenAccessRecords(self, count, customTimeStamp): # An HTTP error will occur when the sp parameter value # passes the last page number of the collection search reuslts while sp < 100000: + if self.ingestLimit and count >= self.ingestLimit: + break + openAccessURL = '{}&sp={}'.format(LOC_ROOT_OPEN_ACCESS, sp) jsonData = self.fetchPageJSON(openAccessURL) LOCData = jsonData.json() @@ -129,6 +134,9 @@ def importDigitizedRecords(self, count, customTimeStamp): # An HTTP error will occur when the sp parameter value # passes the last page number of the collection search reuslts while sp < 100000: + if self.ingestLimit and count >= self.ingestLimit: + break + digitizedURL = '{}&sp={}'.format(LOC_ROOT_DIGIT, sp) jsonData = self.fetchPageJSON(digitizedURL) LOCData = jsonData.json() @@ -170,14 +178,17 @@ def processLOCRecord(self, record): try: LOCRec = LOCMapping(record) LOCRec.applyMapping() + + if LOCRec.record.authors is None: + logger.warning(f'Unable to map author in LOC record {LOCRec.record} ') + return + self.addHasPartMapping(record, LOCRec.record) self.storePDFManifest(LOCRec.record) self.storeEpubsInS3(LOCRec.record) self.addDCDWToUpdateList(LOCRec) - - except (MappingError, HTTPError, ConnectionError, IndexError, TypeError) as e: - logger.exception(e) - logger.warn(LOCError('Unable to process LOC record')) + except Exception: + logger.exception(f'Unable to process LOC record') def addHasPartMapping(self, resultsRecord, record): if 'pdf' in resultsRecord['resources'][0].keys(): diff --git a/tests/unit/test_loc_process.py b/tests/unit/test_loc_process.py index 025beb13c9..83240ff214 100644 --- a/tests/unit/test_loc_process.py +++ b/tests/unit/test_loc_process.py @@ -1,6 +1,7 @@ import pytest from mappings.core import MappingError +from model import Record from processes.loc import LOCProcess from tests.helper import TestHelpers @@ -49,7 +50,10 @@ def test_processLOCRecord_success(self, testProcess, mocker): addDCDWToUpdateList=mocker.DEFAULT ) - mockMapping = mocker.MagicMock(record='testRecord') + test_record = Record(authors=[]) + + mockMapping = mocker.MagicMock() + mockMapping.record = test_record mockMapper = mocker.patch('processes.loc.LOCMapping') mockMapper.return_value = mockMapping @@ -57,8 +61,8 @@ def test_processLOCRecord_success(self, testProcess, mocker): mockMapping.applyMapping.assert_called_once() - processMocks['addHasPartMapping'].assert_called_once_with(mockMapping, 'testRecord') - processMocks['storePDFManifest'].assert_called_once_with('testRecord') + processMocks['addHasPartMapping'].assert_called_once_with(mockMapping, test_record) + processMocks['storePDFManifest'].assert_called_once_with(test_record) processMocks['addDCDWToUpdateList'].assert_called_once_with(mockMapping) def test_processlocRecord_error(self, mocker):