From c300bda595a6be5e2ab49dc0f2f1a6ba8bdb5b91 Mon Sep 17 00:00:00 2001 From: kyle Date: Fri, 4 Oct 2024 13:30:16 -0400 Subject: [PATCH 1/2] SFR-2216: Fixing LOC process ingestion --- mappings/loc.py | 9 ++++++--- processes/loc.py | 21 +++++++++++++++++---- tests/unit/test_loc_process.py | 10 +++++++--- 3 files changed, 30 insertions(+), 10 deletions(-) diff --git a/mappings/loc.py b/mappings/loc.py index 739ddcfb2fc..df6bdfc681f 100644 --- a/mappings/loc.py +++ b/mappings/loc.py @@ -33,9 +33,9 @@ def applyFormatting(self): self.record.source = 'loc' if self.record.medium: self.record.medium = self.record.medium[0] - if len(self.record.is_part_of) == 0: + if self.record.is_part_of and len(self.record.is_part_of) == 0: self.record.is_part_of = None - if len(self.record.abstract) == 0: + if self.record.abstract and len(self.record.abstract) == 0: self.record.abstract = None #Convert string repr of list to actual list @@ -62,6 +62,9 @@ def formatIdentifierSourceID(self, itemList): lccnNumber = self.record.identifiers[0][0] #lccnNumber comes in as an array and we need the string inside the array sourceID = lccnNumber if 'call_number' in newIdentifier.keys(): + if not isinstance(newIdentifier['call_number'], list): + newIdentifier['call_number'] = list(newIdentifier['call_number']) + newIdentifier['call_number'][0] = f'{newIdentifier["call_number"][0]}|call_number' callNumber = newIdentifier['call_number'][0].strip(' ') else: @@ -77,7 +80,7 @@ def formatPubSpatial(self, itemList): if ':' not in elem: createdPublishedList = elem.split(',', 1) pubLocation = createdPublishedList[0].strip(' ') - if ',' in createdPublishedList[1]: + if len(createdPublishedList) >= 2 and ',' in createdPublishedList[1]: pubOnly = createdPublishedList[1].split(',')[0].strip(' ') pubArray.append(pubOnly) spatialString = pubLocation diff --git a/processes/loc.py b/processes/loc.py index 59a4b223070..fb4e9965eb1 100644 --- a/processes/loc.py +++ b/processes/loc.py @@ -8,6 +8,7 @@ from managers import WebpubManifest from logger import createLog from datetime import datetime, timedelta, timezone +import traceback logger = createLog(__name__) @@ -51,6 +52,8 @@ def runProcess(self): self.saveRecords() self.commitChanges() + + logger.info(f'Ingested {len(self.records)} LOC records') def importLOCRecords(self, startTimeStamp=None): @@ -83,6 +86,9 @@ def importOpenAccessRecords(self, count, customTimeStamp): # An HTTP error will occur when the sp parameter value # passes the last page number of the collection search reuslts while sp < 100000: + if self.ingestLimit and count >= self.ingestLimit: + break + openAccessURL = '{}&sp={}'.format(LOC_ROOT_OPEN_ACCESS, sp) jsonData = self.fetchPageJSON(openAccessURL) LOCData = jsonData.json() @@ -129,6 +135,9 @@ def importDigitizedRecords(self, count, customTimeStamp): # An HTTP error will occur when the sp parameter value # passes the last page number of the collection search reuslts while sp < 100000: + if self.ingestLimit and count >= self.ingestLimit: + break + digitizedURL = '{}&sp={}'.format(LOC_ROOT_DIGIT, sp) jsonData = self.fetchPageJSON(digitizedURL) LOCData = jsonData.json() @@ -170,14 +179,18 @@ def processLOCRecord(self, record): try: LOCRec = LOCMapping(record) LOCRec.applyMapping() + + if LOCRec.record.authors is None: + logger.warning(f'Unable to map author in LOC record {LOCRec.record} ') + return + self.addHasPartMapping(record, LOCRec.record) self.storePDFManifest(LOCRec.record) self.storeEpubsInS3(LOCRec.record) self.addDCDWToUpdateList(LOCRec) - - except (MappingError, HTTPError, ConnectionError, IndexError, TypeError) as e: - logger.exception(e) - logger.warn(LOCError('Unable to process LOC record')) + except Exception: + logger.info(f'{traceback.print_exc()}') + logger.exception(f'Unable to process LOC record') def addHasPartMapping(self, resultsRecord, record): if 'pdf' in resultsRecord['resources'][0].keys(): diff --git a/tests/unit/test_loc_process.py b/tests/unit/test_loc_process.py index 025beb13c9c..83240ff2145 100644 --- a/tests/unit/test_loc_process.py +++ b/tests/unit/test_loc_process.py @@ -1,6 +1,7 @@ import pytest from mappings.core import MappingError +from model import Record from processes.loc import LOCProcess from tests.helper import TestHelpers @@ -49,7 +50,10 @@ def test_processLOCRecord_success(self, testProcess, mocker): addDCDWToUpdateList=mocker.DEFAULT ) - mockMapping = mocker.MagicMock(record='testRecord') + test_record = Record(authors=[]) + + mockMapping = mocker.MagicMock() + mockMapping.record = test_record mockMapper = mocker.patch('processes.loc.LOCMapping') mockMapper.return_value = mockMapping @@ -57,8 +61,8 @@ def test_processLOCRecord_success(self, testProcess, mocker): mockMapping.applyMapping.assert_called_once() - processMocks['addHasPartMapping'].assert_called_once_with(mockMapping, 'testRecord') - processMocks['storePDFManifest'].assert_called_once_with('testRecord') + processMocks['addHasPartMapping'].assert_called_once_with(mockMapping, test_record) + processMocks['storePDFManifest'].assert_called_once_with(test_record) processMocks['addDCDWToUpdateList'].assert_called_once_with(mockMapping) def test_processlocRecord_error(self, mocker): From 7da920dc4f1ff027ca0bbaa957b62da8fd4d7d25 Mon Sep 17 00:00:00 2001 From: kyle Date: Mon, 7 Oct 2024 10:13:09 -0400 Subject: [PATCH 2/2] removing traceback --- processes/loc.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/processes/loc.py b/processes/loc.py index fb4e9965eb1..8fb5c3be104 100644 --- a/processes/loc.py +++ b/processes/loc.py @@ -8,7 +8,6 @@ from managers import WebpubManifest from logger import createLog from datetime import datetime, timedelta, timezone -import traceback logger = createLog(__name__) @@ -189,7 +188,6 @@ def processLOCRecord(self, record): self.storeEpubsInS3(LOCRec.record) self.addDCDWToUpdateList(LOCRec) except Exception: - logger.info(f'{traceback.print_exc()}') logger.exception(f'Unable to process LOC record') def addHasPartMapping(self, resultsRecord, record):