Merge pull request #285 from NYPL/SFR-1775_UofMichiganIngestion

SFR-1775_University Michigan Ingestion
NYPL · Jan 31, 2024 · 016ddc8 · 016ddc8
2 parents 3a31c81 + bb3e743
commit 016ddc8
Show file tree

Hide file tree

Showing 10 changed files with 6,544 additions and 6 deletions.
diff --git a/.gitignore b/.gitignore
@@ -13,4 +13,4 @@ config/local*
 tags
 # Vim
 *.swp
-/*venv*
+/*venv*
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,14 +1,14 @@
 # CHANGELOG
-## unreleased version -- v0.12.4
-## Added
-New /fulfill endpoint with ability to check for NYPL login in Bearer authorization header
-Fulfill endpoint returns pre-signed URLs for objects in private buckets when user is logged in
 
 ## unreleased version -- v0.12.4
 ## Added
 - New script to add nypl_login flag to Links objects
 - Added nypl_login flag to nypl mapping
 - New APIUtils method to generate a presigned url for S3 actions
+- New /fulfill endpoint with ability to check for NYPL login in Bearer authorization header
+- Fulfill endpoint returns pre-signed URLs for objects in private buckets when user is logged in
+- Added new University of Michigan process and mapping for ingestion
+- New directory for test JSON files that will be ingested
 ## Fixed
 - NYPL records not being added due to SQLAlchemy error
 - Bardo CCE API and Hathi DataFiles URL updated

diff --git a/ingestJSONFiles/UofM_CSV.json b/ingestJSONFiles/UofM_CSV.json
diff --git a/chicagoISAC_metadata.json → ingestJSONFiles/chicagoISAC_metadata.json b/chicagoISAC_metadata.json → ingestJSONFiles/chicagoISAC_metadata.json
diff --git a/mappings/UofM.py b/mappings/UofM.py
@@ -0,0 +1,81 @@
+from .json import JSONMapping
+import logging
+
+class UofMMapping(JSONMapping):
+    def __init__(self, source):
+        super().__init__(source, {})
+        self.mapping = self.createMapping() 
+
+    def createMapping(self):
+        return {
+            'title': ('Title', '{0}'),
+            'authors': ('Author(s)', '{0}'),
+            'dates': [('Pub Date', '{0}|publication_date')],
+            'publisher': [('Publisher (from Projects)', '{0}||')],
+            'identifiers': [
+                ('ISBN', '{0}|isbn'),
+                ('OCLC', '{0}|oclc')
+            ],
+            'contributors': [('Contributors', '{0}|||contributor')],
+            'subjects': ('Subject 1', '{0}'),
+        }
+
+    def applyFormatting(self):
+        self.record.has_part = []
+        self.record.spatial = 'Michigan'
+        self.record.source = 'UofM'
+
+        if self.record.authors:
+            self.record.authors = self.formatAuthors()
+
+        if self.record.subjects:
+            self.record.subjects = self.formatSubjects()
+
+        if self.record.identifiers:
+            if len(self.record.identifiers) == 1:
+                source_id = self.record.identifiers[0].split('|')[0]
+            else:
+                source_id = self.record.identifiers[1].split('|')[0]
+
+            self.record.source_id = f'UofM_{source_id}'
+            self.record.identifiers = self.formatIdentifiers()
+
+    def formatAuthors(self):
+        authorList = []
+
+        if ';' in self.record.authors:
+            authorList = self.record.authors.split('; ')
+            newAuthorList = [f'{author}|||true' for author in authorList] 
+            return newAuthorList
+        else:
+            authorList.append(f'{self.record.authors}|||true)')
+            return authorList
+
+    def formatSubjects(self):
+        subjectList = []
+
+        if '|' in self.record.subjects:
+            subjectList = self.record.subjects.split('|')
+            newSubjectList = [f'{subject}||' for subject in subjectList] 
+            return newSubjectList
+        else:
+            subjectList.append(f'{self.record.subjects}||')
+            return subjectList
+
+    def formatIdentifiers(self):
+        if 'isbn' in self.record.identifiers[0]:
+            isbnString = self.record.identifiers[0].split('|')[0]
+            if ';' in isbnString:
+                isbnList = isbnString.split('; ')
+                newISBNList = [f'{isbn}|isbn' for isbn in isbnList]
+                if len(self.record.identifiers) > 1 and 'oclc' in self.record.identifiers[1]:
+                    newISBNList.append(f'{self.record.identifiers[1]}')
+                    return newISBNList
+                else:
+                    return newISBNList
+
+        return self.record.identifiers
+
+
+
+
diff --git a/processes/UofM.py b/processes/UofM.py
@@ -0,0 +1,159 @@
+import json
+import os
+from requests.exceptions import HTTPError, ConnectionError
+from botocore.exceptions import ClientError
+
+from .core import CoreProcess
+from urllib.error import HTTPError
+from mappings.core import MappingError
+from mappings.UofM import UofMMapping
+from managers import WebpubManifest
+from logger import createLog
+
+logger = createLog(__name__)
+
+class UofMProcess(CoreProcess):
+
+    def __init__(self, *args):
+        super(UofMProcess, self).__init__(*args[:4])
+
+        self.ingestOffset = int(args[5] or 0)
+        self.ingestLimit = (int(args[4]) + self.ingestOffset) if args[4] else 5000
+        self.fullImport = self.process == 'complete' 
+
+        # Connect to database
+        self.generateEngine()
+        self.createSession()
+
+        # S3 Configuration
+        self.s3Bucket = os.environ['FILE_BUCKET']
+        self.createS3Client()
+
+    def runProcess(self):
+        with open('ingestJSONFiles/UofM_CSV.json') as f:
+                UofMData = json.load(f)
+
+        for i in range(0, len(UofMData['data'])):
+            metaDict = UofMData['data'][i]
+            self.processUofMRecord(metaDict)
+
+        self.saveRecords()
+        self.commitChanges()
+
+    def processUofMRecord(self, record):
+        try:
+            UofMRec = UofMMapping(record)
+            UofMRec.applyMapping()
+            self.addHasPartMapping(record, UofMRec.record)
+            self.storePDFManifest(UofMRec.record)
+            self.addDCDWToUpdateList(UofMRec)
+
+        except (MappingError, HTTPError, ConnectionError, IndexError, TypeError) as e:
+            logger.exception(e)
+            logger.warn(UofMError('Unable to process UofM record'))
+
+    def addHasPartMapping(self, resultsRecord, record):
+        bucket = 'ump-pdf-repository'
+
+        try:
+            #The get_object method is to make sure the object with a specific bucket and key exists in S3
+            self.s3Client.get_object(Bucket=bucket, 
+                                    Key=f'{resultsRecord["File ID 1"]}_060pct.pdf')
+            key = f'{resultsRecord["File ID 1"]}_060pct.pdf'
+            urlPDFObject = f'https://{bucket}.s3.amazonaws.com/{key}'
+
+            linkString = '|'.join([
+                '1',
+                urlPDFObject,
+                'UofM',
+                'application/pdf',
+                '{"catalog": false, "download": true, "reader": false, "embed": false}'
+            ])
+            record.has_part.append(linkString)
+
+        except ClientError or Exception or HTTPError as err:
+            if err.response['Error']['Code'] == 'NoSuchKey':
+                logger.info(UofMError("Key doesn't exist"))
+            else: 
+                logger.info(UofMError("Object doesn't exist"))
+
+        if not record.has_part:
+            try:
+                #The get_object method is to make sure the object with a specific bucket and key exists in S3
+                self.s3Client.get_object(Bucket= 'ump-pdf-repository', 
+                                        Key= f'{resultsRecord["File ID 1"]}_100pct.pdf')
+                key = f'{resultsRecord["File ID 1"]}_100pct.pdf'
+                urlPDFObject = f'https://{bucket}.s3.amazonaws.com/{key}'
+
+                linkString = '|'.join([
+                    '1',
+                    urlPDFObject,
+                    'UofM',
+                    'application/pdf',
+                    '{"catalog": false, "download": true, "reader": false, "embed": false}'
+                ])
+                record.has_part.append(linkString)
+
+            except ClientError or Exception or HTTPError as err:
+                if err.response['Error']['Code'] == 'NoSuchKey':
+                    logger.info(UofMError("Key doesn't exist"))
+                else: 
+                    logger.info(UofMError("Object doesn't exist"))
+
+
+
+    def storePDFManifest(self, record):
+        for link in record.has_part:
+            itemNo, uri, source, mediaType, flags = link.split('|')
+
+            if mediaType == 'application/pdf':
+                logger.warn(f'Identifiers: {record.identifiers}')
+                recordID = record.identifiers[0].split('|')[0]
+
+                manifestPath = 'manifests/{}/{}.json'.format(source, recordID)
+                manifestURI = 'https://{}.s3.amazonaws.com/{}'.format(
+                    self.s3Bucket, manifestPath
+                )
+
+                manifestJSON = self.generateManifest(record, uri, manifestURI)
+
+                self.createManifestInS3(manifestPath, manifestJSON)
+
+                linkString = '|'.join([
+                    itemNo,
+                    manifestURI,
+                    source,
+                    'application/webpub+json',
+                    '{"catalog": false, "download": false, "reader": true, "embed": false}'
+                ])
+
+                record.has_part.insert(0, linkString)
+                break
+
+    def createManifestInS3(self, manifestPath, manifestJSON):
+        self.putObjectInBucket(
+            manifestJSON.encode('utf-8'), manifestPath, self.s3Bucket
+        )
+
+    @staticmethod
+    def generateManifest(record, sourceURI, manifestURI):
+        manifest = WebpubManifest(sourceURI, 'application/pdf')
+
+        manifest.addMetadata(
+            record,
+            conformsTo=os.environ['WEBPUB_PDF_PROFILE']
+        )
+
+        manifest.addChapter(sourceURI, record.title)
+
+        manifest.links.append({
+            'rel': 'self',
+            'href': manifestURI,
+            'type': 'application/webpub+json'
+        })
+
+        return manifest.toJson()
+
+
+class UofMError(Exception):
+    pass
diff --git a/processes/__init__.py b/processes/__init__.py
@@ -17,3 +17,4 @@
 from .chicagoISAC import ChicagoISACProcess
 from .UofSC import UofSCProcess
 from .loc import LOCProcess
+from .UofM import UofMProcess
diff --git a/processes/chicagoISAC.py b/processes/chicagoISAC.py
@@ -28,7 +28,7 @@ def __init__(self, *args):
         self.createS3Client()
 
     def runProcess(self):
-        with open('chicagoISAC_metadata.json') as f:
+        with open('ingestJSONFiles/chicagoISAC_metadata.json') as f:
                 chicagoISACData = json.load(f)
 
         for metaDict in chicagoISACData:

diff --git a/tests/unit/test_UofM_mapping.py b/tests/unit/test_UofM_mapping.py
@@ -0,0 +1,49 @@
+import pytest
+
+from mappings.UofM import UofMMapping
+
+
+class TestUofMMapping:
+    @pytest.fixture
+    def testMapping(self):
+        class TestUofMMapping(UofMMapping):
+            def __init__(self):
+                self.mapping = None
+
+        return TestUofMMapping()
+
+    @pytest.fixture
+    def testRecordStandard(self, mocker):
+        return mocker.MagicMock(
+            title='testTitle',
+            authors=['testAuthor|||true'],
+            dates=['testDate|publication_date'],
+            publisher=['testPublisher||'],
+            identifiers=['testISBN|isbn', 'testOCLC|oclc'],
+            contributor=['testContributor|||contributor'],
+            subjects='testSubject'
+        )
+
+    def test_createMapping(self, testMapping):
+        recordMapping = testMapping.createMapping()
+
+        assert list(recordMapping.keys()) == [
+            'title', 'authors', 'dates', 'publisher',
+            'identifiers', 'contributors', 'subjects'
+        ]
+        assert recordMapping['title'] == ('Title', '{0}')
+
+    def test_applyFormatting_standard(self, testMapping, testRecordStandard):
+        testMapping.record = testRecordStandard
+
+        testMapping.applyFormatting()
+
+        assert testMapping.record.has_part == []
+        assert testMapping.record.source == 'UofM'
+        assert testMapping.record.identifiers == ['testISBN|isbn', 'testOCLC|oclc']
+        assert testMapping.record.source_id == 'UofM_testOCLC'
+        assert testMapping.record.publisher == ['testPublisher||']
+        assert testMapping.record.spatial == 'Michigan'
+        assert testMapping.record.subjects == ['testSubject||']
+
+