Skip to content

Commit

Permalink
Merge pull request #285 from NYPL/SFR-1775_UofMichiganIngestion
Browse files Browse the repository at this point in the history
SFR-1775_University Michigan Ingestion
  • Loading branch information
mitri-slory authored Jan 31, 2024
2 parents 3a31c81 + bb3e743 commit 016ddc8
Show file tree
Hide file tree
Showing 10 changed files with 6,544 additions and 6 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@ config/local*
tags
# Vim
*.swp
/*venv*
/*venv*
8 changes: 4 additions & 4 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
# CHANGELOG
## unreleased version -- v0.12.4
## Added
New /fulfill endpoint with ability to check for NYPL login in Bearer authorization header
Fulfill endpoint returns pre-signed URLs for objects in private buckets when user is logged in

## unreleased version -- v0.12.4
## Added
- New script to add nypl_login flag to Links objects
- Added nypl_login flag to nypl mapping
- New APIUtils method to generate a presigned url for S3 actions
- New /fulfill endpoint with ability to check for NYPL login in Bearer authorization header
- Fulfill endpoint returns pre-signed URLs for objects in private buckets when user is logged in
- Added new University of Michigan process and mapping for ingestion
- New directory for test JSON files that will be ingested
## Fixed
- NYPL records not being added due to SQLAlchemy error
- Bardo CCE API and Hathi DataFiles URL updated
Expand Down
6,142 changes: 6,142 additions & 0 deletions ingestJSONFiles/UofM_CSV.json

Large diffs are not rendered by default.

File renamed without changes.
81 changes: 81 additions & 0 deletions mappings/UofM.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
from .json import JSONMapping
import logging

class UofMMapping(JSONMapping):
    """Mapping of a University of Michigan source row onto a record.

    ``createMapping`` declares which source fields feed which record
    attributes; ``applyFormatting`` then rewrites the mapped values into the
    pipe-delimited strings used on the record.
    """

    def __init__(self, source):
        """Initialise the base mapping with ``source`` and the UofM field map."""
        super().__init__(source, {})
        self.mapping = self.createMapping()

    def createMapping(self):
        """Return the record-attribute -> (source field, format string) map."""
        return {
            'title': ('Title', '{0}'),
            'authors': ('Author(s)', '{0}'),
            'dates': [('Pub Date', '{0}|publication_date')],
            'publisher': [('Publisher (from Projects)', '{0}||')],
            'identifiers': [
                ('ISBN', '{0}|isbn'),
                ('OCLC', '{0}|oclc')
            ],
            'contributors': [('Contributors', '{0}|||contributor')],
            'subjects': ('Subject 1', '{0}'),
        }

    def applyFormatting(self):
        """Set the fixed UofM fields and normalise authors, subjects and ids.

        ``source_id`` is derived from the identifiers *before* they are
        expanded by ``formatIdentifiers``: the only identifier when there is
        exactly one, otherwise the second one.
        """
        record = self.record
        record.has_part = []
        record.spatial = 'Michigan'
        record.source = 'UofM'

        if record.authors:
            record.authors = self.formatAuthors()

        if record.subjects:
            record.subjects = self.formatSubjects()

        if record.identifiers:
            index = 0 if len(record.identifiers) == 1 else 1
            source_id = record.identifiers[index].split('|')[0]

            record.source_id = f'UofM_{source_id}'
            record.identifiers = self.formatIdentifiers()

    def formatAuthors(self):
        """Return the authors as a list of ``'<name>|||true'`` strings.

        A value containing ``;`` is treated as several authors. The split is
        done on the bare semicolon and each name is stripped, so both
        ``'A; B'`` and ``'A;B'`` yield two authors; empty pieces are dropped.
        """
        authors = self.record.authors

        if ';' in authors:
            return [
                f'{name.strip()}|||true'
                for name in authors.split(';')
                if name.strip()
            ]

        # Single author: previously emitted with a stray ')' after 'true'.
        return [f'{authors}|||true']

    def formatSubjects(self):
        """Return the subjects as a list of ``'<subject>||'`` strings.

        A value containing ``|`` is split on it into several subjects.
        """
        subjects = self.record.subjects

        if '|' in subjects:
            return [f'{subject}||' for subject in subjects.split('|')]

        return [f'{subjects}||']

    def formatIdentifiers(self):
        """Expand a multi-ISBN first identifier into one entry per ISBN.

        When the first identifier is an ISBN entry whose value contains
        ``;``, each ISBN becomes its own ``'<isbn>|isbn'`` entry and the
        second identifier is kept only if it is an OCLC entry. In every other
        case the identifiers are returned unchanged.
        """
        identifiers = self.record.identifiers
        first = identifiers[0]

        if 'isbn' in first:
            isbnString = first.split('|')[0]

            if ';' in isbnString:
                # Split on the bare ';' so a missing space does not leave
                # several ISBNs glued together in one entry.
                expanded = [
                    f'{isbn.strip()}|isbn'
                    for isbn in isbnString.split(';')
                    if isbn.strip()
                ]

                if len(identifiers) > 1 and 'oclc' in identifiers[1]:
                    expanded.append(f'{identifiers[1]}')

                return expanded

        return identifiers




159 changes: 159 additions & 0 deletions processes/UofM.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
import json
import os
from requests.exceptions import HTTPError, ConnectionError
from botocore.exceptions import ClientError

from .core import CoreProcess
from urllib.error import HTTPError
from mappings.core import MappingError
from mappings.UofM import UofMMapping
from managers import WebpubManifest
from logger import createLog

logger = createLog(__name__)

class UofMProcess(CoreProcess):
    """Ingest University of Michigan records from a local JSON file.

    Each row is mapped with ``UofMMapping``, linked to a PDF in the
    ``ump-pdf-repository`` bucket when one exists, given a webpub manifest
    stored in the ``FILE_BUCKET`` bucket, and queued for saving.
    """

    # Bucket that is probed for the source PDFs.
    PDF_BUCKET = 'ump-pdf-repository'

    def __init__(self, *args):
        """Set up ingest bounds, the database session and the S3 client.

        The first four positional arguments are forwarded to the parent
        class; ``args[4]`` is an optional ingest limit and ``args[5]`` an
        optional ingest offset.
        """
        super(UofMProcess, self).__init__(*args[:4])

        self.ingestOffset = int(args[5] or 0)
        self.ingestLimit = (int(args[4]) + self.ingestOffset) if args[4] else 5000
        self.fullImport = self.process == 'complete'

        # Connect to database
        self.generateEngine()
        self.createSession()

        # S3 Configuration
        self.s3Bucket = os.environ['FILE_BUCKET']
        self.createS3Client()

    def runProcess(self):
        """Load the UofM JSON file, process every row, then save and commit."""
        with open('ingestJSONFiles/UofM_CSV.json') as f:
            UofMData = json.load(f)

        for metaDict in UofMData['data']:
            self.processUofMRecord(metaDict)

        self.saveRecords()
        self.commitChanges()

    def processUofMRecord(self, record):
        """Map one source row, attach its links and queue it for saving.

        Mapping, HTTP, connection, index and type errors are logged and the
        row is skipped so one bad row does not abort the whole run.
        """
        # NOTE(review): HTTPError is imported from both requests.exceptions
        # and urllib.error at the top of the module; the later import wins,
        # so this clause catches urllib.error.HTTPError -- confirm intent.
        try:
            UofMRec = UofMMapping(record)
            UofMRec.applyMapping()
            self.addHasPartMapping(record, UofMRec.record)
            self.storePDFManifest(UofMRec.record)
            self.addDCDWToUpdateList(UofMRec)

        except (MappingError, HTTPError, ConnectionError, IndexError, TypeError) as e:
            logger.exception(e)
            logger.warning(UofMError('Unable to process UofM record'))

    def addHasPartMapping(self, resultsRecord, record):
        """Append a PDF download link for the row to ``record.has_part``.

        The ``_060pct.pdf`` object named after the row's ``File ID 1`` is
        tried first; the ``_100pct.pdf`` object is tried only when
        ``record.has_part`` is still empty afterwards.
        """
        fileID = resultsRecord["File ID 1"]

        self._appendPDFLink(f'{fileID}_060pct.pdf', record)

        if not record.has_part:
            self._appendPDFLink(f'{fileID}_100pct.pdf', record)

    def _appendPDFLink(self, key, record):
        """Append a PDF link for ``key`` to ``record.has_part`` if it exists in S3."""
        try:
            # The get_object method is to make sure the object with a
            # specific bucket and key exists in S3
            self.s3Client.get_object(Bucket=self.PDF_BUCKET, Key=key)
        except ClientError as err:
            # The original clause, ``ClientError or Exception or HTTPError``,
            # evaluated to ClientError alone; only that type carries the
            # ``response`` payload inspected here.
            if err.response['Error']['Code'] == 'NoSuchKey':
                logger.info(UofMError("Key doesn't exist"))
            else:
                logger.info(UofMError("Object doesn't exist"))
            return

        urlPDFObject = f'https://{self.PDF_BUCKET}.s3.amazonaws.com/{key}'

        linkString = '|'.join([
            '1',
            urlPDFObject,
            'UofM',
            'application/pdf',
            '{"catalog": false, "download": true, "reader": false, "embed": false}'
        ])
        record.has_part.append(linkString)

    def storePDFManifest(self, record):
        """Create a webpub manifest for the first PDF link of ``record``.

        The manifest is uploaded to ``self.s3Bucket`` under
        ``manifests/<source>/<first identifier>.json`` and a reader link to
        it is inserted at the front of ``record.has_part``.
        """
        for link in record.has_part:
            itemNo, uri, source, mediaType, flags = link.split('|')

            if mediaType == 'application/pdf':
                logger.warning(f'Identifiers: {record.identifiers}')
                recordID = record.identifiers[0].split('|')[0]

                manifestPath = 'manifests/{}/{}.json'.format(source, recordID)
                manifestURI = 'https://{}.s3.amazonaws.com/{}'.format(
                    self.s3Bucket, manifestPath
                )

                manifestJSON = self.generateManifest(record, uri, manifestURI)

                self.createManifestInS3(manifestPath, manifestJSON)

                linkString = '|'.join([
                    itemNo,
                    manifestURI,
                    source,
                    'application/webpub+json',
                    '{"catalog": false, "download": false, "reader": true, "embed": false}'
                ])

                # Safe to mutate the list here: iteration stops right after.
                record.has_part.insert(0, linkString)
                break

    def createManifestInS3(self, manifestPath, manifestJSON):
        """Upload the UTF-8 encoded manifest to ``self.s3Bucket``."""
        self.putObjectInBucket(
            manifestJSON.encode('utf-8'), manifestPath, self.s3Bucket
        )

    @staticmethod
    def generateManifest(record, sourceURI, manifestURI):
        """Build the webpub manifest JSON for a PDF.

        The manifest gets the record metadata (conforming to the profile in
        ``WEBPUB_PDF_PROFILE``), one chapter pointing at ``sourceURI`` and a
        ``self`` link to ``manifestURI``.
        """
        manifest = WebpubManifest(sourceURI, 'application/pdf')

        manifest.addMetadata(
            record,
            conformsTo=os.environ['WEBPUB_PDF_PROFILE']
        )

        manifest.addChapter(sourceURI, record.title)

        manifest.links.append({
            'rel': 'self',
            'href': manifestURI,
            'type': 'application/webpub+json'
        })

        return manifest.toJson()


class UofMError(Exception):
    """Error type wrapped around messages logged by the UofM process."""
1 change: 1 addition & 0 deletions processes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@
from .chicagoISAC import ChicagoISACProcess
from .UofSC import UofSCProcess
from .loc import LOCProcess
from .UofM import UofMProcess
2 changes: 1 addition & 1 deletion processes/chicagoISAC.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def __init__(self, *args):
self.createS3Client()

def runProcess(self):
with open('chicagoISAC_metadata.json') as f:
with open('ingestJSONFiles/chicagoISAC_metadata.json') as f:
chicagoISACData = json.load(f)

for metaDict in chicagoISACData:
Expand Down
49 changes: 49 additions & 0 deletions tests/unit/test_UofM_mapping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import pytest

from mappings.UofM import UofMMapping


class TestUofMMapping:
    """Unit tests for the UofM mapping and its formatting helpers."""

    @pytest.fixture
    def testMapping(self):
        # Subclass that skips the real constructor so no source is needed.
        class TestUofMMapping(UofMMapping):
            def __init__(self):
                self.mapping = None

        return TestUofMMapping()

    @pytest.fixture
    def testRecordStandard(self, mocker):
        # authors and subjects are raw strings here: their mappings are
        # single tuples, and applyFormatting is what turns them into lists.
        return mocker.MagicMock(
            title='testTitle',
            authors='testAuthor',
            dates=['testDate|publication_date'],
            publisher=['testPublisher||'],
            identifiers=['testISBN|isbn', 'testOCLC|oclc'],
            contributors=['testContributor|||contributor'],
            subjects='testSubject'
        )

    def test_createMapping(self, testMapping):
        recordMapping = testMapping.createMapping()

        assert list(recordMapping.keys()) == [
            'title', 'authors', 'dates', 'publisher',
            'identifiers', 'contributors', 'subjects'
        ]
        assert recordMapping['title'] == ('Title', '{0}')

    def test_applyFormatting_standard(self, testMapping, testRecordStandard):
        testMapping.record = testRecordStandard

        testMapping.applyFormatting()

        assert testMapping.record.has_part == []
        assert testMapping.record.source == 'UofM'
        assert testMapping.record.identifiers == ['testISBN|isbn', 'testOCLC|oclc']
        assert testMapping.record.source_id == 'UofM_testOCLC'
        assert testMapping.record.publisher == ['testPublisher||']
        assert testMapping.record.spatial == 'Michigan'
        assert testMapping.record.subjects == ['testSubject||']

    def test_formatAuthors_multiple(self, testMapping, mocker):
        testMapping.record = mocker.MagicMock(authors='authorOne; authorTwo')

        assert testMapping.formatAuthors() == [
            'authorOne|||true', 'authorTwo|||true'
        ]

    def test_formatSubjects_multiple(self, testMapping, mocker):
        testMapping.record = mocker.MagicMock(subjects='subjectOne|subjectTwo')

        assert testMapping.formatSubjects() == ['subjectOne||', 'subjectTwo||']

    def test_formatIdentifiers_multipleISBN(self, testMapping, mocker):
        testMapping.record = mocker.MagicMock(
            identifiers=['isbnOne; isbnTwo|isbn', 'testOCLC|oclc']
        )

        assert testMapping.formatIdentifiers() == [
            'isbnOne|isbn', 'isbnTwo|isbn', 'testOCLC|oclc'
        ]


Loading

0 comments on commit 016ddc8

Please sign in to comment.