Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SFR-1775_University Michigan Ingestion #285

Merged
merged 5 commits into from
Jan 31, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,6 @@ config/local*
tags
# Vim
*.swp
# Ingest JSON files
mitri-slory marked this conversation as resolved.
Show resolved Hide resolved
UofM_CSV.json
chicagoISAC_metadata.json
5 changes: 2 additions & 3 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
# CHANGELOG
## unreleased version -- v0.12.4
## Added
New /fulfill endpoint with ability to check for NYPL login in Bearer authorization header

## unreleased version -- v0.12.4
## Added
- New script to add nypl_login flag to Links objects
- Added nypl_login flag to nypl mapping
- New APIUtils method to generate a presigned url for S3 actions
- New /fulfill endpoint with ability to check for NYPL login in Bearer authorization header
- Added new University of Michigan process and mapping for ingestion
## Fixed
- NYPL records not being added due to SQLAlchemy error
- Bardo CCE API and Hathi DataFiles URL updated
Expand Down
81 changes: 81 additions & 0 deletions mappings/UofM.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
from .json import JSONMapping
import logging

class UofMMapping(JSONMapping):
    """Mapping from a University of Michigan (UofM) JSON row to a record.

    The field mapping is built by ``createMapping``; ``applyFormatting``
    then normalizes authors, subjects and identifiers on ``self.record``
    and stamps the UofM-specific source fields.
    """

    def __init__(self, source):
        """Initialize the base mapping with an empty mapping dict, then
        install the UofM field mapping.

        Args:
            source: the raw UofM row handed to the base ``JSONMapping``.
        """
        super().__init__(source, {})
        self.mapping = self.createMapping()

    def createMapping(self):
        """Return the dict describing which source columns feed each
        record field and the format string applied to each value."""
        return {
            'title': ('Title', '{0}'),
            'authors': ('Author(s)', '{0}'),
            'dates': [('Pub Date', '{0}|publication_date')],
            'publisher': [('Publisher (from Projects)', '{0}||')],
            'identifiers': [
                ('ISBN', '{0}|isbn'),
                ('OCLC', '{0}|oclc')
            ],
            'contributors': [('Contributors', '{0}|||contributor')],
            'subjects': ('Subject 1', '{0}'),
        }

    def applyFormatting(self):
        """Set the UofM constants on the record and reformat its authors,
        subjects and identifiers in place."""
        self.record.has_part = []
        self.record.spatial = 'Michigan'
        self.record.source = 'UofM'

        if self.record.authors:
            self.record.authors = self.formatAuthors()

        if self.record.subjects:
            self.record.subjects = self.formatSubjects()

        if self.record.identifiers:
            # With a single identifier use it; otherwise use the second
            # entry (the OCLC slot in the mapping order) as the source id.
            # This must run before formatIdentifiers() reshapes the list.
            if len(self.record.identifiers) == 1:
                source_id = self.record.identifiers[0].split('|')[0]
            else:
                source_id = self.record.identifiers[1].split('|')[0]

            self.record.source_id = f'UofM_{source_id}'
            self.record.identifiers = self.formatIdentifiers()

    def formatAuthors(self):
        """Return the authors as a list of ``'<name>|||true'`` strings.

        A semicolon-separated value yields one entry per non-empty name
        (surrounding whitespace removed); any other value yields a single
        entry.
        """
        authors = self.record.authors

        if ';' in authors:
            # Split on the bare delimiter and strip, so 'A;B' and 'A; B'
            # are both recognized as two authors.
            return [
                f'{name.strip()}|||true'
                for name in authors.split(';')
                if name.strip()
            ]

        # Previously this string carried a stray ')' after 'true'.
        return [f'{authors}|||true']

    def formatSubjects(self):
        """Return the subjects as a list of ``'<subject>||'`` strings,
        splitting the raw value on ``'|'`` when present."""
        subjects = self.record.subjects

        if '|' in subjects:
            return [f'{subject}||' for subject in subjects.split('|')]

        return [f'{subjects}||']

    def formatIdentifiers(self):
        """Expand a semicolon-separated ISBN entry into one identifier
        per ISBN.

        Returns:
            When the first identifier is an ISBN entry containing ``';'``:
            a new list with one ``'<isbn>|isbn'`` string per ISBN, followed
            by the second identifier if it is an OCLC entry. Otherwise the
            record's identifiers unchanged.
        """
        identifiers = self.record.identifiers

        if 'isbn' in identifiers[0]:
            isbnString = identifiers[0].split('|')[0]
            if ';' in isbnString:
                expanded = [
                    f'{isbn.strip()}|isbn'
                    for isbn in isbnString.split(';')
                    if isbn.strip()
                ]
                if len(identifiers) > 1 and 'oclc' in identifiers[1]:
                    expanded.append(identifiers[1])
                return expanded

        return identifiers




159 changes: 159 additions & 0 deletions processes/UofM.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
import json
import os
from requests.exceptions import HTTPError, ConnectionError
from botocore.exceptions import ClientError

from .core import CoreProcess
from urllib.error import HTTPError
from mappings.core import MappingError
from mappings.UofM import UofMMapping
from managers import WebpubManifest
from logger import createLog

# Module-level logger, created via the project's createLog helper with this module's name.
logger = createLog(__name__)

class UofMProcess(CoreProcess):
    """Ingest University of Michigan (UofM) records from a local JSON file.

    Each row is mapped with ``UofMMapping``, linked to its PDF in the
    ``ump-pdf-repository`` S3 bucket when one exists, given a webpub
    manifest stored in the bucket named by ``FILE_BUCKET``, and queued for
    saving.
    """

    def __init__(self, *args):
        """Set up ingest bounds, the database session and the S3 client.

        Args:
            *args: the first four values are forwarded to ``CoreProcess``;
                ``args[4]`` is an optional ingest limit and ``args[5]`` an
                optional ingest offset.
        """
        super(UofMProcess, self).__init__(*args[:4])

        self.ingestOffset = int(args[5] or 0)
        self.ingestLimit = (int(args[4]) + self.ingestOffset) if args[4] else 5000
        # self.process is presumably set by CoreProcess.__init__ -- TODO confirm
        self.fullImport = self.process == 'complete'

        # Connect to database
        self.generateEngine()
        self.createSession()

        # S3 Configuration
        self.s3Bucket = os.environ['FILE_BUCKET']
        self.createS3Client()

    def runProcess(self):
        """Load ``UofM_CSV.json``, process up to the first 12 rows, then
        save and commit the resulting records."""
        with open('UofM_CSV.json') as f:
            UofMData = json.load(f)

        # Slice rather than index 0..11 so a file with fewer than 12 rows
        # no longer raises IndexError before anything is saved.
        for metaDict in UofMData['data'][:12]:
            self.processUofMRecord(metaDict)

        self.saveRecords()
        self.commitChanges()

    def processUofMRecord(self, record):
        """Map one raw row, attach its links and manifest, and queue it.

        Mapping, connection, index and type errors are logged and the
        record is skipped instead of aborting the whole run.
        """
        try:
            UofMRec = UofMMapping(record)
            UofMRec.applyMapping()
            self.addHasPartMapping(record, UofMRec.record)
            self.storePDFManifest(UofMRec.record)
            self.addDCDWToUpdateList(UofMRec)

        except (MappingError, HTTPError, ConnectionError, IndexError, TypeError) as e:
            logger.exception(e)
            logger.warning(UofMError('Unable to process UofM record'))

    def addHasPartMapping(self, resultsRecord, record):
        """Append a PDF download link to ``record.has_part``.

        The 60% PDF is tried first; the 100% PDF is tried only when
        ``record.has_part`` is still empty afterwards.

        Args:
            resultsRecord: raw source row; its ``"File ID 1"`` value is the
                PDF key prefix.
            record: mapped record whose ``has_part`` list is extended.
        """
        bucket = 'ump-pdf-repository'

        self._appendPDFLink(resultsRecord, record, bucket, '_060pct.pdf')

        if not record.has_part:
            self._appendPDFLink(resultsRecord, record, bucket, '_100pct.pdf')

    def _appendPDFLink(self, resultsRecord, record, bucket, suffix):
        """Append one PDF link to ``record.has_part`` if the object
        ``<File ID 1><suffix>`` exists in ``bucket``; log when it does not."""
        key = f'{resultsRecord["File ID 1"]}{suffix}'

        try:
            # The get_object method is to make sure the object with a
            # specific bucket and key exists in S3
            self.s3Client.get_object(Bucket=bucket, Key=key)
        except ClientError as err:
            # The original clause `ClientError or Exception or HTTPError`
            # evaluated to ClientError alone, and only ClientError carries
            # the `response` mapping read below.
            if err.response['Error']['Code'] == 'NoSuchKey':
                logger.info(UofMError("Key doesn't exist"))
            else:
                logger.info(UofMError("Object doesn't exist"))
            return

        urlPDFObject = f'https://{bucket}.s3.amazonaws.com/{key}'

        linkString = '|'.join([
            '1',
            urlPDFObject,
            'UofM',
            'application/pdf',
            '{"catalog": false, "download": true, "reader": false, "embed": false}'
        ])
        record.has_part.append(linkString)

    def storePDFManifest(self, record):
        """Create a webpub manifest for the record's first PDF link.

        The manifest is uploaded to ``self.s3Bucket`` and a matching
        ``application/webpub+json`` link is inserted at the front of
        ``record.has_part``.
        """
        for link in record.has_part:
            itemNo, uri, source, mediaType, flags = link.split('|')

            if mediaType == 'application/pdf':
                logger.warning(f'Identifiers: {record.identifiers}')
                recordID = record.identifiers[0].split('|')[0]

                manifestPath = 'manifests/{}/{}.json'.format(source, recordID)
                manifestURI = 'https://{}.s3.amazonaws.com/{}'.format(
                    self.s3Bucket, manifestPath
                )

                manifestJSON = self.generateManifest(record, uri, manifestURI)

                self.createManifestInS3(manifestPath, manifestJSON)

                linkString = '|'.join([
                    itemNo,
                    manifestURI,
                    source,
                    'application/webpub+json',
                    '{"catalog": false, "download": false, "reader": true, "embed": false}'
                ])

                # Mutating the list mid-iteration is safe only because the
                # loop exits immediately afterwards.
                record.has_part.insert(0, linkString)
                break

    def createManifestInS3(self, manifestPath, manifestJSON):
        """Upload the UTF-8 encoded manifest JSON to ``self.s3Bucket`` at
        ``manifestPath``."""
        self.putObjectInBucket(
            manifestJSON.encode('utf-8'), manifestPath, self.s3Bucket
        )

    @staticmethod
    def generateManifest(record, sourceURI, manifestURI):
        """Build a webpub manifest for a PDF and return it serialized.

        Args:
            record: mapped record supplying the metadata and title.
            sourceURI: URL of the PDF, used as the single chapter.
            manifestURI: URL the manifest will live at (its ``self`` link).

        Returns:
            The value of ``WebpubManifest.toJson()``.
        """
        manifest = WebpubManifest(sourceURI, 'application/pdf')

        manifest.addMetadata(
            record,
            conformsTo=os.environ['WEBPUB_PDF_PROFILE']
        )

        manifest.addChapter(sourceURI, record.title)

        manifest.links.append({
            'rel': 'self',
            'href': manifestURI,
            'type': 'application/webpub+json'
        })

        return manifest.toJson()


class UofMError(Exception):
    """Exception type carrying a UofM ingestion problem message.

    Within this module instances are constructed and passed to the logger.
    """
1 change: 1 addition & 0 deletions processes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@
from .chicagoISAC import ChicagoISACProcess
from .UofSC import UofSCProcess
from .loc import LOCProcess
from .UofM import UofMProcess
49 changes: 49 additions & 0 deletions tests/unit/test_UofM_mapping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import pytest

from mappings.UofM import UofMMapping


class TestUofMMapping:
    """Unit tests for UofMMapping's field mapping and record formatting."""

    @pytest.fixture
    def testMapping(self):
        """A UofMMapping whose constructor is bypassed, so no source row
        is required."""
        class TestUofMMapping(UofMMapping):
            def __init__(self):
                self.mapping = None

        return TestUofMMapping()

    @pytest.fixture
    def testRecordStandard(self, mocker):
        """A record as it looks before applyFormatting runs.

        ``authors`` and ``subjects`` are raw strings because their mapping
        entries are single tuples; the keyword is ``contributors`` to match
        the mapping key.
        """
        return mocker.MagicMock(
            title='testTitle',
            authors='testAuthor',
            dates=['testDate|publication_date'],
            publisher=['testPublisher||'],
            identifiers=['testISBN|isbn', 'testOCLC|oclc'],
            contributors=['testContributor|||contributor'],
            subjects='testSubject'
        )

    def test_createMapping(self, testMapping):
        recordMapping = testMapping.createMapping()

        assert list(recordMapping.keys()) == [
            'title', 'authors', 'dates', 'publisher',
            'identifiers', 'contributors', 'subjects'
        ]
        assert recordMapping['title'] == ('Title', '{0}')

    def test_applyFormatting_standard(self, testMapping, testRecordStandard):
        testMapping.record = testRecordStandard

        testMapping.applyFormatting()

        assert testMapping.record.has_part == []
        assert testMapping.record.source == 'UofM'
        assert testMapping.record.identifiers == ['testISBN|isbn', 'testOCLC|oclc']
        assert testMapping.record.source_id == 'UofM_testOCLC'
        assert testMapping.record.publisher == ['testPublisher||']
        assert testMapping.record.spatial == 'Michigan'
        assert testMapping.record.subjects == ['testSubject||']

    def test_formatAuthors_multiple(self, testMapping, mocker):
        # Semicolon-separated authors become one primary-author entry each.
        testMapping.record = mocker.MagicMock(
            authors='testAuthor1; testAuthor2'
        )

        assert testMapping.formatAuthors() == [
            'testAuthor1|||true', 'testAuthor2|||true'
        ]


Loading