-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #285 from NYPL/SFR-1775_UofMichiganIngestion
SFR-1775_University Michigan Ingestion
- Loading branch information
Showing
10 changed files
with
6,544 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,4 +13,4 @@ config/local* | |
tags | ||
# Vim | ||
*.swp | ||
/*venv* | ||
/*venv* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
from .json import JSONMapping | ||
import logging | ||
|
||
class UofMMapping(JSONMapping):
    """Mapping from a University of Michigan source record to record fields.

    The field mapping is built by ``createMapping`` and the mapped values are
    normalised by ``applyFormatting``, which rewrites authors, subjects and
    identifiers into the pipe-delimited strings used on ``self.record``.
    """

    def __init__(self, source):
        """Initialise the base mapping with ``source`` and install the mapping.

        Args:
            source: The raw source record handed to ``JSONMapping``.
        """
        super().__init__(source, {})
        self.mapping = self.createMapping()

    def createMapping(self):
        """Return the dict of record field -> (source column, format string)."""
        return {
            'title': ('Title', '{0}'),
            'authors': ('Author(s)', '{0}'),
            'dates': [('Pub Date', '{0}|publication_date')],
            'publisher': [('Publisher (from Projects)', '{0}||')],
            'identifiers': [
                ('ISBN', '{0}|isbn'),
                ('OCLC', '{0}|oclc')
            ],
            'contributors': [('Contributors', '{0}|||contributor')],
            'subjects': ('Subject 1', '{0}'),
        }

    def applyFormatting(self):
        """Set fixed source fields and normalise the mapped values in place.

        ``source_id`` is derived from the raw identifiers *before* they are
        reformatted: the only identifier when there is exactly one, otherwise
        the second one.
        """
        self.record.has_part = []
        self.record.spatial = 'Michigan'
        self.record.source = 'UofM'

        if self.record.authors:
            self.record.authors = self.formatAuthors()

        if self.record.subjects:
            self.record.subjects = self.formatSubjects()

        if self.record.identifiers:
            identifiers = self.record.identifiers
            # NOTE(review): the mapping lists ISBN then OCLC, so index 1 is
            # presumably the OCLC number -- confirm against JSONMapping.
            chosen = identifiers[0] if len(identifiers) == 1 else identifiers[1]
            source_id = chosen.split('|')[0]

            self.record.source_id = f'UofM_{source_id}'
            self.record.identifiers = self.formatIdentifiers()

    def formatAuthors(self):
        """Return the authors as a list of ``'<name>|||true'`` strings.

        A value containing ``;`` is split into one entry per author;
        surrounding whitespace is trimmed and empty segments are dropped.
        """
        authors = self.record.authors

        if ';' in authors:
            names = (name.strip() for name in authors.split(';'))
            return [f'{name}|||true' for name in names if name]

        # Single author: the original emitted a stray ')' after 'true'.
        return [f'{authors}|||true']

    def formatSubjects(self):
        """Return the subjects as a list of ``'<subject>||'`` strings.

        A value containing ``|`` is split into one entry per subject.
        """
        subjects = self.record.subjects

        if '|' in subjects:
            return [f'{subject}||' for subject in subjects.split('|')]

        return [f'{subjects}||']

    def formatIdentifiers(self):
        """Expand a ``;``-separated ISBN entry into one identifier per ISBN.

        Returns:
            The identifier list unchanged unless its first entry is an ISBN
            entry holding several ``;``-separated values. In that case a new
            list with one ``'<isbn>|isbn'`` entry per value is returned,
            followed by the second identifier when it is an OCLC entry.
        """
        identifiers = self.record.identifiers
        first = identifiers[0]

        if 'isbn' not in first:
            return identifiers

        isbnString = first.split('|')[0]
        if ';' not in isbnString:
            return identifiers

        isbns = (isbn.strip() for isbn in isbnString.split(';'))
        formatted = [f'{isbn}|isbn' for isbn in isbns if isbn]

        if len(identifiers) > 1 and 'oclc' in identifiers[1]:
            formatted.append(f'{identifiers[1]}')

        return formatted
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,159 @@ | ||
import json | ||
import os | ||
from requests.exceptions import HTTPError, ConnectionError | ||
from botocore.exceptions import ClientError | ||
|
||
from .core import CoreProcess | ||
from urllib.error import HTTPError | ||
from mappings.core import MappingError | ||
from mappings.UofM import UofMMapping | ||
from managers import WebpubManifest | ||
from logger import createLog | ||
|
||
logger = createLog(__name__) | ||
|
||
class UofMProcess(CoreProcess):
    """Ingest University of Michigan records from a local JSON file.

    Each entry is mapped with ``UofMMapping``, linked to a PDF in the
    ``ump-pdf-repository`` bucket when one exists, given a webpub manifest
    stored in the bucket named by ``FILE_BUCKET``, and queued for saving.
    """

    # Bucket holding the source PDFs.
    PDF_BUCKET = 'ump-pdf-repository'

    def __init__(self, *args):
        """Set up ingest bounds, the database session and the S3 client.

        Args:
            *args: The first four are forwarded to ``CoreProcess``; ``args[4]``
                is the ingest limit and ``args[5]`` the ingest offset (either
                may be falsy).
        """
        super(UofMProcess, self).__init__(*args[:4])

        self.ingestOffset = int(args[5] or 0)
        self.ingestLimit = (int(args[4]) + self.ingestOffset) if args[4] else 5000
        self.fullImport = self.process == 'complete'

        # Connect to database
        self.generateEngine()
        self.createSession()

        # S3 Configuration
        self.s3Bucket = os.environ['FILE_BUCKET']
        self.createS3Client()

    def runProcess(self):
        """Process every entry under ``data`` in the JSON file, then persist."""
        with open('ingestJSONFiles/UofM_CSV.json') as f:
            UofMData = json.load(f)

        for metaDict in UofMData['data']:
            self.processUofMRecord(metaDict)

        self.saveRecords()
        self.commitChanges()

    def processUofMRecord(self, record):
        """Map one source record, attach its links and queue it for update.

        Mapping, connection, index and type errors are logged and the record
        is skipped so that one bad record does not stop the run.
        """
        try:
            UofMRec = UofMMapping(record)
            UofMRec.applyMapping()
            self.addHasPartMapping(record, UofMRec.record)
            self.storePDFManifest(UofMRec.record)
            self.addDCDWToUpdateList(UofMRec)

        # NOTE(review): the file imports HTTPError from requests.exceptions
        # and then from urllib.error; the later import wins, so this clause
        # catches urllib's HTTPError only.
        except (MappingError, HTTPError, ConnectionError, IndexError, TypeError) as e:
            logger.exception(e)
            logger.warning(UofMError('Unable to process UofM record'))

    def addHasPartMapping(self, resultsRecord, record):
        """Append a PDF download link to ``record.has_part`` if a PDF exists.

        The ``_060pct.pdf`` object is tried first; the ``_100pct.pdf`` object
        is tried only when the record still has no links afterwards.

        Args:
            resultsRecord: Source dict; its ``"File ID 1"`` value is the key
                prefix of the PDF objects.
            record: Mapped record whose ``has_part`` list is extended.
        """
        self._addPDFLink(resultsRecord, record, '_060pct.pdf')

        if not record.has_part:
            self._addPDFLink(resultsRecord, record, '_100pct.pdf')

    def _addPDFLink(self, resultsRecord, record, suffix):
        """Add a link for ``<File ID 1><suffix>`` when that object is in S3."""
        bucket = self.PDF_BUCKET

        try:
            key = f'{resultsRecord["File ID 1"]}{suffix}'
            # The get_object call only verifies that the object exists in S3
            self.s3Client.get_object(Bucket=bucket, Key=key)
        # The original `except ClientError or Exception or HTTPError`
        # evaluated to ClientError alone; spell out what is really caught.
        except ClientError as err:
            if err.response['Error']['Code'] == 'NoSuchKey':
                logger.info(UofMError("Key doesn't exist"))
            else:
                logger.info(UofMError("Object doesn't exist"))
            return

        urlPDFObject = f'https://{bucket}.s3.amazonaws.com/{key}'

        linkString = '|'.join([
            '1',
            urlPDFObject,
            'UofM',
            'application/pdf',
            '{"catalog": false, "download": true, "reader": false, "embed": false}'
        ])
        record.has_part.append(linkString)

    def storePDFManifest(self, record):
        """Create a webpub manifest for the record's first PDF link.

        The manifest is uploaded to ``self.s3Bucket`` and a reader link to it
        is inserted at the front of ``record.has_part``.
        """
        for link in record.has_part:
            itemNo, uri, source, mediaType, flags = link.split('|')

            if mediaType == 'application/pdf':
                logger.warning(f'Identifiers: {record.identifiers}')
                recordID = record.identifiers[0].split('|')[0]

                manifestPath = 'manifests/{}/{}.json'.format(source, recordID)
                manifestURI = 'https://{}.s3.amazonaws.com/{}'.format(
                    self.s3Bucket, manifestPath
                )

                manifestJSON = self.generateManifest(record, uri, manifestURI)

                self.createManifestInS3(manifestPath, manifestJSON)

                linkString = '|'.join([
                    itemNo,
                    manifestURI,
                    source,
                    'application/webpub+json',
                    '{"catalog": false, "download": false, "reader": true, "embed": false}'
                ])

                # Safe to mutate while iterating: the loop exits immediately.
                record.has_part.insert(0, linkString)
                break

    def createManifestInS3(self, manifestPath, manifestJSON):
        """Upload the UTF-8 encoded manifest JSON to ``self.s3Bucket``."""
        self.putObjectInBucket(
            manifestJSON.encode('utf-8'), manifestPath, self.s3Bucket
        )

    @staticmethod
    def generateManifest(record, sourceURI, manifestURI):
        """Build the webpub manifest for a PDF and return it as JSON.

        Args:
            record: Record supplying the manifest metadata and chapter title.
            sourceURI: URI of the PDF the manifest points to.
            manifestURI: URI where the manifest will live (its ``self`` link).
        """
        manifest = WebpubManifest(sourceURI, 'application/pdf')

        manifest.addMetadata(
            record,
            conformsTo=os.environ['WEBPUB_PDF_PROFILE']
        )

        manifest.addChapter(sourceURI, record.title)

        manifest.links.append({
            'rel': 'self',
            'href': manifestURI,
            'type': 'application/webpub+json'
        })

        return manifest.toJson()
|
||
|
||
class UofMError(Exception):
    """Error wrapper used for the messages logged by the UofM process."""
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
import pytest | ||
|
||
from mappings.UofM import UofMMapping | ||
|
||
|
||
class TestUofMMapping:
    """Unit tests for the UofMMapping field mapping and formatting helpers."""

    @pytest.fixture
    def testMapping(self):
        # Subclass whose __init__ skips the parent initialiser, so no source
        # record is needed to build an instance.
        class MockUofMMapping(UofMMapping):
            def __init__(self):
                self.mapping = None

        return MockUofMMapping()

    @pytest.fixture
    def testRecordStandard(self, mocker):
        # authors and subjects are raw strings here because applyFormatting
        # is what turns them into lists.
        return mocker.MagicMock(
            title='testTitle',
            authors='testAuthor; otherAuthor',
            dates=['testDate|publication_date'],
            publisher=['testPublisher||'],
            identifiers=['testISBN|isbn', 'testOCLC|oclc'],
            contributors=['testContributor|||contributor'],
            subjects='testSubject'
        )

    def test_createMapping(self, testMapping):
        recordMapping = testMapping.createMapping()

        assert list(recordMapping.keys()) == [
            'title', 'authors', 'dates', 'publisher',
            'identifiers', 'contributors', 'subjects'
        ]
        assert recordMapping['title'] == ('Title', '{0}')

    def test_applyFormatting_standard(self, testMapping, testRecordStandard):
        testMapping.record = testRecordStandard

        testMapping.applyFormatting()

        assert testMapping.record.has_part == []
        assert testMapping.record.source == 'UofM'
        assert testMapping.record.authors == [
            'testAuthor|||true', 'otherAuthor|||true'
        ]
        assert testMapping.record.identifiers == ['testISBN|isbn', 'testOCLC|oclc']
        assert testMapping.record.source_id == 'UofM_testOCLC'
        assert testMapping.record.publisher == ['testPublisher||']
        assert testMapping.record.spatial == 'Michigan'
        assert testMapping.record.subjects == ['testSubject||']

    def test_formatSubjects_multiple(self, testMapping, mocker):
        testMapping.record = mocker.MagicMock(subjects='first|second')

        assert testMapping.formatSubjects() == ['first||', 'second||']

    def test_formatIdentifiers_multipleISBNs(self, testMapping, mocker):
        testMapping.record = mocker.MagicMock(
            identifiers=['testISBN1; testISBN2|isbn', 'testOCLC|oclc']
        )

        assert testMapping.formatIdentifiers() == [
            'testISBN1|isbn', 'testISBN2|isbn', 'testOCLC|oclc'
        ]
|
||
|
Oops, something went wrong.