Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SFR-2265: Refactored Chicago ISAC mapping and processes #411

Merged
merged 5 commits into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion mappings/chicagoISAC.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ def createMapping(self):
def applyFormatting(self):
self.record.source = 'isac'

# Formatting for multiple ISBNs (comma-separated within the first identifier)
mitri-slory marked this conversation as resolved.
Show resolved Hide resolved
if ',' in self.record.identifiers[0]:
identifierArray = self.record.identifiers[0].split(', ')
updatedIdentifierArray = []
Expand Down
49 changes: 30 additions & 19 deletions processes/ingest/chicago_isac.py
mitri-slory marked this conversation as resolved.
Show resolved Hide resolved
mitri-slory marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -1,33 +1,40 @@
import json
import os
from requests.exceptions import HTTPError, ConnectionError

from ..core import CoreProcess
from mappings.core import MappingError
from mappings.chicagoISAC import ChicagoISACMapping
from managers import WebpubManifest
from managers import DBManager, ElasticsearchManager, RedisManager, S3Manager, WebpubManifest
from logger import createLog

logger = createLog(__name__)

class ChicagoISACProcess(CoreProcess):

def __init__(self, *args):
super(ChicagoISACProcess, self).__init__(*args[:4])

self.ingestOffset = int(args[5] or 0)
self.ingestLimit = (int(args[4]) + self.ingestOffset) if args[4] else 5000
mitri-slory marked this conversation as resolved.
Show resolved Hide resolved
self.fullImport = self.process == 'complete'

# Connect to database
self.db_manager = DBManager()
self.elastic_search_manager = ElasticsearchManager()
self.redis_manager = RedisManager()
mitri-slory marked this conversation as resolved.
Show resolved Hide resolved

self.generateEngine()
self.createSession()

# S3 Configuration
self.s3Bucket = os.environ['FILE_BUCKET']
self.createS3Client()
self.s3_manager = S3Manager()

def runProcess(self):
mitri-slory marked this conversation as resolved.
Show resolved Hide resolved

logger.info('Starting ISAC process...')
mitri-slory marked this conversation as resolved.
Show resolved Hide resolved

self.db_manager.generateEngine()
self.redis_manager.createRedisClient()
self.elastic_search_manager.createElasticConnection()
self.s3_manager.createS3Client()

with open('ingestJSONFiles/chicagoISAC_metadata.json') as f:
chicagoISACData = json.load(f)

Expand All @@ -44,9 +51,9 @@ def processChicagoISACRecord(self, record):
self.storePDFManifest(chicagoISACRec.record)
self.addDCDWToUpdateList(chicagoISACRec)

except (MappingError, HTTPError, ConnectionError, IndexError, TypeError) as e:
except Exception as e:
logger.exception(e)
mitri-slory marked this conversation as resolved.
Show resolved Hide resolved
logger.warn(ChicagoISACError('Unable to process ISAC record'))
logger.warning(ChicagoISACError('Unable to process ISAC record'))
mitri-slory marked this conversation as resolved.
Show resolved Hide resolved


def storePDFManifest(self, record):
mitri-slory marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -72,18 +79,22 @@ def storePDFManifest(self, record):
'application/webpub+json',
flags
])

record.has_part.insert(0, linkString)
# The second element of the has_part array is always an array of PDF URLs from the JSON file.
# Those URLs are popped out of the nested array and appended back onto has_part as individual strings.
for i in range(1, len(record.has_part)):
if len(record.has_part[i]) > 1:
urlArray = record.has_part[i]
record.has_part.pop(i)
for elem in urlArray:
record.has_part.append(elem)
else:
record.has_part[i] = ''.join(record.has_part[i])
self.hasPartArrayElemToStringElem(record)

break

@staticmethod
def hasPartArrayElemToStringElem(record):
    """Flatten nested URL arrays in ``record.has_part`` into plain strings.

    The first element of ``record.has_part`` is left untouched. Every later
    element that is a list or tuple is replaced, in place and in order, by
    the items it contains; elements that are already strings are kept
    unchanged.

    The list is rebuilt in a single pass instead of popping and appending
    while walking precomputed indices. The index-based approach skipped
    elements that shifted position and split any string longer than one
    character (including URLs it had just appended) into single characters,
    so it was only correct when exactly one nested array followed the first
    element.

    Args:
        record: Object exposing a mutable ``has_part`` list.

    Returns:
        None. ``record.has_part`` is updated in place (same list object).

    Note:
        An empty nested array contributes no entries, rather than leaving an
        empty string in ``has_part``.
    """
    flattened = list(record.has_part[:1])

    for part in record.has_part[1:]:
        if isinstance(part, (list, tuple)):
            flattened.extend(part)
        else:
            flattened.append(part)

    # Slice assignment keeps the identity of the existing list so any other
    # reference to record.has_part observes the flattened contents.
    record.has_part[:] = flattened

def createManifestInS3(self, manifestPath, manifestJSON):
mitri-slory marked this conversation as resolved.
Show resolved Hide resolved
self.putObjectInBucket(
Expand Down
Loading