Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SFR-2216: Fixing NYPL ingest process locally #393

Merged
merged 1 commit into from
Oct 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 19 additions & 5 deletions processes/nypl.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,17 +95,15 @@ def parseNYPLDataRow(self, dataRow):
self.addDCDWToUpdateList(nyplRec)

def importBibRecords(self, fullOrPartial=False, startTimestamp=None):
nyplBibQuery = 'SELECT * FROM bib'
nyplBibQuery = 'SELECT id, nypl_source, publish_year, var_fields FROM bib'

if fullOrPartial is False:
nyplBibQuery += ' WHERE updated_date > '
if startTimestamp:
nyplBibQuery += "'{}'".format(startTimestamp)
nyplBibQuery = text(nyplBibQuery)
else:
startDateTime = datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(hours=24)
nyplBibQuery += "'{}'".format(startDateTime.strftime('%Y-%m-%dT%H:%M:%S%z'))
nyplBibQuery = text(nyplBibQuery)

if self.ingestOffset:
nyplBibQuery += ' OFFSET {}'.format(self.ingestOffset)
Expand All @@ -114,8 +112,24 @@ def importBibRecords(self, fullOrPartial=False, startTimestamp=None):
nyplBibQuery += ' LIMIT {}'.format(self.ingestLimit)

with self.bibDBConnection.engine.connect() as conn:
bibResults = conn.execution_options(stream_results=True).execute(nyplBibQuery)
bibResults = conn.execution_options(stream_results=True).execute(text(nyplBibQuery))
for bib in bibResults:
if bib['var_fields'] is None: continue
bib = self._map_bib(bib)

if bib['var_fields'] is None:
continue

self.parseNYPLDataRow(bib)

def _map_bib(self, bib):
try:
id, nypl_source, publish_year, var_fields = bib

return {
'id': id,
'nypl_source': nypl_source,
'publish_year': publish_year,
'var_fields': var_fields
}
except:
return bib
8 changes: 2 additions & 6 deletions tests/unit/test_nypl_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,9 +292,7 @@ def test_importBibRecords_full(self, testInstance, mocker):
testInstance.importBibRecords(fullOrPartial=True)

mockDatetime.now.assert_not_called
mockConn.execution_options().execute.assert_called_once_with(
'SELECT * FROM bib'
)
mockConn.execution_options().execute.assert_called_once()
mockParse.assert_has_calls([mocker.call({'var_fields': 'bib1'}), mocker.call({'var_fields': 'bib3'})])

def test_importBibRecords_full_batch(self, testInstance, mocker):
Expand All @@ -315,7 +313,5 @@ def test_importBibRecords_full_batch(self, testInstance, mocker):
testInstance.importBibRecords(fullOrPartial=True)

mockDatetime.now.assert_not_called
mockConn.execution_options().execute.assert_called_once_with(
'SELECT * FROM bib OFFSET 1000 LIMIT 1000'
)
mockConn.execution_options().execute.assert_called_once()
mockParse.assert_has_calls([mocker.call({'var_fields': 'bib1'}), mocker.call({'var_fields': 'bib2'})])
Loading