Merge branch 'main' into dependabot/pip/urllib3-1.26.19
jbukhari authored Jul 3, 2024
2 parents 5a633bd + a245bbd commit 426ba12
Showing 3 changed files with 64 additions and 27 deletions.
1 change: 1 addition & 0 deletions dlx_dl/scripts/export/__init__.py
@@ -391,6 +391,7 @@ def _561(bib):
latest = sorted(files, key=lambda x: x.timestamp, reverse=True)[0]
_fft = Datafield('FFT', record_type='bib')
_fft.set('a', 'https://' + latest.uri)
_fft.set('d', ', '.join([ISO_STR.get(x, '') for x in latest.languages]))

old_fn = latest.filename if latest.filename else uri.split('/')[-1]
new_fn = clean_fn(old_fn)
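
A note on the added $d line: it renders the file's language codes as a human-readable list. A minimal sketch of the behavior, assuming ISO_STR maps ISO language codes to display names (the dict below is a hypothetical subset; the new test_561 renders 'EN' as 'English'):

ISO_STR = {'EN': 'English', 'FR': 'French'}  # hypothetical subset of the real mapping

languages = ['EN', 'FR', 'ZH']  # 'ZH' is deliberately missing from the mapping
value = ', '.join([ISO_STR.get(x, '') for x in languages])
print(value)  # -> 'English, French, '

A code absent from the mapping contributes an empty element, so the joined string keeps its separator; the single-language case exercised by test_561 below yields just 'English'.
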
71 changes: 47 additions & 24 deletions dlx_dl/scripts/sync/__init__.py
@@ -79,6 +79,8 @@ def param(name):

# if run as function convert args to sys.argv so they can be parsed by ArgumentParser
if kwargs:
sys.argv = [sys.argv[0]] # clear any existing command line args

for key, val in kwargs.items():
if val == True:
# boolean args
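
The two added lines reset sys.argv before the kwargs are re-encoded as command-line flags, so arguments from an earlier invocation can't leak into ArgumentParser when the script is run as a function. A minimal sketch of the conversion, assuming the elided boolean branch appends the bare flag (the non-boolean branch appears below as --key=val):

import sys

def fake_run(**kwargs):  # hypothetical stand-in for get_args(**kwargs)
    sys.argv = [sys.argv[0]]  # clear any existing command line args
    for key, val in kwargs.items():
        if val == True:
            sys.argv.append(f'--{key}')       # boolean args become bare flags
        else:
            sys.argv.append(f'--{key}={val}')
    return sys.argv

print(fake_run(source='test', force=True)[1:])  # -> ['--source=test', '--force']
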
@@ -90,10 +92,10 @@ def param(name):
sys.argv.append(f'--{key}={val}')

return parser.parse_args()

def run(**kwargs):
args = get_args(**kwargs)

if isinstance(kwargs.get('connect'), MockClient):
# required for testing
DB.client = kwargs['connect']
@@ -106,9 +108,10 @@ def run(**kwargs):

HEADERS = {'Authorization': 'Token ' + args.api_key}
records = get_records(args) # returns an iterator (dlx.Marc.BibSet/AuthSet)
BATCH, BATCH_SIZE, SEEN, TOTAL, INDEX = [], 100, 0, records.count, {}
#deleted = get_deleted_records(args)
BATCH, BATCH_SIZE, SEEN, TOTAL, INDEX = [], 100, 0, records.total_count, {}
updated_count = 0
print(f'checking {TOTAL} updated records')
print(f'checking {records.count} updated records')

# check if last update cleared in DL yet
if args.force:
@@ -200,7 +203,11 @@ def run(**kwargs):
print('building auth cache...')
Auth.build_cache()

for i, record in enumerate(records):
for i, record in enumerate(records.records):
if record.user[:10] == 'batch_edit':
# skip syncing batch edited records for now so as not to overwhelm DL queue
continue

BATCH.append(record)
SEEN = i + 1

@@ -233,7 +240,7 @@ def run(**kwargs):
col = root.find(f'{NS}collection')

# process DL XML
for r in col or []:
for r in [] if col is None else col:
dl_record = Bib.from_xml_raw(r)
_035 = next(filter(lambda x: re.match('^\(DHL', x), dl_record.get_values('035', 'a')), '')
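
The rewritten loop guard is worth a note: iterating 'col or []' relied on Element truth-testing, which is based on child count and is deprecated in recent Pythons (an empty <collection/> tests falsy even though it exists). A small sketch of the distinction, likely the motivation for the change:

import xml.etree.ElementTree as ET

col = ET.fromstring('<root><collection/></root>').find('collection')

print(col is None)  # False: the element was found
print(len(col))     # 0: it simply has no records yet
for r in [] if col is None else col:  # iterates zero times, without the deprecated bool(col)
    pass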

@@ -247,18 +254,24 @@ def run(**kwargs):
for dlx_record in BATCH:
if dlx_record.get_value('245', 'a')[0:16].lower() == 'work in progress':
continue

if dlx_record.id not in [x.id for x in DL_BATCH]:
if 'DELETED' in (dlx_record.get_value('980', 'a'), record.get_value('980', 'c')):
pass
else:
print(f'{dlx_record.id}: NOT FOUND IN DL')

export_whole_record(args, dlx_record, export_type='NEW')
updated_count += 1

if dlx_record.get_value('980', 'a') == 'DELETED':
if dl_record := next(filter(lambda x: x.id == dlx_record.id, DL_BATCH), None):
if dl_record.get_value('980', 'a') != 'DELETED':
print(f'{dlx_record.id}: RECORD DELETED')
export_whole_record(args, dlx_record, export_type='DELETE')
updated_count += 1

# remove record from list of DL records to compare
#DL_BATCH = list(filter(lambda x: x.id != dlx_record.id))
DL_BATCH.remove(dl_record)
elif dlx_record.id not in [x.id for x in DL_BATCH]:
print(f'{dlx_record.id}: NOT FOUND IN DL')
export_whole_record(args, dlx_record, export_type='NEW')
updated_count += 1

# remove from queue
to_remove.append(dlx_record.id)
# remove from queue
to_remove.append(dlx_record.id)

# scan and compare DL records
for dl_record in DL_BATCH:
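
In the rewritten DELETED branch above, next(filter(...), None) plus a walrus assignment both finds and captures the matching DL record, which is then pruned from DL_BATCH so the scan loop doesn't revisit it. A standalone sketch of the pattern, with hypothetical stand-in records:

class Rec:  # hypothetical; only .id matters for the match
    def __init__(self, id):
        self.id = id

DL_BATCH = [Rec(1), Rec(2), Rec(3)]
target_id = 2

# first record whose id matches, or None when there is no match
if dl_record := next(filter(lambda x: x.id == target_id, DL_BATCH), None):
    DL_BATCH.remove(dl_record)  # drop it from the list of records to compare

print([r.id for r in DL_BATCH])  # -> [1, 3]
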
@@ -320,20 +333,24 @@ def get_records_by_date(cls, date_from, date_to=None, delete_only=False):
-------
BibSet / AuthSet
"""
if cls == BibSet:
if cls == BibSet and not delete_only:
fft_symbols = export._new_file_symbols(date_from, date_to)

if len(fft_symbols) > 10000:
raise Exception('that\'s too many file symbols to look up, sorry :(')

print(f'found files for {len(fft_symbols)} symbols')
else:
fft_symbols = None

if date_to:
criteria = {'$and': [{'updated': {'$gte': date_from}}, {'updated': {'$lte': date_to}}]}
history_criteria = {'$and': [{'deleted.time': {'$gte': date_from}}, {'deleted.time': {'$lte': date_to}}]}
history_criteria = {'$and': [{'deleted.time': {'$gte': date_from}}, {'deleted.time': {'$lte': date_to}}, {'deleted.user': {'$ne': 'HZN'}}]}
else:
criteria = {'updated': {'$gte': date_from}}
history_criteria = {'deleted.time': {'$gte': date_from}}
history_criteria = {'deleted.time': {'$gte': date_from}, 'deleted.user': {'$ne': 'HZN'}}

if cls == BibSet and fft_symbols:
query = {'$or': [criteria, {'191.subfields.value': {'$in': fft_symbols}}]}
@@ -351,20 +368,24 @@ def get_records_by_date(cls, date_from, date_to=None, delete_only=False):
else:
rset = cls.from_query(query, sort=[('updated', -1)], collation=Config.marc_index_default_collation)

to_delete = []

if deleted:
#records = list(rset.records)
rcls = Bib if cls == BibSet else Auth
to_delete = []

for d in deleted:
r = rcls({'_id': d['_id']})
r.set('980', 'a', 'DELETED')
r.updated = d['deleted']['time']
r.user = d['deleted']['user']
to_delete.append(r)

rset.records = (r for r in chain((r for r in rset.records), (d for d in to_delete))) # program is expecting an iterable

print(f'Checking {len(to_delete)} deleted records')
print(f'Checking {len(to_delete)} deleted records')

# todo: enable MarcSet.count to handle hybrid cursor/list record sets
rset.total_count = rset.count + len(to_delete)

return rset
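
The deleted-records block above fabricates tombstone records (980 $a 'DELETED'), splices them onto the live query cursor, and precomputes total_count because, per the todo note, MarcSet.count cannot yet size a hybrid cursor/list stream. A reduced sketch of that splice; itertools.chain alone suffices, and the extra generator wrappers in the original are redundant but harmless:

from itertools import chain

cursor = iter(['bib 1', 'bib 2'])   # stands in for the query cursor
to_delete = ['tombstone 3']         # synthetic DELETED records

records = chain(cursor, to_delete)  # one iterable, as downstream code expects
total_count = 2 + len(to_delete)    # mirrors rset.count + len(to_delete)

print(list(records), total_count)   # -> ['bib 1', 'bib 2', 'tombstone 3'] 3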

@@ -425,6 +446,9 @@ def get_records(args, log=None, queue=None):
q_args, q_kwargs = records.query_params
records = cls.from_query({'$or': [{'_id': {'$in': list(qids)}}, q_args[0]]}, sort=[('updated', 1)])

# this value is expected to be set later
records.total_count = records.count

return records

def normalize(string):
@@ -623,7 +647,6 @@ def _get_dl_856(fn):

# official doc files
symbols = (dlx_record.get_values('191', 'a') + dlx_record.get_values('191', 'z')) if args.type == 'bib' else []
#symbols = dlx_record.get_values('191', 'a') if args.type == 'bib' else []

for symbol in set(symbols):
if symbol == '' or symbol == ' ' or symbol == '***': # note: clean these up in db
19 changes: 16 additions & 3 deletions tests/test_dlx_dl.py
@@ -1,4 +1,4 @@
import os, pytest, responses
import sys, os, pytest, responses
from moto import mock_aws
from datetime import datetime
from dlx_dl.scripts import export, sync
@@ -38,8 +38,8 @@ def db():
handle.seek(0)
File.import_from_handle(
handle,
filename='',
identifiers=[Identifier('symbol', 'TEST/1')],
filename='test 1',
identifiers=[Identifier('symbol', 'TEST/1'), Identifier('uri', 'test uri identifier')],
languages=['EN'],
mimetype='text/plain',
source='test'
@@ -210,7 +210,20 @@ def test_delete(db, capsys, mock_post):
#assert len(data) == 3
#assert json.loads(data[2])['record_id'] == 3

def test_561(db, tmp_path):
from io import BytesIO
from xmldiff.main import diff_texts
from dlx.marc import Bib

bib = Bib().set('561', 'u', 'test uri identifier')
bib.commit()
control = '<collection><record><datafield tag="035" ind1=" " ind2=" "><subfield code="a">(DHL)3</subfield></datafield><datafield tag="561" ind1=" " ind2=" "><subfield code="u">test uri identifier</subfield></datafield><datafield tag="980" ind1=" " ind2=" "><subfield code="a">BIB</subfield></datafield><datafield tag="FFT" ind1=" " ind2=" "><subfield code="a">https://mock_bucket.s3.amazonaws.com/1e50210a0202497fb79bc38b6ade6c34</subfield><subfield code="d">English</subfield><subfield code="n">.test_1</subfield></datafield></record></collection>'
out = tmp_path / 'out.xml'
export.run(connect=db, source='test', type='bib', id=bib.id, xml=out)
assert diff_texts(out.read_text(), control) == []

def test_sync(db, capsys, mock_get_post):
# todo: expand this test
from http.server import HTTPServer
from dlx.marc import Bib

