Merge branch 'main' into dependabot/pip/urllib3-1.26.19
jbukhari authored Jul 3, 2024
2 parents 5a633bd + a245bbd commit 426ba12
Showing 3 changed files with 64 additions and 27 deletions.
1 change: 1 addition & 0 deletions dlx_dl/scripts/export/__init__.py
@@ -391,6 +391,7 @@ def _561(bib):
latest = sorted(files, key=lambda x: x.timestamp, reverse=True)[0]
_fft = Datafield('FFT', record_type='bib')
_fft.set('a', 'https://' + latest.uri)
_fft.set('d', ', '.join([ISO_STR.get(x, '') for x in latest.languages]))

old_fn = latest.filename if latest.filename else uri.split('/')[-1]
new_fn = clean_fn(old_fn)
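
A note on the added $d line: it renders the file's language codes as a human-readable list. A minimal sketch of the behavior, assuming ISO_STR maps ISO language codes to display names (the dict below is a hypothetical subset; the new test_561 renders 'EN' as 'English'):

ISO_STR = {'EN': 'English', 'FR': 'French'}  # hypothetical subset of the real mapping

languages = ['EN', 'FR', 'ZH']  # 'ZH' is deliberately missing from the mapping
value = ', '.join([ISO_STR.get(x, '') for x in languages])
print(value)  # -> 'English, French, '

A code absent from the mapping contributes an empty element, so the joined string keeps its separator; the single-language case exercised by test_561 below yields just 'English'.
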
71 changes: 47 additions & 24 deletions dlx_dl/scripts/sync/__init__.py
@@ -79,6 +79,8 @@ def param(name):

# if run as function convert args to sys.argv so they can be parsed by ArgumentParser
if kwargs:
sys.argv = [sys.argv[0]] # clear any existing command line args

for key, val in kwargs.items():
if val == True:
# boolean args
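
The two added lines reset sys.argv before the kwargs are re-encoded as command-line flags, so arguments from an earlier invocation can't leak into ArgumentParser when the script is run as a function. A minimal sketch of the conversion, assuming the elided boolean branch appends the bare flag (the non-boolean branch appears below as --key=val):

import sys

def fake_run(**kwargs):  # hypothetical stand-in for get_args(**kwargs)
    sys.argv = [sys.argv[0]]  # clear any existing command line args
    for key, val in kwargs.items():
        if val == True:
            sys.argv.append(f'--{key}')       # boolean args become bare flags
        else:
            sys.argv.append(f'--{key}={val}')
    return sys.argv

print(fake_run(source='test', force=True)[1:])  # -> ['--source=test', '--force']
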
@@ -90,10 +92,10 @@ def param(name):
sys.argv.append(f'--{key}={val}')

return parser.parse_args()

def run(**kwargs):
args = get_args(**kwargs)

if isinstance(kwargs.get('connect'), MockClient):
# required for testing
DB.client = kwargs['connect']
@@ -106,9 +108,10 @@ def run(**kwargs):

HEADERS = {'Authorization': 'Token ' + args.api_key}
records = get_records(args) # returns an iterator (dlx.Marc.BibSet/AuthSet)
BATCH, BATCH_SIZE, SEEN, TOTAL, INDEX = [], 100, 0, records.count, {}
#deleted = get_deleted_records(args)
BATCH, BATCH_SIZE, SEEN, TOTAL, INDEX = [], 100, 0, records.total_count, {}
updated_count = 0
print(f'checking {TOTAL} updated records')
print(f'checking {records.count} updated records')

# check if last update cleared in DL yet
if args.force:
@@ -200,7 +203,11 @@ def run(**kwargs):
print('building auth cache...')
Auth.build_cache()

for i, record in enumerate(records):
for i, record in enumerate(records.records):
if record.user[:10] == 'batch_edit':
# skip syncing batch edited records for now so as not to overwhelm DL queue
continue

BATCH.append(record)
SEEN = i + 1

@@ -233,7 +240,7 @@ def run(**kwargs):
col = root.find(f'{NS}collection')

# process DL XML
for r in col or []:
for r in [] if col is None else col:
dl_record = Bib.from_xml_raw(r)
_035 = next(filter(lambda x: re.match('^\(DHL', x), dl_record.get_values('035', 'a')), '')
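
The rewritten loop guard is worth a note: iterating 'col or []' relied on Element truth-testing, which is based on child count and is deprecated in recent Pythons (an empty <collection/> tests falsy even though it exists). A small sketch of the distinction, likely the motivation for the change:

import xml.etree.ElementTree as ET

col = ET.fromstring('<root><collection/></root>').find('collection')

print(col is None)  # False: the element was found
print(len(col))     # 0: it simply has no records yet
for r in [] if col is None else col:  # iterates zero times, without the deprecated bool(col)
    pass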

@@ -247,18 +254,24 @@ def run(**kwargs):
for dlx_record in BATCH:
if dlx_record.get_value('245', 'a')[0:16].lower() == 'work in progress':
continue

if dlx_record.id not in [x.id for x in DL_BATCH]:
if 'DELETED' in (dlx_record.get_value('980', 'a'), record.get_value('980', 'c')):
pass
else:
print(f'{dlx_record.id}: NOT FOUND IN DL')

export_whole_record(args, dlx_record, export_type='NEW')
updated_count += 1

if dlx_record.get_value('980', 'a') == 'DELETED':
if dl_record := next(filter(lambda x: x.id == dlx_record.id, DL_BATCH), None):
if dl_record.get_value('980', 'a') != 'DELETED':
print(f'{dlx_record.id}: RECORD DELETED')
export_whole_record(args, dlx_record, export_type='DELETE')
updated_count += 1

# remove record from list of DL records to compare
#DL_BATCH = list(filter(lambda x: x.id != dlx_record.id))
DL_BATCH.remove(dl_record)
elif dlx_record.id not in [x.id for x in DL_BATCH]:
print(f'{dlx_record.id}: NOT FOUND IN DL')
export_whole_record(args, dlx_record, export_type='NEW')
updated_count += 1

# remove from queue
to_remove.append(dlx_record.id)
# remove from queue
to_remove.append(dlx_record.id)

# scan and compare DL records
for dl_record in DL_BATCH:
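
In the rewritten DELETED branch above, next(filter(...), None) plus a walrus assignment both finds and captures the matching DL record, which is then pruned from DL_BATCH so the scan loop doesn't revisit it. A standalone sketch of the pattern, with hypothetical stand-in records:

class Rec:  # hypothetical; only .id matters for the match
    def __init__(self, id):
        self.id = id

DL_BATCH = [Rec(1), Rec(2), Rec(3)]
target_id = 2

# first record whose id matches, or None when there is no match
if dl_record := next(filter(lambda x: x.id == target_id, DL_BATCH), None):
    DL_BATCH.remove(dl_record)  # drop it from the list of records to compare

print([r.id for r in DL_BATCH])  # -> [1, 3]
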
@@ -320,20 +333,24 @@ def get_records_by_date(cls, date_from, date_to=None, delete_only=False):
-------
BibSet / AuthSet
"""
if cls == BibSet:
if cls == BibSet and not delete_only:
fft_symbols = export._new_file_symbols(date_from, date_to)

if len(fft_symbols) > 10000:
raise Exception('that\'s too many file symbols to look up, sorry :(')

print(f'found files for {len(fft_symbols)} symbols')
else:
fft_symbols = None

if date_to:
criteria = {'$and': [{'updated': {'$gte': date_from}}, {'updated': {'$lte': date_to}}]}
history_criteria = {'$and': [{'deleted.time': {'$gte': date_from}}, {'deleted.time': {'$lte': date_to}}]}
history_criteria = {'$and': [{'deleted.time': {'$gte': date_from}}, {'deleted.time': {'$lte': date_to}}, {'deleted.user': {'$ne': 'HZN'}}]}
else:
criteria = {'updated': {'$gte': date_from}}
history_criteria = {'deleted.time': {'$gte': date_from}}
history_criteria = {'deleted.time': {'$gte': date_from}, 'deleted.user': {'$ne': 'HZN'}}

if cls == BibSet and fft_symbols:
query = {'$or': [criteria, {'191.subfields.value': {'$in': fft_symbols}}]}
@@ -351,20 +368,24 @@ def get_records_by_date(cls, date_from, date_to=None, delete_only=False):
else:
rset = cls.from_query(query, sort=[('updated', -1)], collation=Config.marc_index_default_collation)

to_delete = []

if deleted:
#records = list(rset.records)
rcls = Bib if cls == BibSet else Auth
to_delete = []

for d in deleted:
r = rcls({'_id': d['_id']})
r.set('980', 'a', 'DELETED')
r.updated = d['deleted']['time']
r.user = d['deleted']['user']
to_delete.append(r)

rset.records = (r for r in chain((r for r in rset.records), (d for d in to_delete))) # program is expecting an iterable

print(f'Checking {len(to_delete)} deleted records')
print(f'Checking {len(to_delete)} deleted records')

# todo: enable MarcSet.count to handle hybrid cursor/list record sets
rset.total_count = rset.count + len(to_delete)

return rset
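
The deleted-records block above fabricates tombstone records (980 $a 'DELETED'), splices them onto the live query cursor, and precomputes total_count because, per the todo note, MarcSet.count cannot yet size a hybrid cursor/list stream. A reduced sketch of that splice; itertools.chain alone suffices, and the extra generator wrappers in the original are redundant but harmless:

from itertools import chain

cursor = iter(['bib 1', 'bib 2'])   # stands in for the query cursor
to_delete = ['tombstone 3']         # synthetic DELETED records

records = chain(cursor, to_delete)  # one iterable, as downstream code expects
total_count = 2 + len(to_delete)    # mirrors rset.count + len(to_delete)

print(list(records), total_count)   # -> ['bib 1', 'bib 2', 'tombstone 3'] 3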

@@ -425,6 +446,9 @@ def get_records(args, log=None, queue=None):
q_args, q_kwargs = records.query_params
records = cls.from_query({'$or': [{'_id': {'$in': list(qids)}}, q_args[0]]}, sort=[('updated', 1)])

# this value is expected to be set later
records.total_count = records.count

return records

def normalize(string):
@@ -623,7 +647,6 @@ def _get_dl_856(fn):

# official doc files
symbols = (dlx_record.get_values('191', 'a') + dlx_record.get_values('191', 'z')) if args.type == 'bib' else []
#symbols = dlx_record.get_values('191', 'a') if args.type == 'bib' else []

for symbol in set(symbols):
if symbol == '' or symbol == ' ' or symbol == '***': # note: clean these up in db
19 changes: 16 additions & 3 deletions tests/test_dlx_dl.py
@@ -1,4 +1,4 @@
import os, pytest, responses
import sys, os, pytest, responses
from moto import mock_aws
from datetime import datetime
from dlx_dl.scripts import export, sync
@@ -38,8 +38,8 @@ def db():
handle.seek(0)
File.import_from_handle(
handle,
filename='',
identifiers=[Identifier('symbol', 'TEST/1')],
filename='test 1',
identifiers=[Identifier('symbol', 'TEST/1'), Identifier('uri', 'test uri identifier')],
languages=['EN'],
mimetype='text/plain',
source='test'
@@ -210,7 +210,20 @@ def test_delete(db, capsys, mock_post):
#assert len(data) == 3
#assert json.loads(data[2])['record_id'] == 3

def test_561(db, tmp_path):
from io import BytesIO
from xmldiff.main import diff_texts
from dlx.marc import Bib

bib = Bib().set('561', 'u', 'test uri identifier')
bib.commit()
control = '<collection><record><datafield tag="035" ind1=" " ind2=" "><subfield code="a">(DHL)3</subfield></datafield><datafield tag="561" ind1=" " ind2=" "><subfield code="u">test uri identifier</subfield></datafield><datafield tag="980" ind1=" " ind2=" "><subfield code="a">BIB</subfield></datafield><datafield tag="FFT" ind1=" " ind2=" "><subfield code="a">https://mock_bucket.s3.amazonaws.com/1e50210a0202497fb79bc38b6ade6c34</subfield><subfield code="d">English</subfield><subfield code="n">.test_1</subfield></datafield></record></collection>'
out = tmp_path / 'out.xml'
export.run(connect=db, source='test', type='bib', id=bib.id, xml=out)
assert diff_texts(out.read_text(), control) == []

def test_sync(db, capsys, mock_get_post):
# todo: expand this test
from http.server import HTTPServer
from dlx.marc import Bib

