From 980c47a79ac12933f44275497b287156db8d49b1 Mon Sep 17 00:00:00 2001
From: jbukhari
Date: Mon, 25 Nov 2024 11:20:52 -0500
Subject: [PATCH] Maintenance (#172)

* exception handling
* set return values for the main function
* update logging
* annotation
* bugfix
* bump version
* update tests
---
 dlx_dl/scripts/sync/__init__.py | 36 ++++++++++++++++++++++++---------
 setup.py                        |  2 +-
 tests/test_dlx_dl.py            | 14 +++++++++++--
 3 files changed, 39 insertions(+), 13 deletions(-)
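Reviewer note (below the `---` fold, so `git am` does not apply it to the commit message): `run()` now returns the number of records updated, or -1 when the sync is aborted because the last export has not cleared in DL. A minimal sketch of how a calling script might use that return value -- the wrapper, its argument values, and the exit-code policy are illustrative assumptions, not part of dlx-dl:

    import sys

    from dlx_dl.scripts.sync import run

    def main() -> int:
        # run() returns -1 if the previous export hasn't cleared in DL yet.
        updated = run(source='dlx-dl-sync', type='bib', modified_within=3600)

        if updated == -1:
            # Hypothetical policy: nonzero exit so a scheduler can alert or retry.
            return 1

        print(f'Sync complete: {updated} records updated')
        return 0

    if __name__ == '__main__':
        sys.exit(main())
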
diff --git a/dlx_dl/scripts/sync/__init__.py b/dlx_dl/scripts/sync/__init__.py
index b255fd5..17a5d92 100644
--- a/dlx_dl/scripts/sync/__init__.py
+++ b/dlx_dl/scripts/sync/__init__.py
@@ -93,7 +93,13 @@ def param(name):
 
     return parser.parse_args()
 
-def run(**kwargs):
+def run(**kwargs) -> int:
+    """
+    Main function. Arguments are described and parsed in the `get_args`
+    function. Returns the number of records updated, or -1 if the export was
+    aborted.
+    """
+
     args = get_args(**kwargs)
 
     if isinstance(kwargs.get('connect'), MockClient):
@@ -114,7 +120,7 @@ def run(**kwargs):
     BATCH_SIZE = 100
     SEEN = 0
     UPDATED_COUNT = 0
-    print(f'Checking {marcset.count} updated records')
+    print(f'Checking {marcset.count} records')
 
     # check if last update cleared in DL yet
     if args.force:
@@ -169,10 +175,13 @@ def run(**kwargs):
 
         try:
             last_dl_record = Bib.from_xml_raw(record_xml, auth_control=False)
         except AssertionError as e:
-            if last_exported['export_type'] == 'NEW':
+            if (status := last_exported.get('export_type')) == 'NEW':
                 # last record not in DL yet
                 flag = 'NEW'
                 last_dl_record = None
+            elif status is None:
+                # record was probably exported by dlx-dl-export
+                pass
             else:
                 raise Exception(f'Last updated record not found by DL search API: {last_exported["record_type"]} {last_exported["record_id"]}')
@@ -185,6 +194,7 @@ def run(**kwargs):
         if last_exported['time'] > dl_last_updated:
             flag = 'UPDATE'
 
+
     if flag:
         # check callback log to see if the last export had an import error in DL
         q = {'record_type': last_exported['record_type'], 'record_id': last_exported['record_id']}
@@ -205,14 +215,14 @@ def run(**kwargs):
         elif flag == 'NEW':
             # the record has been imported to DL but isn't searchable yet
             print(f'Awaiting search indexing of last new record: {args.type}# {last_exported["record_id"]}. Callback received indicating sucessful import @ {callback_data["time"]}.')
-            exit()
+            return -1
         else:
             # the record was exported and imported to DL succesfully, but DL did not record the update in
             # the 005 field. this can happen if there were no changes to be made to the DL record.
             warn(f'Possible redundant export not recorded in DL: {flag} {args.type}# {last_exported["record_id"]}')
     else:
         print(f'Last update not cleared in DL yet ({flag}) ({args.type}# {last_exported["record_id"]} @ {last_exported["time"]})')
-        exit()
+        return -1
 
     # cycle through records in batches
     enqueue, to_remove = False, []
@@ -301,7 +311,7 @@ def run(**kwargs):
             dlx_record = next(filter(lambda x: x.id == dl_record.id, BATCH), None)
 
             if dlx_record is None:
-                raise Exception('This shouldn\'t be possible. Possible network error.')
+                raise Exception(f'Error matching {dl_record.id} with dlx record. This shouldn\'t be possible. Possible network error.\n{dl_record.to_mrk()}')
 
             # correct fields
             result = compare_and_update(args, dlx_record=dlx_record, dl_record=dl_record)
@@ -350,6 +360,8 @@ def run(**kwargs):
 
     print(f'Updated {UPDATED_COUNT} records')
 
+    return UPDATED_COUNT
+
 def get_records_by_date(cls, date_from, date_to=None, delete_only=False):
     """
     Returns
@@ -384,6 +396,8 @@ def get_records_by_date(cls, date_from, date_to=None, delete_only=False):
                     {'561.subfields.value': {'$in': fft_uris}},
                 ]
             }
+        else:
+            query = criteria
     else:
         query = criteria
 
@@ -577,9 +591,10 @@ def compare_and_update(args, *, dlx_record, dl_record):
         if normalize(field.to_mrk()) not in [normalize(x) for x in dlx_fields_serialized]:
             # compare tag + indicators
             if field.tag + ''.join(field.indicators) in [x.tag + ''.join(x.indicators) for x in dlx_fields]:
-                # this should already be taken care of in dlx->dl
-                print(f'{dlx_record.id}: SUPERCEDED: {field.to_mrk()}')
-                take_tags.add(field.tag)
+                if field.tag not in take_tags:
+                    # this should already be taken care of in dlx->dl
+                    print(f'{dlx_record.id}: SUPERSEDED: {field.to_mrk()}')
+                    take_tags.add(field.tag)
             else:
                 # delete fields where the tag + indicators combo does not exist in dl record
                 print(f'{dlx_record.id}: TO DELETE: {field.to_mrk()}')
@@ -757,7 +772,8 @@ def submit_to_dl(args, record, *, mode, export_start, export_type):
 
     DB.handle[export.LOG_COLLECTION].insert_one(logdata)
     logdata['export_start'] = logdata['export_start'].isoformat()
     logdata['time'] = logdata['time'].isoformat()
-    print(logdata)
+    logdata.pop('_id', None)
+    print(json.dumps(logdata))
 
     return logdata
diff --git a/setup.py b/setup.py
index 552738d..937d6f5 100644
--- a/setup.py
+++ b/setup.py
@@ -9,7 +9,7 @@
 
 setup(
     name = 'dlx_dl',
-    version = '1.2.10',
+    version = '1.2.11',
     url = 'http://github.com/dag-hammarskjold-library/dlx-dl',
     author = 'United Nations Dag Hammarskjöld Library',
     author_email = 'library-ny@un.org',
diff --git a/tests/test_dlx_dl.py b/tests/test_dlx_dl.py
index 8ae67f3..f1850c8 100644
--- a/tests/test_dlx_dl.py
+++ b/tests/test_dlx_dl.py
@@ -224,17 +224,27 @@ def test_561(db, tmp_path):
 
 def test_sync(db, capsys, mock_get_post):
     # todo: expand this test
+    import json
     from http.server import HTTPServer
     from dlx import DB
     from dlx.marc import Bib
 
     bib = Bib().set('245', 'a', 'Will self destruct')
     bib.commit()
-    bib.delete()
+    sync.run(connect=db, source='test', type='bib', modified_within=100, force=True)
+
+    data = list(filter(None, capsys.readouterr().out.split('\n')))
+    logged = json.loads(data[4])
+    assert logged.get('record_id') == bib.id
+    assert logged.get('response_code') == 200
+    assert DB.handle['dlx_dl_log'].find_one({'record_id': bib.id})
+
+    # no files
+    DB.handle['files'].delete_many({})
     sync.run(connect=db, source='test', type='bib', modified_within=100, force=True)
 
     data = list(filter(None, capsys.readouterr().out.split('\n')))
-    assert data
+    assert 'Updated 3 records' in data[-1]
 
     # skip special user
     bib = Bib().set('245', 'a', 'Moar testing')
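
Post-diff note (text after the final hunk, like a patch signature, is not applied): the parentheses in `(status := last_exported.get('export_type')) == 'NEW'` above are load-bearing. `:=` binds more loosely than `==`, so without them `status` would capture the boolean result of the comparison and the `elif status is None:` branch could never fire. A standalone demonstration in plain Python:

    last_exported = {'export_type': 'NEW'}

    # Unparenthesized: := captures the result of the whole comparison.
    if status := last_exported.get('export_type') == 'NEW':
        pass
    print(status)  # True -- a boolean, never None

    # Parenthesized: := captures the looked-up value itself.
    if (status := last_exported.get('export_type')) == 'NEW':
        pass
    print(status)  # 'NEW'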