From a6e0df60716fa60f1f7501176d08dec7fa9f067f Mon Sep 17 00:00:00 2001
From: "J. Bukhari"
Date: Fri, 29 Mar 2024 13:19:03 -0400
Subject: [PATCH 1/2] check for files using 561 and create FFT

---
 dlx_dl/scripts/export/__init__.py | 32 +++++++++++++++++++++++++++++++
 dlx_dl/scripts/sync/__init__.py   | 18 ++++++++++++++---
 2 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/dlx_dl/scripts/export/__init__.py b/dlx_dl/scripts/export/__init__.py
index 35dc1c4..6a1ff36 100644
--- a/dlx_dl/scripts/export/__init__.py
+++ b/dlx_dl/scripts/export/__init__.py
@@ -1,3 +1,4 @@
+from fileinput import filename
 import os, sys, math, re, requests, json
 from io import StringIO
 import boto3
@@ -328,6 +329,7 @@ def process_bib(bib, *, blacklisted, files_only):
         bib.delete_field('005')
 
     bib = _035(bib)
+    bib = _561(bib)
     bib = _856(bib)
 
     if bib.get_value('980', 'a') == 'DELETED':
@@ -381,6 +383,34 @@ def _035(record):
 
     return record
 
+def _561(bib):
+    uris = bib.get_values('561', 'u')
+    place, seen = 0, []
+
+    for uri in uris:
+        if files := list(File.find_by_identifier(Identifier('uri', uri))):
+            latest = sorted(files, key=lambda x: x.timestamp, reverse=True)[0]
+            _fft = Datafield('FFT', record_type='bib')
+            _fft.set('a', latest.uri)
+
+            old_fn = latest.filename if latest.filename else uri.split('/')[-1]
+            new_fn = clean_fn(old_fn)
+            parts = new_fn.split('.')
+            base = ''.join(parts[0:-1])
+
+            if base in seen:
+                # files can't have the same base name regardless of extension
+                ext = parts[-1]
+                new_fn = f'{base}_{place}.{ext}'
+            else:
+                seen.append(base)
+
+            _fft.set('n', new_fn)
+            bib.fields.append(_fft)
+            place += 1
+
+    return bib
+
 def _856(bib):
     place = len(bib.get_fields('FFT'))
     seen = []
@@ -390,6 +420,7 @@ def _856(bib):
         parsed = urlparse(url)
 
         if parsed.netloc in WHITELIST:
+            # whitelist contains domains of file urls to create FFTs from
             url_path = parsed.path.rstrip()
 
             if unquote(url_path) == url_path:
@@ -402,6 +433,7 @@ def _856(bib):
                 base = ''.join(parts[0:-1])
 
                 if base in seen:
+                    # files can't have the same base name regardless of extension
                     ext = parts[-1]
                     new_fn = f'{base}_{place}.{ext}'
                 else:
diff --git a/dlx_dl/scripts/sync/__init__.py b/dlx_dl/scripts/sync/__init__.py
index 9cf5943..9037340 100644
--- a/dlx_dl/scripts/sync/__init__.py
+++ b/dlx_dl/scripts/sync/__init__.py
@@ -310,7 +310,7 @@ def get_records_by_date(cls, date_from, date_to=None, delete_only=False):
             else {'updated': criteria}
 
         # sort to ensure latest updates are checked first
-        rset = cls.from_query(query, sort=[('updated', DESC)])
+        rset = cls.from_query(query, sort=[('updated', DESC)], collation=Config.marc_index_default_collation)
 
         return rset
 
@@ -378,10 +378,10 @@ def get_records(args, log=None, queue=None):
         records = cls.from_query({'_id': {'$in': ids}})
     elif args.query:
         query = args.query.replace('\'', '"')
-        records = cls.from_query(json.loads(query))
+        records = cls.from_query(json.loads(query), collation=Config.marc_index_default_collation)
     elif args.querystring:
         query = Query.from_string(args.querystring, record_type=args.type)
-        records = cls.from_query(query)
+        records = cls.from_query(query, collation=Config.marc_index_default_collation)
     else:
         raise Exception('One of the criteria arguments is required')
 
@@ -631,6 +631,18 @@ def normalize(x): return unicodedata.normalize('NFD', x)
 
                     return export_whole_record(args, dlx_record, export_type='UPDATE')
 
+    # records with file URI in 561
+    uris = dlx_record.get_values('561', 'u')
+
+    for uri in uris:
+        if files := list(File.find_by_identifier(Identifier('uri', uri))):
+            latest = sorted(files, key=lambda x: x.timestamp, reverse=True)[0]
+
+        else:
+            pass #print(bib.id)
+
+
+
     # official doc files
     symbols = (dlx_record.get_values('191', 'a') + dlx_record.get_values('191', 'z')) if args.type == 'bib' else []
     #symbols = dlx_record.get_values('191', 'a') if args.type == 'bib' else []
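Note on PATCH 1/2: the new _561 helper follows the same pattern as _856. For each 561 $u URI it looks up the stored files, takes the most recently timestamped one, and appends an FFT field whose filename is cleaned and deduplicated so that no two attachments on the record share a base name, regardless of extension. The dlx-specific classes (File, Identifier, Datafield, clean_fn) are omitted below; this is only a minimal sketch of the base-name deduplication step, with dedupe_filenames as a hypothetical stand-in name.

    def dedupe_filenames(filenames):
        """Ensure no two files keep the same base name, regardless of extension."""
        seen, out = [], []

        for place, fn in enumerate(filenames):
            parts = fn.split('.')
            base, ext = ''.join(parts[:-1]), parts[-1]

            if base in seen:
                # suffix a counter so e.g. report.pdf and report.docx don't collide
                fn = f'{base}_{place}.{ext}'
            else:
                seen.append(base)

            out.append(fn)

        return out

    # dedupe_filenames(['report.pdf', 'report.docx', 'annex.pdf'])
    # -> ['report.pdf', 'report_1.docx', 'annex.pdf']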
From a34b022fd14a33cb5925eb23315550435fedfb09 Mon Sep 17 00:00:00 2001
From: "J. Bukhari"
Date: Fri, 29 Mar 2024 14:41:05 -0400
Subject: [PATCH 2/2] check for files using 561

---
 dlx_dl/scripts/export/__init__.py |  1 -
 dlx_dl/scripts/sync/__init__.py   | 57 +++++++++++++++++--------------
 2 files changed, 32 insertions(+), 26 deletions(-)

diff --git a/dlx_dl/scripts/export/__init__.py b/dlx_dl/scripts/export/__init__.py
index 6a1ff36..100c50f 100644
--- a/dlx_dl/scripts/export/__init__.py
+++ b/dlx_dl/scripts/export/__init__.py
@@ -1,4 +1,3 @@
-from fileinput import filename
 import os, sys, math, re, requests, json
 from io import StringIO
 import boto3
diff --git a/dlx_dl/scripts/sync/__init__.py b/dlx_dl/scripts/sync/__init__.py
index 9037340..4474b4c 100644
--- a/dlx_dl/scripts/sync/__init__.py
+++ b/dlx_dl/scripts/sync/__init__.py
@@ -586,6 +586,31 @@ def normalize(x): return unicodedata.normalize('NFD', x)
 
             seen.append(field.to_mrk())
 
+    # for comparing the filenames from dl record 856 with dlx filename
+    def _get_dl_856(fn):
+        fn = export.clean_fn(fn)
+
+        # chars requiring encoding
+        fn = fn.replace('%', '%25')
+        #fn = fn.replace('^', '%5E')
+        #fn = quote(fn)
+
+        if unquote(fn) == fn:
+            fn = quote(fn)
+
+        dl_vals = [x.split('/')[-1] for x in dl_record.get_values('856', 'u')]
+
+        # remove extra chars if any
+        try:
+            dl_vals = [x[:len(fn)-fn[::-1].index('.')-1] + fn[-fn[::-1].index('.')-1:len(fn)] for x in dl_vals]
+        except ValueError:
+            pass
+        except Exception as e:
+            print(f'Error: {dlx_record.id}')
+            raise e
+
+        return dl_vals
+
     # collector tool files
     for field in dlx_record.get_fields('856'):
         if field.get_value('3') == 'Thumbnail':
@@ -605,28 +630,8 @@ def normalize(x): return unicodedata.normalize('NFD', x)
                 return export_whole_record(args, dlx_record, export_type='UPDATE')
 
             fn = url.split('/')[-1]
-            fn = export.clean_fn(fn)
-
-            # chars requiring encoding
-            fn = fn.replace('%', '%25')
-            #fn = fn.replace('^', '%5E')
-            #fn = quote(fn)
-
-            if unquote(fn) == fn:
-                fn = quote(fn)
-
-            dl_vals = [x.split('/')[-1] for x in dl_record.get_values('856', 'u')]
-
-            # remove extra chars if any
-            try:
-                dl_vals = [x[:len(fn)-fn[::-1].index('.')-1] + fn[-fn[::-1].index('.')-1:len(fn)] for x in dl_vals]
-            except ValueError:
-                pass
-            except Exception as e:
-                print(f'Error: {dlx_record.id}')
-                raise e
-
-            if fn not in dl_vals:
+
+            if fn not in _get_dl_856(fn):
                 print(f'{dlx_record.id}: FILE NOT FOUND ' + url)
                 return export_whole_record(args, dlx_record, export_type='UPDATE')
 
@@ -637,11 +642,13 @@ def normalize(x): return unicodedata.normalize('NFD', x)
     for uri in uris:
         if files := list(File.find_by_identifier(Identifier('uri', uri))):
             latest = sorted(files, key=lambda x: x.timestamp, reverse=True)[0]
-
-        else:
-            pass #print(bib.id)
+            # filename and size should be same in DL
+            fn = uri.split('/')[-1]
 
+            if fn not in _get_dl_856(fn):
+                print(f'{dlx_record.id}: FILE NOT FOUND ' + uri)
+                return export_whole_record(args, dlx_record, export_type='UPDATE')
 
 
     # official doc files
     symbols = (dlx_record.get_values('191', 'a') + dlx_record.get_values('191', 'z')) if args.type == 'bib' else []
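Note on PATCH 2/2: the filename comparison that was previously inlined in the 856 loop is factored into the local _get_dl_856 helper so the same check can be reused for file URIs found in 561. The helper percent-encodes the dlx filename the way DL stores it, then trims each DL 856 filename to the same stem length and extension before comparing. A rough standalone sketch of those two steps using only the standard library (normalize_fn and trim_to_extension are illustrative names, not part of dlx_dl):

    from urllib.parse import quote, unquote

    def normalize_fn(fn):
        # '%' always needs escaping; quote() the rest only if not already encoded
        fn = fn.replace('%', '%25')

        if unquote(fn) == fn:
            fn = quote(fn)

        return fn

    def trim_to_extension(dl_fn, fn):
        # cut the DL filename at the position of fn's last '.' and reuse fn's
        # extension, mirroring the "remove extra chars if any" step above
        dot = len(fn) - fn[::-1].index('.') - 1
        return dl_fn[:dot] + fn[dot:]

    # trim_to_extension('report-EN_old.pdf', 'report-EN.pdf') -> 'report-EN.pdf'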