From a6e0df60716fa60f1f7501176d08dec7fa9f067f Mon Sep 17 00:00:00 2001
From: "J. Bukhari"
Date: Fri, 29 Mar 2024 13:19:03 -0400
Subject: [PATCH 1/2] check for files using 561 and create FFT

---
 dlx_dl/scripts/export/__init__.py | 32 +++++++++++++++++++++++++++++++
 dlx_dl/scripts/sync/__init__.py   | 18 ++++++++++++++---
 2 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/dlx_dl/scripts/export/__init__.py b/dlx_dl/scripts/export/__init__.py
index 35dc1c4..6a1ff36 100644
--- a/dlx_dl/scripts/export/__init__.py
+++ b/dlx_dl/scripts/export/__init__.py
@@ -1,3 +1,4 @@
+from fileinput import filename
 import os, sys, math, re, requests, json
 from io import StringIO
 import boto3
@@ -328,6 +329,7 @@ def process_bib(bib, *, blacklisted, files_only):
         bib.delete_field('005')
 
     bib = _035(bib)
+    bib = _561(bib)
     bib = _856(bib)
 
     if bib.get_value('980', 'a') == 'DELETED':
@@ -381,6 +383,34 @@ def _035(record):
 
     return record
 
+def _561(bib):
+    uris = bib.get_values('561', 'u')
+    place, seen = 0, []
+
+    for uri in uris:
+        if files := list(File.find_by_identifier(Identifier('uri', uri))):
+            latest = sorted(files, key=lambda x: x.timestamp, reverse=True)[0]
+            _fft = Datafield('FFT', record_type='bib')
+            _fft.set('a', latest.uri)
+
+            old_fn = latest.filename if latest.filename else uri.split('/')[-1]
+            new_fn = clean_fn(old_fn)
+            parts = new_fn.split('.')
+            base = ''.join(parts[0:-1])
+
+            if base in seen:
+                # files can't have the same base name regardless of extension
+                ext = parts[-1]
+                new_fn = f'{base}_{place}.{ext}'
+            else:
+                seen.append(base)
+
+            _fft.set('n', new_fn)
+            bib.fields.append(_fft)
+            place += 1
+
+    return bib
+
 def _856(bib):
     place = len(bib.get_fields('FFT'))
     seen = []
@@ -390,6 +420,7 @@ def _856(bib):
         parsed = urlparse(url)
 
         if parsed.netloc in WHITELIST:
+            # whitelist contains domains of file urls to create FFTs from
             url_path = parsed.path.rstrip()
 
             if unquote(url_path) == url_path:
@@ -402,6 +433,7 @@ def _856(bib):
                 base = ''.join(parts[0:-1])
 
                 if base in seen:
+                    # files can't have the same base name regardless of extension
                     ext = parts[-1]
                     new_fn = f'{base}_{place}.{ext}'
                 else:
diff --git a/dlx_dl/scripts/sync/__init__.py b/dlx_dl/scripts/sync/__init__.py
index 9cf5943..9037340 100644
--- a/dlx_dl/scripts/sync/__init__.py
+++ b/dlx_dl/scripts/sync/__init__.py
@@ -310,7 +310,7 @@ def get_records_by_date(cls, date_from, date_to=None, delete_only=False):
             else {'updated': criteria}
 
         # sort to ensure latest updates are checked first
-        rset = cls.from_query(query, sort=[('updated', DESC)])
+        rset = cls.from_query(query, sort=[('updated', DESC)], collation=Config.marc_index_default_collation)
 
         return rset
 
@@ -378,10 +378,10 @@ def get_records(args, log=None, queue=None):
         records = cls.from_query({'_id': {'$in': ids}})
     elif args.query:
         query = args.query.replace('\'', '"')
-        records = cls.from_query(json.loads(query))
+        records = cls.from_query(json.loads(query), collation=Config.marc_index_default_collation)
     elif args.querystring:
         query = Query.from_string(args.querystring, record_type=args.type)
-        records = cls.from_query(query)
+        records = cls.from_query(query, collation=Config.marc_index_default_collation)
     else:
         raise Exception('One of the criteria arguments is required')
 
@@ -631,6 +631,18 @@ def normalize(x): return unicodedata.normalize('NFD', x)
 
                     return export_whole_record(args, dlx_record, export_type='UPDATE')
 
+    # records with file URI in 561
+    uris = dlx_record.get_values('561', 'u')
+
+    for uri in uris:
+        if files := list(File.find_by_identifier(Identifier('uri', uri))):
+            latest = sorted(files, key=lambda x: x.timestamp, reverse=True)[0]
+
+        else:
+            pass #print(bib.id)
+
+
+
     # official doc files
     symbols = (dlx_record.get_values('191', 'a') + dlx_record.get_values('191', 'z')) if args.type == 'bib' else []
     #symbols = dlx_record.get_values('191', 'a') if args.type == 'bib' else []
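Note on PATCH 1/2: the new _561 helper follows the same pattern as _856. For each 561 $u URI it looks up the stored files, takes the most recently timestamped one, and appends an FFT field whose filename is cleaned and deduplicated so that no two attachments on the record share a base name, regardless of extension. The dlx-specific classes (File, Identifier, Datafield, clean_fn) are omitted below; this is only a minimal sketch of the base-name deduplication step, with dedupe_filenames as a hypothetical stand-in name.

    def dedupe_filenames(filenames):
        """Ensure no two files keep the same base name, regardless of extension."""
        seen, out = [], []

        for place, fn in enumerate(filenames):
            parts = fn.split('.')
            base, ext = ''.join(parts[:-1]), parts[-1]

            if base in seen:
                # suffix a counter so e.g. report.pdf and report.docx don't collide
                fn = f'{base}_{place}.{ext}'
            else:
                seen.append(base)

            out.append(fn)

        return out

    # dedupe_filenames(['report.pdf', 'report.docx', 'annex.pdf'])
    # -> ['report.pdf', 'report_1.docx', 'annex.pdf']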
From a34b022fd14a33cb5925eb23315550435fedfb09 Mon Sep 17 00:00:00 2001
From: "J. Bukhari"
Date: Fri, 29 Mar 2024 14:41:05 -0400
Subject: [PATCH 2/2] check for files using 561

---
 dlx_dl/scripts/export/__init__.py |  1 -
 dlx_dl/scripts/sync/__init__.py   | 57 +++++++++++++++++--------------
 2 files changed, 32 insertions(+), 26 deletions(-)

diff --git a/dlx_dl/scripts/export/__init__.py b/dlx_dl/scripts/export/__init__.py
index 6a1ff36..100c50f 100644
--- a/dlx_dl/scripts/export/__init__.py
+++ b/dlx_dl/scripts/export/__init__.py
@@ -1,4 +1,3 @@
-from fileinput import filename
 import os, sys, math, re, requests, json
 from io import StringIO
 import boto3
diff --git a/dlx_dl/scripts/sync/__init__.py b/dlx_dl/scripts/sync/__init__.py
index 9037340..4474b4c 100644
--- a/dlx_dl/scripts/sync/__init__.py
+++ b/dlx_dl/scripts/sync/__init__.py
@@ -586,6 +586,31 @@ def normalize(x): return unicodedata.normalize('NFD', x)
 
             seen.append(field.to_mrk())
 
+    # for comparing the filenames from dl record 856 with dlx filename
+    def _get_dl_856(fn):
+        fn = export.clean_fn(fn)
+
+        # chars requiring encoding
+        fn = fn.replace('%', '%25')
+        #fn = fn.replace('^', '%5E')
+        #fn = quote(fn)
+
+        if unquote(fn) == fn:
+            fn = quote(fn)
+
+        dl_vals = [x.split('/')[-1] for x in dl_record.get_values('856', 'u')]
+
+        # remove extra chars if any
+        try:
+            dl_vals = [x[:len(fn)-fn[::-1].index('.')-1] + fn[-fn[::-1].index('.')-1:len(fn)] for x in dl_vals]
+        except ValueError:
+            pass
+        except Exception as e:
+            print(f'Error: {dlx_record.id}')
+            raise e
+
+        return dl_vals
+
     # collector tool files
     for field in dlx_record.get_fields('856'):
         if field.get_value('3') == 'Thumbnail':
@@ -605,28 +630,8 @@ def normalize(x): return unicodedata.normalize('NFD', x)
                 return export_whole_record(args, dlx_record, export_type='UPDATE')
 
             fn = url.split('/')[-1]
-            fn = export.clean_fn(fn)
-
-            # chars requiring encoding
-            fn = fn.replace('%', '%25')
-            #fn = fn.replace('^', '%5E')
-            #fn = quote(fn)
-
-            if unquote(fn) == fn:
-                fn = quote(fn)
-
-            dl_vals = [x.split('/')[-1] for x in dl_record.get_values('856', 'u')]
-
-            # remove extra chars if any
-            try:
-                dl_vals = [x[:len(fn)-fn[::-1].index('.')-1] + fn[-fn[::-1].index('.')-1:len(fn)] for x in dl_vals]
-            except ValueError:
-                pass
-            except Exception as e:
-                print(f'Error: {dlx_record.id}')
-                raise e
-
-            if fn not in dl_vals:
+
+            if fn not in _get_dl_856(fn):
                 print(f'{dlx_record.id}: FILE NOT FOUND ' + url)
                 return export_whole_record(args, dlx_record, export_type='UPDATE')
 
@@ -637,11 +642,13 @@ def normalize(x): return unicodedata.normalize('NFD', x)
     for uri in uris:
         if files := list(File.find_by_identifier(Identifier('uri', uri))):
             latest = sorted(files, key=lambda x: x.timestamp, reverse=True)[0]
-
-        else:
-            pass #print(bib.id)
+            # filename and size should be same in DL
+            fn = uri.split('/')[-1]
 
+            if fn not in _get_dl_856(fn):
+                print(f'{dlx_record.id}: FILE NOT FOUND ' + uri)
+                return export_whole_record(args, dlx_record, export_type='UPDATE')
 
 
     # official doc files
     symbols = (dlx_record.get_values('191', 'a') + dlx_record.get_values('191', 'z')) if args.type == 'bib' else []
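Note on PATCH 2/2: the filename comparison that was previously inlined in the 856 loop is factored into the local _get_dl_856 helper so the same check can be reused for file URIs found in 561. The helper percent-encodes the dlx filename the way DL stores it, then trims each DL 856 filename to the same stem length and extension before comparing. A rough standalone sketch of those two steps using only the standard library (normalize_fn and trim_to_extension are illustrative names, not part of dlx_dl):

    from urllib.parse import quote, unquote

    def normalize_fn(fn):
        # '%' always needs escaping; quote() the rest only if not already encoded
        fn = fn.replace('%', '%25')

        if unquote(fn) == fn:
            fn = quote(fn)

        return fn

    def trim_to_extension(dl_fn, fn):
        # cut the DL filename at the position of fn's last '.' and reuse fn's
        # extension, mirroring the "remove extra chars if any" step above
        dot = len(fn) - fn[::-1].index('.') - 1
        return dl_fn[:dot] + fn[dot:]

    # trim_to_extension('report-EN_old.pdf', 'report-EN.pdf') -> 'report-EN.pdf'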