Skip to content

Commit

Permalink
Merge pull request #83 from dag-hammarskjold-library/jb/82
Browse files Browse the repository at this point in the history
Use 561 to look for files
  • Loading branch information
aaronhelton authored Apr 15, 2024
2 parents 4be1325 + a34b022 commit 0b782d0
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 23 deletions.
31 changes: 31 additions & 0 deletions dlx_dl/scripts/export/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,7 @@ def process_bib(bib, *, blacklisted, files_only):

bib.delete_field('005')
bib = _035(bib)
bib = _561(bib)
bib = _856(bib)

if bib.get_value('980', 'a') == 'DELETED':
Expand Down Expand Up @@ -381,6 +382,34 @@ def _035(record):

return record

def _561(bib):
    """Add an FFT field to ``bib`` for every 561$u URI that matches a file.

    Each URI is looked up with ``File.find_by_identifier``. When files are
    found, the one with the greatest ``timestamp`` is used: its ``uri`` goes
    into FFT$a and its cleaned filename into FFT$n. If the file has no
    filename, the last path segment of the 561 URI is used instead.

    Returns the same ``bib`` object with the new fields appended.
    """
    used_bases = set()
    counter = 0

    for uri in bib.get_values('561', 'u'):
        matches = list(File.find_by_identifier(Identifier('uri', uri)))

        if not matches:
            # nothing stored under this URI, so there is no FFT to create
            continue

        newest = max(matches, key=lambda f: f.timestamp)

        fft_field = Datafield('FFT', record_type='bib')
        fft_field.set('a', newest.uri)

        filename = clean_fn(newest.filename or uri.split('/')[-1])
        *stem_parts, extension = filename.split('.')
        stem = ''.join(stem_parts)

        if stem in used_bases:
            # files can't have the same base name regardless of extension,
            # so disambiguate with the running count of FFTs added so far
            filename = f'{stem}_{counter}.{extension}'
        else:
            used_bases.add(stem)

        fft_field.set('n', filename)
        bib.fields.append(fft_field)
        counter += 1

    return bib

def _856(bib):
place = len(bib.get_fields('FFT'))
seen = []
Expand All @@ -390,6 +419,7 @@ def _856(bib):
parsed = urlparse(url)

if parsed.netloc in WHITELIST:
# whitelist contains domains of file urls to create FFTs from
url_path = parsed.path.rstrip()

if unquote(url_path) == url_path:
Expand All @@ -402,6 +432,7 @@ def _856(bib):
base = ''.join(parts[0:-1])

if base in seen:
# files can't have the same base name regardless of extension
ext = parts[-1]
new_fn = f'{base}_{place}.{ext}'
else:
Expand Down
65 changes: 42 additions & 23 deletions dlx_dl/scripts/sync/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@ def get_records_by_date(cls, date_from, date_to=None, delete_only=False):
else {'updated': criteria}

# sort to ensure latest updates are checked first
rset = cls.from_query(query, sort=[('updated', DESC)])
rset = cls.from_query(query, sort=[('updated', DESC)], collation=Config.marc_index_default_collation)

return rset

Expand Down Expand Up @@ -378,10 +378,10 @@ def get_records(args, log=None, queue=None):
records = cls.from_query({'_id': {'$in': ids}})
elif args.query:
query = args.query.replace('\'', '"')
records = cls.from_query(json.loads(query))
records = cls.from_query(json.loads(query), collation=Config.marc_index_default_collation)
elif args.querystring:
query = Query.from_string(args.querystring, record_type=args.type)
records = cls.from_query(query)
records = cls.from_query(query, collation=Config.marc_index_default_collation)
else:
raise Exception('One of the criteria arguments is required')

Expand Down Expand Up @@ -586,6 +586,31 @@ def normalize(x): return unicodedata.normalize('NFD', x)

seen.append(field.to_mrk())

# for comparing the filenames from dl record 856 with dlx filename
def _get_dl_856(fn):
    """Return the DL record's 856$u filenames, adjusted for comparison with ``fn``.

    ``fn`` is cleaned with ``export.clean_fn`` and percent-encoded. Every
    856$u value on ``dl_record`` is reduced to its last path segment, cut at
    the position of the last dot in the encoded ``fn``, and given the encoded
    name's extension. This strips any extra trailing characters from the DL
    filename. If the encoded name contains no dot, the DL filenames are
    returned unmodified.

    ``dl_record`` and ``dlx_record`` are read from the enclosing scope.
    """
    # literal percent signs have to be encoded first
    encoded = export.clean_fn(fn).replace('%', '%25')

    # quote() is applied only when unquoting leaves the name unchanged
    if unquote(encoded) == encoded:
        encoded = quote(encoded)

    names = [value.split('/')[-1] for value in dl_record.get_values('856', 'u')]

    # remove extra chars if any
    try:
        dot = encoded.rindex('.')
        names = [name[:dot] + encoded[dot:] for name in names]
    except ValueError:
        # no dot in the encoded name: keep the DL filenames as they are
        pass
    except Exception as e:
        print(f'Error: {dlx_record.id}')
        raise e

    return names

# collector tool files
for field in dlx_record.get_fields('856'):
if field.get_value('3') == 'Thumbnail':
Expand All @@ -605,32 +630,26 @@ def normalize(x): return unicodedata.normalize('NFD', x)
return export_whole_record(args, dlx_record, export_type='UPDATE')

fn = url.split('/')[-1]
fn = export.clean_fn(fn)

# chars requiring encoding
fn = fn.replace('%', '%25')
#fn = fn.replace('^', '%5E')
#fn = quote(fn)

if fn not in _get_dl_856(fn):
print(f'{dlx_record.id}: FILE NOT FOUND ' + url)

if unquote(fn) == fn:
fn = quote(fn)
return export_whole_record(args, dlx_record, export_type='UPDATE')

dl_vals = [x.split('/')[-1] for x in dl_record.get_values('856', 'u')]
# records with file URI in 561
uris = dlx_record.get_values('561', 'u')

# remove extra chars if any
try:
dl_vals = [x[:len(fn)-fn[::-1].index('.')-1] + fn[-fn[::-1].index('.')-1:len(fn)] for x in dl_vals]
except ValueError:
pass
except Exception as e:
print(f'Error: {dlx_record.id}')
raise e
for uri in uris:
if files := list(File.find_by_identifier(Identifier('uri', uri))):
latest = sorted(files, key=lambda x: x.timestamp, reverse=True)[0]
# filename and size should be same in DL
fn = uri.split('/')[-1]

if fn not in dl_vals:
print(f'{dlx_record.id}: FILE NOT FOUND ' + url)
if fn not in _get_dl_856(fn):
print(f'{dlx_record.id}: FILE NOT FOUND ' + uri)

return export_whole_record(args, dlx_record, export_type='UPDATE')

# official doc files
symbols = (dlx_record.get_values('191', 'a') + dlx_record.get_values('191', 'z')) if args.type == 'bib' else []
#symbols = dlx_record.get_values('191', 'a') if args.type == 'bib' else []
Expand Down

0 comments on commit 0b782d0

Please sign in to comment.