Compare number of files in each system (#175)
* compare extra/deleted DL files

* implement --missing_only

* account for whitelist in number of files comparison
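
For context, a hypothetical invocation using the new flag. The import path and run() signature come from the diff below; how keyword arguments map onto the argparse flags is an assumption:

# Hypothetical usage sketch: sync only the records missing from DL.
# Kwarg-to-flag mapping is assumed from the get_args(**kwargs) signature.
from dlx_dl.scripts.sync import run

run(source='nightly-sync', missing_only=True)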
jbukhari authored Dec 20, 2024
1 parent 980c47a commit dc52b06
Showing 2 changed files with 38 additions and 10 deletions.
3 changes: 3 additions & 0 deletions dlx_dl/scripts/export/__init__.py
@@ -460,6 +460,9 @@ def _980(record):
if atag == '110':
if record.heading_field.get_value('9') == 'ms':
record.set('980', 'a', 'MEMBER', address=['+'])
elif atag == '150':
if record.heading_field.indicators[0] == "9" or 'http://metadata.un.org/thesaurus' not in record.get_values('035', 'a'):
record.set('980', 'a', 'GEOGRAPHIC')

return record

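For reference, a standalone restatement of the new 150 branch, as a sketch; atag is the heading tag computed by _980, and record follows the dlx Auth interface used in the diff:

def is_geographic(record, atag):
    # Sketch of the condition added above: a 150 heading is treated as
    # geographic when its first indicator is 9, or when no 035 $a value
    # is the UN thesaurus URI.
    if atag != '150':
        return False

    return (record.heading_field.indicators[0] == '9'
            or 'http://metadata.un.org/thesaurus' not in record.get_values('035', 'a'))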
45 changes: 35 additions & 10 deletions dlx_dl/scripts/sync/__init__.py
@@ -36,10 +36,10 @@ def get_args(**kwargs):
parser.add_argument('--modified_since_log', action='store_true')
parser.add_argument('--limit', help='limit the number of exports', type=int, default=1000)
parser.add_argument('--time_limit', help='runtime limit in seconds', type=int, default=600)
parser.add_argument('--queue', action='store_true', help='try to export ercords in queue and add to queue if export exceeds limits')
parser.add_argument('--queue', action='store_true', help='try to export records in queue and add to queue if export exceeds limits')
parser.add_argument('--delete_only', action='store_true')
parser.add_argument('--use_auth_cache', action='store_true')
parser.add_argument('--use_api', action='store_true')
parser.add_argument('--missing_only', action='store_true')

r = parser.add_argument_group('required')
r.add_argument('--source', required=True, help='an identity to use in the log')
@@ -258,15 +258,16 @@ def run(**kwargs) -> int:
retries = 0

while response.status_code != 200:
print('retrying')
print(f'retrying: {url}\n{response.text}')

if retries > 5:
raise Exception(f'search API error: {response.text}')

if retries == 0:
time.sleep(5)

if 'Max 100 requests per 5 minutes' in json.loads(response.text).get('error'):
print('API rate limit exceeded. waiting 5 minutes')
time.sleep(310)
else:
time.sleep(300)
time.sleep((retries if retries else 1) * 5)

retries += 1
response = requests.get(url, headers=HEADERS)
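
The rewritten loop drops the special-cased rate-limit waits in favor of a linear backoff. A minimal standalone sketch of the same pattern; the helper name is hypothetical, as the real code retries inline:

import time
import requests

def get_with_backoff(url, headers, max_retries=5):
    # Mirrors the inline retry loop above: an extra 5s pause before the
    # first retry, then linearly growing sleeps (5s, 5s, 10s, 15s, ...).
    response = requests.get(url, headers=headers)
    retries = 0

    while response.status_code != 200:
        print(f'retrying: {url}\n{response.text}')

        if retries > max_retries:
            raise Exception(f'search API error: {response.text}')

        if retries == 0:
            time.sleep(5)

        time.sleep((retries if retries else 1) * 5)
        retries += 1
        response = requests.get(url, headers=headers)

    return response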
@@ -305,6 +306,12 @@ def run(**kwargs) -> int:

# remove from queue
to_remove.append(dlx_record.id)

# end here if only adding missing records
if args.missing_only:
# clear batch
BATCH = []
continue

# scan and compare DL records
for dl_record in DL_BATCH:
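
In flow terms, --missing_only now ends each iteration right after the missing-record handling and discards the pending API batch, so no field-level updates are sent. A stand-in sketch, with hypothetical container and helper names:

# Stand-in sketch of the loop above; names are hypothetical.
for dlx_record in dlx_batch:
    if dlx_record.id in missing_from_dl:
        export_whole(dlx_record)         # push the absent record to DL
        to_remove.append(dlx_record.id)  # and clear it from the queue

    if args.missing_only:
        batch = []                       # drop pending field-level submissions
        continue                         # skip the DL/DLX comparison below

    compare_and_update(dlx_record)       # hypothetical comparison phase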
@@ -672,12 +679,19 @@ def _get_dl_856(fn):

return export_whole_record(args, dlx_record, export_type='UPDATE')

# for comparing number of files in each system
all_dlx_files = []

# records with file URI in 561
uris = dlx_record.get_values('561', 'u')

for uri in uris:
if files := list(File.find_by_identifier(Identifier('uri', uri))):
latest = sorted(files, key=lambda x: x.timestamp, reverse=True)[0]

if latest.id not in [x.id for x in all_dlx_files]:
all_dlx_files.append(latest)

# filename and size should be same in DL
fn = uri.split('/')[-1]

@@ -690,11 +704,13 @@ def _get_dl_856(fn):
symbols = (dlx_record.get_values('191', 'a') + dlx_record.get_values('191', 'z')) if args.type == 'bib' else []

for symbol in set(symbols):
if symbol == '' or symbol == ' ' or symbol == '***': # note: clean these up in db
continue

if symbol == '' or symbol == ' ' or symbol == '***': continue # note: clean these up in db

for lang in ('AR', 'ZH', 'EN', 'FR', 'RU', 'ES', 'DE'):
if f := File.latest_by_identifier_language(Identifier('symbol', symbol), lang):
if f.id not in [x.id for x in all_dlx_files]:
all_dlx_files.append(f)

field = next(filter(lambda x: re.search(fr'{lang}\.\w+$', x.get_value('u')), dl_record.get_fields('856')), None)

if field:
@@ -713,6 +729,15 @@ def _get_dl_856(fn):

return export_whole_record(args, dlx_record, export_type='UPDATE')

# check if there are a different number of files in DL than DLX
dl_files = [x for x in dl_record.get_fields('856') if re.match(r'http[s]?://digitallibrary.un.org', x.get_value('u'))]
# files that came from whitelisted 856 urls are not currently in the dlx filestore
dl_file_count = len(dl_files) - len([x for x in dlx_record.get_fields('856') if urlparse(x.get_value('u')).netloc in export.WHITELIST])

if dl_file_count != len(all_dlx_files):
print(f'EXTRA FILES DETECTED - {[x.to_mrk() for x in dl_files]}\n{[f.to_dict() for f in all_dlx_files]}')
return export_whole_record(args, dlx_record, export_type='UPDATE')

# run api submission
if take_tags or delete_fields:
record = Bib() if args.type == 'bib' else Auth()
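The final hunk is the core of the commit: DL's file count is checked against the DLX filestore, discounting 856 fields whose hosts are whitelisted, since those were exported as bare links and never enter the filestore. A standalone sketch of the comparison; the whitelist and the record interfaces follow their usage in the diff:

import re
from urllib.parse import urlparse

def file_counts_differ(dl_record, dlx_record, all_dlx_files, whitelist):
    # 856 fields in DL that point back at DL's own filestore
    dl_files = [f for f in dl_record.get_fields('856')
                if re.match(r'https?://digitallibrary\.un\.org', f.get_value('u'))]

    # 856s with whitelisted hosts exist in DL but not in the DLX filestore,
    # so subtract them before comparing
    whitelisted = [f for f in dlx_record.get_fields('856')
                   if urlparse(f.get_value('u')).netloc in whitelist]

    return len(dl_files) - len(whitelisted) != len(all_dlx_files)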
