Compare number of files in each system #175

Merged · 7 commits · Dec 20, 2024
3 changes: 3 additions & 0 deletions dlx_dl/scripts/export/__init__.py
@@ -460,6 +460,9 @@ def _980(record):
    if atag == '110':
        if record.heading_field.get_value('9') == 'ms':
            record.set('980', 'a', 'MEMBER', address=['+'])
+   elif atag == '150':
+       if record.heading_field.indicators[0] == "9" or 'http://metadata.un.org/thesaurus' not in record.get_values('035', 'a'):
+           record.set('980', 'a', 'GEOGRAPHIC')

    return record
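A minimal sketch of the new 150 branch, using hypothetical stub classes in place of a dlx Auth record (StubHeadingField and StubRecord are invented; only the attributes the branch touches are modeled):

```python
# Hypothetical stand-ins for a dlx auth record and its heading field.
class StubHeadingField:
    indicators = ['9', ' ']  # first indicator 9 marks the heading

class StubRecord:
    heading_field = StubHeadingField()

    def get_values(self, tag, code):
        # 035 $a values; no bare thesaurus URI present on this record
        return ['(DHLAUTH)123456'] if tag == '035' else []

    def set(self, tag, code, value, address=None):
        print(f'set {tag} ${code} = {value}')

record, atag = StubRecord(), '150'

# mirrors the added branch in _980
if atag == '150':
    if record.heading_field.indicators[0] == '9' \
            or 'http://metadata.un.org/thesaurus' not in record.get_values('035', 'a'):
        record.set('980', 'a', 'GEOGRAPHIC')  # prints: set 980 $a = GEOGRAPHIC
```

Note that the `not in` test is an exact match against the list of 035 $a values, not a substring search, so a value like 'http://metadata.un.org/thesaurus/1234567' would not count as a thesaurus URI here.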

45 changes: 35 additions & 10 deletions dlx_dl/scripts/sync/__init__.py
@@ -36,10 +36,10 @@ def get_args(**kwargs):
    parser.add_argument('--modified_since_log', action='store_true')
    parser.add_argument('--limit', help='limit the number of exports', type=int, default=1000)
    parser.add_argument('--time_limit', help='runtime limit in seconds', type=int, default=600)
-   parser.add_argument('--queue', action='store_true', help='try to export ercords in queue and add to queue if export exceeds limits')
+   parser.add_argument('--queue', action='store_true', help='try to export records in queue and add to queue if export exceeds limits')
    parser.add_argument('--delete_only', action='store_true')
    parser.add_argument('--use_auth_cache', action='store_true')
    parser.add_argument('--use_api', action='store_true')
+   parser.add_argument('--missing_only', action='store_true')

    r = parser.add_argument_group('required')
    r.add_argument('--source', required=True, help='an identity to use in the log')
@@ -258,15 +258,16 @@ def run(**kwargs) -> int:
    retries = 0

    while response.status_code != 200:
-       print('retrying')
+       print(f'retrying: {url}\n{response.text}')

+       if retries > 5:
+           raise Exception(f'search API error: {response.text}')

-       if retries == 0:
-           time.sleep(5)

-       if 'Max 100 requests per 5 minutes' in json.loads(response.text).get('error'):
-           print('API rate limit exceeded. waiting 5 minutes')
-           time.sleep(310)
-       else:
-           time.sleep(300)
+       time.sleep((retries if retries else 1) * 5)

        retries += 1
        response = requests.get(url, headers=HEADERS)
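The rewritten loop swaps the fixed five-minute waits for a linear backoff: it sleeps (retries if retries else 1) * 5 seconds each pass and gives up after five retries. A dry run of the schedule, assuming the same control flow as the diff (no network calls):

```python
# Prints the sleep schedule the new retry loop would produce.
for retries in range(7):
    if retries > 5:
        print(f'retry {retries}: raise search API error')
        break

    print(f'retry {retries}: sleep {(retries if retries else 1) * 5}s')

# retry 0 sleeps 5s, then 5, 10, 15, 20, 25 seconds:
# roughly 80 seconds of waiting before the exception is raised.
```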
@@ -305,6 +306,12 @@ def run(**kwargs) -> int:

    # remove from queue
    to_remove.append(dlx_record.id)

+   # end here if only adding missing records
+   if args.missing_only:
+       # clear batch
+       BATCH = []
+       continue
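For orientation, a toy walk-through of the --missing_only gate with invented record ids (in the real sync the gate sits inside the batch loop above, after missing records have been queued for export):

```python
# Invented ids: 102 exists in DLX but not in DL.
missing_only = True
dl_ids = {101, 103}
BATCH = []

for rid in [101, 102, 103]:
    if rid not in dl_ids:
        print(f'queue missing record {rid} for export')  # only 102

    if missing_only:
        BATCH = []  # clear anything batched for comparison
        continue    # skip the field-by-field comparison entirely

    print(f'compare DL and DLX fields for record {rid}')
```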

    # scan and compare DL records
    for dl_record in DL_BATCH:
@@ -672,12 +679,19 @@ def _get_dl_856(fn):

        return export_whole_record(args, dlx_record, export_type='UPDATE')

+   # for comparing number of files in each system
+   all_dlx_files = []

    # records with file URI in 561
    uris = dlx_record.get_values('561', 'u')

    for uri in uris:
        if files := list(File.find_by_identifier(Identifier('uri', uri))):
            latest = sorted(files, key=lambda x: x.timestamp, reverse=True)[0]

+           if latest.id not in [x.id for x in all_dlx_files]:
+               all_dlx_files.append(latest)

            # filename and size should be same in DL
            fn = uri.split('/')[-1]
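The loop above keeps only the newest file per 561 URI and dedupes by file id. A self-contained sketch of that pattern, with SimpleNamespace standing in for the dlx File objects (URIs, ids, and timestamps are invented):

```python
from types import SimpleNamespace

# The same file (id 2) is attached under two URIs; it should be counted once.
files_by_uri = {
    'https://example.org/doc-EN.pdf': [SimpleNamespace(id=1, timestamp=1),
                                       SimpleNamespace(id=2, timestamp=5)],
    'https://example.org/doc-FR.pdf': [SimpleNamespace(id=2, timestamp=5)],
}

all_dlx_files = []

for uri, files in files_by_uri.items():
    # newest file attached under this URI
    latest = sorted(files, key=lambda x: x.timestamp, reverse=True)[0]

    if latest.id not in [x.id for x in all_dlx_files]:
        all_dlx_files.append(latest)

print([f.id for f in all_dlx_files])  # [2]
```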

@@ -690,11 +704,13 @@ def _get_dl_856(fn):
    symbols = (dlx_record.get_values('191', 'a') + dlx_record.get_values('191', 'z')) if args.type == 'bib' else []

    for symbol in set(symbols):
-       if symbol == '' or symbol == ' ' or symbol == '***': # note: clean these up in db
-           continue

+       if symbol == '' or symbol == ' ' or symbol == '***': continue # note: clean these up in db

        for lang in ('AR', 'ZH', 'EN', 'FR', 'RU', 'ES', 'DE'):
            if f := File.latest_by_identifier_language(Identifier('symbol', symbol), lang):
+               if f.id not in [x.id for x in all_dlx_files]:
+                   all_dlx_files.append(f)

                field = next(filter(lambda x: re.search(fr'{lang}\.\w+$', x.get_value('u')), dl_record.get_fields('856')), None)

                if field:
@@ -713,6 +729,15 @@ def _get_dl_856(fn):

        return export_whole_record(args, dlx_record, export_type='UPDATE')

+   # check if there is a different number of files in DL than in DLX
+   dl_files = [x for x in dl_record.get_fields('856') if re.match(r'http[s]?://digitallibrary.un.org', x.get_value('u'))]
+   # files that came from whitelisted 856 urls are not currently in the dlx filestore
+   dl_file_count = len(dl_files) - len([x for x in dlx_record.get_fields('856') if urlparse(x.get_value('u')).netloc in export.WHITELIST])

+   if dl_file_count != len(all_dlx_files):
+       print(f'EXTRA FILES DETECTED - {[x.to_mrk() for x in dl_files]}\n{[f.to_dict() for f in all_dlx_files]}')
+       return export_whole_record(args, dlx_record, export_type='UPDATE')
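The comparison counts digitallibrary.un.org links in the DL record's 856 fields, subtracts dlx 856 links whose host is on the export whitelist (those have no file in the dlx filestore), and re-exports the whole record when the result disagrees with the deduped dlx file list. A worked example with invented URLs and whitelist; export.WHITELIST, to_mrk, and to_dict from the diff are replaced by plain strings:

```python
import re
from urllib.parse import urlparse

WHITELIST = ['undocs.org']  # hypothetical whitelisted host

# three file links on the DL side
dl_856_urls = [
    'https://digitallibrary.un.org/record/1/files/doc-EN.pdf',
    'https://digitallibrary.un.org/record/1/files/doc-FR.pdf',
    'https://digitallibrary.un.org/record/1/files/doc-ES.pdf',
]
dlx_856_urls = ['https://undocs.org/A/RES/1']  # whitelisted, not in the filestore
all_dlx_files = ['doc-EN.pdf']                 # stand-in for deduped File objects

dl_files = [u for u in dl_856_urls
            if re.match(r'http[s]?://digitallibrary.un.org', u)]
dl_file_count = len(dl_files) - len([u for u in dlx_856_urls
                                     if urlparse(u).netloc in WHITELIST])

# 3 DL links minus 1 whitelisted = 2 expected files, but dlx holds only 1
if dl_file_count != len(all_dlx_files):
    print('EXTRA FILES DETECTED: the whole record would be re-exported as UPDATE')
```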

    # run api submission
    if take_tags or delete_fields:
        record = Bib() if args.type == 'bib' else Auth()