diff --git a/dlx_dl/scripts/export/__init__.py b/dlx_dl/scripts/export/__init__.py index 91f8325..677e63e 100644 --- a/dlx_dl/scripts/export/__init__.py +++ b/dlx_dl/scripts/export/__init__.py @@ -527,6 +527,18 @@ def _new_file_symbols(date_from, date_to=None): fft_symbols.append(idx['value']) return list(set(fft_symbols)) + +def _new_file_uris(date_from: datetime, date_to=None) -> list: + uris = [] + criteria = {'$gte': date_from} + date_to and criteria.setdefault('$lte', date_to) + + for f in DB.files.find({'$or': [{'timestamp': criteria}, {'updated': criteria}]}): + for idx in f['identifiers']: + if idx['type'] == 'uri' and idx['value'] != '' and idx['value'] != ' ' and idx['value'] != '***': # note: clean these up in db + uris.append(idx['value']) + + return list(set(uris)) def _fft_from_files(bib): symbols = bib.get_values('191', 'a') + bib.get_values('191', 'z') diff --git a/dlx_dl/scripts/sync/__init__.py b/dlx_dl/scripts/sync/__init__.py index 650a111..fe76ba9 100644 --- a/dlx_dl/scripts/sync/__init__.py +++ b/dlx_dl/scripts/sync/__init__.py @@ -248,11 +248,16 @@ def run(**kwargs): retries = 0 while response.status_code != 200: - print('retrying') + print('retrying') + if retries > 5: raise Exception(f'search API error: {response.text}') - time.sleep(5 * retries) + if retries == 0: + time.sleep(5) + else: + time.sleep(300) + retries += 1 response = requests.get(url, headers=HEADERS) @@ -354,12 +359,14 @@ def get_records_by_date(cls, date_from, date_to=None, delete_only=False): if cls == BibSet and not delete_only: fft_symbols = export._new_file_symbols(date_from, date_to) - if len(fft_symbols) > 10000: + if len(fft_symbols) > 100_000: raise Exception('that\'s too many file symbols to look up, sorry :(') print(f'found files for {len(fft_symbols)} symbols') else: fft_symbols = None + + fft_uris = export._new_file_uris(date_from, date_to) if date_to: criteria = {'$and': [{'updated': {'$gte': date_from}}, {'updated': {'$lte': date_to}}]} @@ -368,10 +375,15 @@ def get_records_by_date(cls, date_from, date_to=None, delete_only=False): criteria = {'updated': {'$gte': date_from}} history_criteria = {'deleted.time': {'$gte': date_from}, 'deleted.user': {'$ne': 'HZN'}} - history_criteria - - if cls == BibSet and fft_symbols: - query = {'$or': [criteria, {'191.subfields.value': {'$in': fft_symbols}}]} + if cls == BibSet: + if fft_symbols: + query = { + '$or': [ + criteria, + {'191.subfields.value': {'$in': fft_symbols}}, + {'561.subfields.value': {'$in': fft_uris}}, + ] + } else: query = criteria