Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Look for new files by uri #165

Merged
merged 2 commits into from
Nov 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions dlx_dl/scripts/export/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -527,6 +527,18 @@ def _new_file_symbols(date_from, date_to=None):
fft_symbols.append(idx['value'])

return list(set(fft_symbols))

def _new_file_uris(date_from: datetime, date_to=None) -> list:
    """Return the distinct URI identifiers of files added or updated in a date range.

    Queries ``DB.files`` for documents whose ``timestamp`` or ``updated``
    field is on or after ``date_from`` (and on or before ``date_to`` when one
    is given), then collects the ``value`` of every identifier whose ``type``
    is ``'uri'``.

    Args:
        date_from: Inclusive lower bound applied to both date fields.
        date_to: Optional inclusive upper bound. When falsy, no upper bound
            is applied.

    Returns:
        A list of unique URI values, in no particular order.
    """
    criteria = {'$gte': date_from}

    if date_to:
        criteria['$lte'] = date_to

    # Placeholder values that are stored as URIs but carry no information.
    # note: clean these up in db
    placeholders = ('', ' ', '***')

    # The same date window is applied to either field.
    query = {'$or': [{'timestamp': criteria}, {'updated': criteria}]}

    # Collecting into a set deduplicates URIs shared by several files.
    uris = {
        idx['value']
        for f in DB.files.find(query)
        for idx in f['identifiers']
        if idx['type'] == 'uri' and idx['value'] not in placeholders
    }

    return list(uris)

def _fft_from_files(bib):
symbols = bib.get_values('191', 'a') + bib.get_values('191', 'z')
Expand Down
26 changes: 19 additions & 7 deletions dlx_dl/scripts/sync/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,11 +248,16 @@ def run(**kwargs):
retries = 0

while response.status_code != 200:
print('retrying')
print('retrying')

if retries > 5:
raise Exception(f'search API error: {response.text}')

time.sleep(5 * retries)
if retries == 0:
time.sleep(5)
else:
time.sleep(300)

retries += 1
response = requests.get(url, headers=HEADERS)

Expand Down Expand Up @@ -354,12 +359,14 @@ def get_records_by_date(cls, date_from, date_to=None, delete_only=False):
if cls == BibSet and not delete_only:
fft_symbols = export._new_file_symbols(date_from, date_to)

if len(fft_symbols) > 10000:
if len(fft_symbols) > 100_000:
raise Exception('that\'s too many file symbols to look up, sorry :(')

print(f'found files for {len(fft_symbols)} symbols')
else:
fft_symbols = None

fft_uris = export._new_file_uris(date_from, date_to)

if date_to:
criteria = {'$and': [{'updated': {'$gte': date_from}}, {'updated': {'$lte': date_to}}]}
Expand All @@ -368,10 +375,15 @@ def get_records_by_date(cls, date_from, date_to=None, delete_only=False):
criteria = {'updated': {'$gte': date_from}}
history_criteria = {'deleted.time': {'$gte': date_from}, 'deleted.user': {'$ne': 'HZN'}}

history_criteria

if cls == BibSet and fft_symbols:
query = {'$or': [criteria, {'191.subfields.value': {'$in': fft_symbols}}]}
if cls == BibSet:
if fft_symbols:
query = {
'$or': [
criteria,
{'191.subfields.value': {'$in': fft_symbols}},
{'561.subfields.value': {'$in': fft_uris}},
]
}
else:
query = criteria

Expand Down
Loading