Skip to content

Commit

Permalink
Moves get_pdf out of ensure_pdf
Browse files Browse the repository at this point in the history
  • Loading branch information
bdc34 committed Oct 10, 2023
1 parent 6972fb7 commit 065299d
Showing 1 changed file with 27 additions and 20 deletions.
47 changes: 27 additions & 20 deletions script/sync_prod_to_gcp/sync_published_to_gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,30 @@ def path_to_bucket_key(pdf) -> str:


@retry.Retry(predicate=retry.if_exception_type(PDF_RETRY_EXCEPTIONS))
def get_pdf(session, pdf_url) -> None:
    """GET ``pdf_url`` and raise unless the server answers with HTTP 200.

    The response body is read to the end and discarded; only the status
    code is inspected. The call is wrapped in ``retry.Retry`` so that
    exceptions matching ``PDF_RETRY_EXCEPTIONS`` trigger another attempt.
    NOTE(review): presumably ``Overloaded503Exception`` is one of
    ``PDF_RETRY_EXCEPTIONS`` -- confirm against the module constants.

    Args:
        session: Object exposing a requests-style
            ``get(url, headers=..., stream=..., verify=...)`` method
            (presumably a ``requests.Session`` -- confirm with callers).
        pdf_url: URL of the PDF to request.

    Raises:
        Overloaded503Exception: The server answered with status 503.
        Exception: The server answered with any other non-200 status.
    """
    headers = {'User-Agent': ENSURE_UA}
    logger.debug("Getting %s", pdf_url)
    resp = session.get(pdf_url, headers=headers, stream=True, verify=ENSURE_CERT_VERIFY)

    # Consume resp in hopes of keeping alive session. Iterate and discard
    # instead of building a list so the whole body is never held in memory.
    for _ in resp.iter_lines():
        pass

    status = resp.status_code
    log_extra = {CATEGORY: "download", "url": pdf_url, "status_code": status}

    if status == 503:
        msg = f"ensure_pdf: GET status 503, server overloaded {pdf_url}"
        logger.warning(msg, extra=log_extra)
        raise Overloaded503Exception(msg)

    if status != 200:
        msg = f"ensure_pdf: GET status {status} {pdf_url}"
        logger.warning(msg, extra=log_extra)
        raise Exception(msg)

    logger.info(f"ensure_pdf: Success GET status {status} {pdf_url}", extra=log_extra)


def ensure_pdf(session, host, arxiv_id):
"""Ensures PDF exists for arxiv_id.
Expand All @@ -309,47 +333,30 @@ def ensure_pdf(session, host, arxiv_id):
archive = ('arxiv' if not arxiv_id.is_old_id else arxiv_id.archive)
pdf_file = Path(f"{PS_CACHE_PREFIX}/{archive}/pdf/{arxiv_id.yymm}/{arxiv_id.filename}v{arxiv_id.version}.pdf")
url = f"https://{host}/pdf/{arxiv_id.filename}v{arxiv_id.version}.pdf"

start = perf_counter()

if pdf_file.exists():
logger.debug(f"ensure_file_url_exists: {str(pdf_file)} already exists")
return pdf_file, url, "already exists", ms_since(start)

start = perf_counter()
headers = {'User-Agent': ENSURE_UA}
logger.debug("Getting %s", url)
resp = session.get(url, headers=headers, stream=True, verify=ENSURE_CERT_VERIFY)
# noinspection PyStatementEffect
[line for line in resp.iter_lines()] # Consume resp in hopes of keeping alive session
if resp.status_code == 503:
msg = f"ensure_pdf: GET status 503, server overloaded {url}"
logger.warning(msg,
extra={CATEGORY: "download",
"url": url, "status_code": resp.status_code, "pdf_file": str(pdf_file)})
raise Overloaded503Exception(msg)
if resp.status_code != 200:
msg = f"ensure_pdf: GET status {resp.status_code} {url}"
logger.warning(msg,
extra={CATEGORY: "download",
"url": url, "status_code": resp.status_code, "pdf_file": str(pdf_file)})
raise (Exception(msg))
get_pdf(session, url)
start_wait = perf_counter()
while not pdf_file.exists():
if perf_counter() - start_wait > PDF_WAIT_SEC:
msg = f"No PDF, waited longer than {PDF_WAIT_SEC} sec {url}"
logger.warning(msg,
extra={CATEGORY: "download",
"url": url, "pdf_file": str(pdf_file)})
raise (WaitTimeout(msg))
raise (Exception(msg))
else:
sleep(0.2)
if pdf_file.exists():
logger.debug(
f"ensure_file_url_exists: {str(pdf_file)} requested {url} status_code {resp.status_code} {ms_since(start)} ms")
return pdf_file, url, None, ms_since(start)
else:
raise (NoPdfFile(f"ensure_pdf: Could not create {pdf_file}. {url} {ms_since(start)} ms"))
raise (Exception(f"ensure_pdf: Could not create {pdf_file}. {url} {ms_since(start)} ms"))


def upload_pdf(gs_client, ensure_tuple):
Expand Down

0 comments on commit 065299d

Please sign in to comment.