timeout for fetch_url
Barbara Miller committed Feb 9, 2025
1 parent 7ededbc commit 65de0d2
Showing 1 changed file with 9 additions and 1 deletion.
brozzler/worker.py
@@ -52,6 +52,7 @@ class BrozzlerWorker:
     HEARTBEAT_INTERVAL = 200.0
     SITE_SESSION_MINUTES = 15
     HEADER_REQUEST_TIMEOUT = 30
+    FETCH_URL_TIMEOUT = 60
 
     def __init__(
         self,
@@ -334,6 +335,7 @@ def _get_page_headers(self, page):
         # bypassing warcprox, requests' stream=True defers downloading the body of the response
         # see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
         try:
+            self.logger.info("getting page headers for %s", page.url)
             with requests.get(
                 page.url, stream=True, verify=False, timeout=self.HEADER_REQUEST_TIMEOUT
             ) as r:
@@ -485,8 +487,14 @@ def _fetch_url(self, site, url=None, page=None):
         try:
             # response is ignored
             requests.get(
-                url, proxies=proxies, headers=site.extra_headers(page), verify=False
+                url,
+                proxies=proxies,
+                headers=site.extra_headers(page),
+                verify=False,
+                timeout=self.FETCH_URL_TIMEOUT,
             )
+        except requests.exceptions.Timeout as e:
+            self.logger.warning("Timed out fetching %s: %s", page.url, e)
         except requests.exceptions.ProxyError as e:
             raise brozzler.ProxyError("proxy error fetching %s" % url) from e
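
A minimal sketch (separate from the commit; the URL is a placeholder) of the behavior the new timeout relies on. requests' timeout is not a cap on total download time: it bounds connection setup and each wait between bytes read from the socket, and requests.exceptions.Timeout is the parent of both ConnectTimeout and ReadTimeout, so one handler catches either phase timing out.

import requests

FETCH_URL_TIMEOUT = 60  # seconds, mirroring the constant added above

try:
    # verify=False mirrors the worker code above; a (connect, read) tuple
    # could bound the two phases separately, e.g. timeout=(3.05, FETCH_URL_TIMEOUT)
    requests.get("https://example.com/", verify=False, timeout=FETCH_URL_TIMEOUT)
except requests.exceptions.Timeout as e:
    # catches ConnectTimeout and ReadTimeout alike
    print("timed out:", e)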

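The comment in _get_page_headers above points to requests' body-content workflow; here is a minimal sketch (again separate from the commit; the URL is a placeholder) of how stream=True defers the body download:

import requests

with requests.get("https://example.com/", stream=True, timeout=30) as r:
    # with stream=True, only the status line and headers have been read;
    # the response body stays unread on the socket until it is asked for
    print(r.status_code, r.headers.get("content-type"))
    # accessing r.content or iterating r.iter_content() would download the
    # body; leaving the with-block releases the connection without doing so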
