Skip to content

Commit

Permalink
stream request
Browse files (browse the repository at this point in the history)
  • Loading branch information
adbar committed Nov 1, 2024
1 parent 236dff0 commit f862b3e
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 13 deletions.
9 changes: 9 additions & 0 deletions tests/downloads_tests.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -150,6 +150,15 @@ def test_fetch():
if HAS_PYCURL:
assert _send_pycurl_request(*args) is None

# test MAX_FILE_SIZE
backup = ZERO_CONFIG.getint('DEFAULT', 'MAX_FILE_SIZE')
ZERO_CONFIG.set('DEFAULT', 'MAX_FILE_SIZE', '1')
args = ('https://httpbun.com/html', True, False, ZERO_CONFIG)
assert _send_urllib_request(*args) is None
if HAS_PYCURL:
assert _send_pycurl_request(*args) is None
ZERO_CONFIG.set('DEFAULT', 'MAX_FILE_SIZE', str(backup))

# reset global objects again to avoid affecting other tests
_reset_downloads_global_objects()

Expand Down
34 changes: 21 additions & 13 deletions trafilatura/downloads.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -178,37 +178,45 @@ def _send_urllib_request(
if no_ssl is False:
if not HTTP_POOL:
HTTP_POOL = create_pool(
retries=_get_retry_strategy(config),
timeout=config.getint("DEFAULT", "DOWNLOAD_TIMEOUT"),
ca_certs=certifi.where()
) # cert_reqs='CERT_REQUIRED'
pool_manager = HTTP_POOL
else:
if not NO_CERT_POOL:
NO_CERT_POOL = create_pool(
retries=_get_retry_strategy(config),
timeout=config.getint("DEFAULT", "DOWNLOAD_TIMEOUT"),
cert_reqs="CERT_NONE"
)
pool_manager = NO_CERT_POOL
# execute request
# TODO: read by streaming chunks (stream=True)
# stop downloading as soon as MAX_FILE_SIZE is reached

# execute request, stop downloading as soon as MAX_FILE_SIZE is reached
response = pool_manager.request(
"GET", url, headers=_determine_headers(config), retries=RETRY_STRATEGY
"GET",
url,
headers=_determine_headers(config),
retries=_get_retry_strategy(config),
preload_content=False
)
data = bytearray()
for chunk in response.stream(2**17):
data.extend(chunk)
if len(data) > config.getint("DEFAULT", "MAX_FILE_SIZE"):
raise ValueError("MAX_FILE_SIZE exceeded")
response.release_conn()

# necessary for standardization
resp = Response(bytes(data), response.status, response.geturl())
if with_headers:
resp.store_headers(response.headers)
return resp

except urllib3.exceptions.SSLError:
LOGGER.warning("retrying after SSLError: %s", url)
return _send_urllib_request(url, True, with_headers, config)
except Exception as err:
LOGGER.error("download error: %s %s", url, err) # sys.exc_info()[0]
else:
# necessary for standardization
resp = Response(response.data, response.status, response.geturl())
if with_headers:
resp.store_headers(response.headers)
return resp
# catchall

return None


Expand Down

0 comments on commit f862b3e

Please sign in to comment.