From f862b3eb73c6cc3f5b56cdac037a1db887e32a30 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Fri, 1 Nov 2024 15:44:35 +0100 Subject: [PATCH] stream request --- tests/downloads_tests.py | 9 +++++++++ trafilatura/downloads.py | 34 +++++++++++++++++++++------------- 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/tests/downloads_tests.py b/tests/downloads_tests.py index 392b8499..a65afa53 100644 --- a/tests/downloads_tests.py +++ b/tests/downloads_tests.py @@ -150,6 +150,15 @@ def test_fetch(): if HAS_PYCURL: assert _send_pycurl_request(*args) is None + # test MAX_FILE_SIZE + backup = ZERO_CONFIG.getint('DEFAULT', 'MAX_FILE_SIZE') + ZERO_CONFIG.set('DEFAULT', 'MAX_FILE_SIZE', '1') + args = ('https://httpbun.com/html', True, False, ZERO_CONFIG) + assert _send_urllib_request(*args) is None + if HAS_PYCURL: + assert _send_pycurl_request(*args) is None + ZERO_CONFIG.set('DEFAULT', 'MAX_FILE_SIZE', str(backup)) + # reset global objects again to avoid affecting other tests _reset_downloads_global_objects() diff --git a/trafilatura/downloads.py b/trafilatura/downloads.py index a57cdd08..d589b45d 100644 --- a/trafilatura/downloads.py +++ b/trafilatura/downloads.py @@ -178,7 +178,6 @@ def _send_urllib_request( if no_ssl is False: if not HTTP_POOL: HTTP_POOL = create_pool( - retries=_get_retry_strategy(config), timeout=config.getint("DEFAULT", "DOWNLOAD_TIMEOUT"), ca_certs=certifi.where() ) # cert_reqs='CERT_REQUIRED' @@ -186,29 +185,38 @@ def _send_urllib_request( else: if not NO_CERT_POOL: NO_CERT_POOL = create_pool( - retries=_get_retry_strategy(config), timeout=config.getint("DEFAULT", "DOWNLOAD_TIMEOUT"), cert_reqs="CERT_NONE" ) pool_manager = NO_CERT_POOL - # execute request - # TODO: read by streaming chunks (stream=True) - # stop downloading as soon as MAX_FILE_SIZE is reached + + # execute request, stop downloading as soon as MAX_FILE_SIZE is reached response = pool_manager.request( - "GET", url, headers=_determine_headers(config), retries=RETRY_STRATEGY + "GET", + url, + headers=_determine_headers(config), + retries=_get_retry_strategy(config), + preload_content=False ) + data = bytearray() + for chunk in response.stream(2**17): + data.extend(chunk) + if len(data) > config.getint("DEFAULT", "MAX_FILE_SIZE"): + raise ValueError("MAX_FILE_SIZE exceeded") + response.release_conn() + + # necessary for standardization + resp = Response(bytes(data), response.status, response.geturl()) + if with_headers: + resp.store_headers(response.headers) + return resp + except urllib3.exceptions.SSLError: LOGGER.warning("retrying after SSLError: %s", url) return _send_urllib_request(url, True, with_headers, config) except Exception as err: LOGGER.error("download error: %s %s", url, err) # sys.exc_info()[0] - else: - # necessary for standardization - resp = Response(response.data, response.status, response.geturl()) - if with_headers: - resp.store_headers(response.headers) - return resp - # catchall + return None