diff --git a/docs/changelog.md b/docs/changelog.md index 4a6e8e4..845c703 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -20,6 +20,8 @@ - Fixed the scrape crawling not correctly recognising when duplicate URLs were encountered. Previously duplicates would be included, but only one would be used. Now, they will be correctly logged. As a result of this change, the `SCRAPE_CRAWL_VERSION` has been incremented, meaning running extraction on a scrape will cause it to be re-crawled. - Fixed the return type annotation `LangPicker.get_root()`: it is now annotated as (`bs4.Tag` or `None`) instead of `bs4.PageElement`. This shouldn't be a breaking change, as the expected return type of this function was always a `Tag` object or `None`. - Type of `TranslationLink.lang` changed to reflect that it can accept a string to resolve or an already resolved `Language` instance +- Fixed downloading throwing an error stating the WordPress v2 API was not supported in other error cases +- Fixed the maximum redirects permitted not being set properly, meaning the effective value was always 30 **Documentation** diff --git a/src/wpextract/download/requestsession.py b/src/wpextract/download/requestsession.py index 38c5025..094fd7b 100644 --- a/src/wpextract/download/requestsession.py +++ b/src/wpextract/download/requestsession.py @@ -136,12 +136,6 @@ class HTTPTooManyRedirects(Exception): def _handle_status(url: str, status_code: int, n_tries: Optional[int] = None) -> None: - if 300 <= status_code < 400: - logging.error( - f'Too many redirects (status code {status_code}) while fetching "{url}"' - ) - raise HTTPTooManyRedirects - if status_code in ERROR_NAMES: log_msg = ( f'Error {status_code} ({ERROR_NAMES[status_code]}) while fetching "{url}"' @@ -237,19 +231,16 @@ def __init__( self.s.auth = authorization self.wait = wait self.timeout = timeout - self._mount_retry(backoff_factor, max_redirects, max_retries) + self.s.max_redirects = max_redirects + self._mount_retry(backoff_factor, max_retries) self.waiter = RequestWait(wait, random_wait) self.user_agent = user_agent if user_agent is not None else DEFAULT_UA - def _mount_retry( - self, backoff_factor: float, max_redirects: int, max_retries: int - ) -> None: + def _mount_retry(self, backoff_factor: float, max_retries: int) -> None: retry = Retry( total=max_retries, backoff_factor=backoff_factor, - redirect=max_redirects, status_forcelist=RETRY_AFTER_STATUS, - raise_on_redirect=False, raise_on_status=False, ) adapter = HTTPAdapter(max_retries=retry) @@ -329,6 +320,9 @@ def do_request( except requests.Timeout as e: logging.error(f"Request timed out fetching {url}") raise ConnectionTimeout from e + except requests.TooManyRedirects as e: + logging.error(f'Too many redirects while fetching "{url}"') + raise HTTPTooManyRedirects from e # If this is an HTTP 400 due to an invalid page, raise this special error early if response.status_code == 400 and "application/json" in response.headers.get( diff --git a/tests/conftest.py b/tests/conftest.py index 56c21f3..655defb 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -38,3 +38,9 @@ def mocked_responses(): def mocked_responses_optional(): with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps: yield rsps + + +@pytest.fixture() +def mocked_responses_ordered(): + with responses.RequestsMock(registry=responses.registries.OrderedRegistry) as rsps: + yield rsps diff --git a/tests/download/test_requestsession.py b/tests/download/test_requestsession.py new file mode 100644 index 0000000..71a7976 --- /dev/null +++ b/tests/download/test_requestsession.py @@ -0,0 +1,118 @@ +import pytest +from responses import matchers +from wpextract.download import RequestSession +from wpextract.download.requestsession import ( + HTTPError404, + HTTPError500, + HTTPTooManyRedirects, +) + + +def test_request_session_get(mocked_responses): + sess = RequestSession() + mocked_responses.get("https://example.org", body="Example response") + + resp = sess.get("https://example.org") + assert resp.text == "Example response" + + +def test_request_session_post(mocked_responses): + sess = RequestSession() + mocked_responses.post( + "https://example.org", + match=[matchers.urlencoded_params_matcher({"foo": "bar"})], + body="Example response", + ) + + resp = sess.post("https://example.org", {"foo": "bar"}) + assert resp.text == "Example response" + + +def test_no_retry_404_error(mocked_responses): + sess = RequestSession() + mocked_responses.get("https://example.org", status=404) + + with pytest.raises(HTTPError404): + sess.get("https://example.org") + + mocked_responses.assert_call_count("https://example.org", 1) + + +def test_no_retries(mocked_responses, caplog): + sess_no_retries = RequestSession(max_retries=0) + mocked_responses.get("https://example.org/a", status=500) + + with pytest.raises(HTTPError500): + sess_no_retries.get("https://example.org/a") + mocked_responses.assert_call_count("https://example.org/a", 1) + + +def test_retries(mocked_responses, caplog): + sess_retries = RequestSession(max_retries=5) + mocked_responses.get("https://example.org/b", status=500) + + with pytest.raises(HTTPError500): + sess_retries.get("https://example.org/b") + mocked_responses.assert_call_count("https://example.org/b", 6) + + +def test_timeout(mocked_responses): + sess = RequestSession(timeout=1) + mocked_responses.get( + "https://example.org", + body="Example response", + match=[matchers.request_kwargs_matcher({"timeout": 1})], + ) + + sess.get("https://example.org") + + +@pytest.fixture() +def mocked_sleep(mocker): + return mocker.patch("wpextract.download.requestsession.time.sleep") + + +def test_wait(mocked_responses, mocked_sleep): + sess = RequestSession(wait=1) + mocked_responses.get("https://example.org", body="Example response") + + sess.get("https://example.org") + + mocked_sleep.assert_called_once_with(1) + + +def test_rand_wait(mocked_responses, mocked_sleep): + sess = RequestSession(wait=1, random_wait=True) + mocked_responses.get("https://example.org", body="Example response") + sess.get("https://example.org") + + mocked_sleep.assert_called_once() + assert mocked_sleep.call_args[0][0] >= 0.5 + assert mocked_sleep.call_args[0][0] <= 1.5 + + +def test_max_redirects(mocked_responses): + for i in range(1, 5): + mocked_responses.get( + f"https://example.org/{i}", + status=301, + headers={"Location": f"https://example.org/{i+1}"}, + ) + + sess = RequestSession(max_redirects=3) + with pytest.raises(HTTPTooManyRedirects): + sess.get("https://example.org/1") + + +def test_max_redirects_within_limit(mocked_responses): + for i in range(1, 5): + mocked_responses.get( + f"https://example.org/{i}", + status=301, + headers={"Location": f"https://example.org/{i+1}"}, + ) + mocked_responses.get("https://example.org/5", body="Example response") + sess = RequestSession(max_redirects=5) + resp = sess.get("https://example.org/1") + assert resp.status_code == 200 + assert resp.text == "Example response"