fix: adopt new version of curl-cffi (#543)

### Description

- Version 0.7.2 of `curl-cffi` introduces breaking changes.
- This update adopts the new version, adds type aliases for `Request` fields, and incorporates other minor changes from PR #542.

### Issues

- N/A (ad-hoc fix)

### Testing

- The current set of unit tests should cover the changes.

### Checklist

- [x] CI passed
vdusek authored Sep 25, 2024
1 parent 8a3d369 commit f6fcf48
Showing 15 changed files with 76 additions and 40 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -49,7 +49,7 @@ apify = { version = ">=2.0.0", optional = true }
beautifulsoup4 = { version = ">=4.12.0", optional = true }
colorama = ">=0.4.0"
cookiecutter = ">=2.6.0"
curl-cffi = { version = ">=0.7.0", optional = true }
curl-cffi = { version = ">=0.7.2", optional = true }
docutils = ">=0.21.0"
eval-type-backport = ">=0.2.0"
html5lib = { version = ">=1.0", optional = true }
19 changes: 11 additions & 8 deletions src/crawlee/_request.py
@@ -20,7 +20,7 @@
)
from typing_extensions import Self

from crawlee._types import EnqueueStrategy, HttpMethod
from crawlee._types import EnqueueStrategy, HttpMethod, HttpPayload, HttpQueryParams
from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id
from crawlee._utils.urls import extract_query_params, validate_http_url

@@ -117,14 +117,17 @@ class BaseRequestData(BaseModel):
"""

method: HttpMethod = 'GET'
"""HTTP request method."""

payload: str | None = None
headers: Annotated[dict[str, str], Field(default_factory=dict)] = {}
"""HTTP request headers."""

headers: Annotated[dict[str, str] | None, Field(default_factory=dict)] = None
query_params: Annotated[HttpQueryParams, Field(alias='queryParams', default_factory=dict)] = {}
"""URL query parameters."""

query_params: Annotated[dict[str, Any] | None, Field(default_factory=dict)] = None
payload: HttpPayload | None = None

data: Annotated[dict[str, Any] | None, Field(default_factory=dict)] = None
data: Annotated[dict[str, Any], Field(default_factory=dict)] = {}

user_data: Annotated[
dict[str, JsonValue], # Internally, the model contains `UserData`, this is just for convenience
@@ -139,7 +142,7 @@ class BaseRequestData(BaseModel):
exclude_defaults=True,
)
),
]
] = {}
"""Custom user data assigned to the request. Use this to save any request related data to the
request's scope, keeping them accessible on retries, failures etc.
"""
@@ -158,7 +161,7 @@ def from_url(
url: str,
*,
method: HttpMethod = 'GET',
payload: str | None = None,
payload: HttpPayload | None = None,
label: str | None = None,
unique_key: str | None = None,
id: str | None = None,
@@ -232,7 +235,7 @@ def from_url(
url: str,
*,
method: HttpMethod = 'GET',
payload: str | None = None,
payload: HttpPayload | None = None,
label: str | None = None,
unique_key: str | None = None,
id: str | None = None,
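
For context, a minimal sketch of the updated `from_url` signature in use. It assumes `Request` is re-exported from the top-level `crawlee` package; the URL and body are illustrative:

```python
from crawlee import Request

# `payload` is now typed as `HttpPayload` (str | bytes), so a raw bytes body is accepted.
request = Request.from_url(
    'https://example.com/api/items',
    method='POST',
    payload=b'{"name": "example"}',
)
```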
7 changes: 6 additions & 1 deletion src/crawlee/_types.py
@@ -1,6 +1,6 @@
from __future__ import annotations

from collections.abc import Coroutine, Iterator, Mapping, Sequence
from collections.abc import Mapping
from dataclasses import dataclass, field
from enum import Enum
from typing import TYPE_CHECKING, Any, Literal, Protocol, Union
@@ -10,6 +10,7 @@
if TYPE_CHECKING:
import logging
import re
from collections.abc import Coroutine, Iterator, Sequence

from crawlee import Glob
from crawlee._request import BaseRequestData, Request
@@ -26,6 +27,10 @@

HttpMethod: TypeAlias = Literal['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'CONNECT', 'OPTIONS', 'TRACE', 'PATCH']

HttpQueryParams: TypeAlias = dict[str, str]

HttpPayload: TypeAlias = Union[str, bytes]


class EnqueueStrategy(str, Enum):
"""Strategy for deciding which links should be followed and which ones should be ignored."""
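
A small sketch of the new aliases used as annotations in downstream code; the helper function itself is hypothetical:

```python
from __future__ import annotations

from urllib.parse import urlencode

from crawlee._types import HttpPayload, HttpQueryParams


def prepare_request_parts(url: str, query_params: HttpQueryParams, payload: HttpPayload) -> tuple[str, bytes]:
    """Attach query parameters to the URL and normalize the payload to bytes."""
    full_url = f'{url}?{urlencode(query_params)}' if query_params else url
    body = payload.encode() if isinstance(payload, str) else payload
    return full_url, body
```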
3 changes: 3 additions & 0 deletions src/crawlee/_utils/http.py
@@ -1,3 +1,6 @@
from __future__ import annotations


def is_status_code_error(value: int) -> bool:
"""Returns `True` for 4xx or 5xx status codes, `False` otherwise."""
return is_status_code_client_error(value) or is_status_code_server_error(value)
12 changes: 8 additions & 4 deletions src/crawlee/_utils/requests.py
@@ -4,10 +4,14 @@
from base64 import b64encode
from hashlib import sha256
from logging import getLogger
from typing import TYPE_CHECKING
from urllib.parse import parse_qsl, urlencode, urlparse

from crawlee._utils.crypto import compute_short_hash

if TYPE_CHECKING:
from crawlee._types import HttpMethod, HttpPayload

logger = getLogger(__name__)


@@ -82,22 +86,22 @@ def normalize_url(url: str, *, keep_url_fragment: bool = False) -> str:

def compute_unique_key(
url: str,
method: str = 'GET',
payload: str | bytes | None = None,
method: HttpMethod = 'GET',
payload: HttpPayload | None = None,
*,
keep_url_fragment: bool = False,
use_extended_unique_key: bool = False,
) -> str:
"""Computes a unique key for caching & deduplication of requests.
This function computes a unique key by normalizing the provided URL and method.
If 'use_extended_unique_key' is True and a payload is provided, the payload is hashed and
If `use_extended_unique_key` is True and a payload is provided, the payload is hashed and
included in the key. Otherwise, the unique key is just the normalized URL.
Args:
url: The request URL.
method: The HTTP method, defaults to 'GET'.
payload: The request payload, defaults to None.
payload: The data to be sent as the request body, defaults to None.
keep_url_fragment: A flag indicating whether to keep the URL fragment, defaults to False.
use_extended_unique_key: A flag indicating whether to include a hashed payload in the key, defaults to False.
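
A brief sketch of the documented behaviour, assuming the signature shown above; the example values are illustrative:

```python
from crawlee._utils.requests import compute_unique_key

url = 'https://example.com/search#results'

# Without the extended key, the unique key is just the normalized URL.
plain_key = compute_unique_key(url, method='POST', payload=b'{"q": "a"}')

# With `use_extended_unique_key=True`, the hashed payload is folded into the key,
# so POSTs to the same URL with different bodies no longer collapse into one request.
extended_key = compute_unique_key(
    url,
    method='POST',
    payload=b'{"q": "a"}',
    use_extended_unique_key=True,
)
```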
2 changes: 2 additions & 0 deletions src/crawlee/fingerprint_suite/_consts.py
@@ -1,3 +1,5 @@
from __future__ import annotations

# ruff: noqa: E501

COMMON_ACCEPT = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7'
8 changes: 6 additions & 2 deletions src/crawlee/http_clients/_base.py
@@ -2,15 +2,15 @@

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import TYPE_CHECKING, Protocol
from typing import TYPE_CHECKING, Any, Protocol

from crawlee._utils.http import is_status_code_error
from crawlee.errors import HttpStatusCodeError

if TYPE_CHECKING:
from collections.abc import Iterable

from crawlee._types import HttpHeaders, HttpMethod
from crawlee._types import HttpHeaders, HttpMethod, HttpQueryParams
from crawlee.base_storage_client._models import Request
from crawlee.proxy_configuration import ProxyInfo
from crawlee.sessions import Session
@@ -114,6 +114,8 @@ async def send_request(
*,
method: HttpMethod = 'GET',
headers: HttpHeaders | None = None,
query_params: HttpQueryParams | None = None,
data: dict[str, Any] | None = None,
session: Session | None = None,
proxy_info: ProxyInfo | None = None,
) -> HttpResponse:
@@ -125,6 +127,8 @@
url: The URL to send the request to.
method: The HTTP method to use.
headers: The headers to include in the request.
query_params: The query parameters to include in the request.
data: The data to be sent as the request body.
session: The session associated with the request.
proxy_info: The information about the proxy to be used.
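
A hedged usage sketch of the extended `send_request` signature. The concrete client class and module path are assumptions for illustration, since this diff only shows the abstract base:

```python
import asyncio

from crawlee.http_clients import HttpxHttpClient  # class/module names assumed, not shown in this diff


async def main() -> None:
    client = HttpxHttpClient()
    response = await client.send_request(
        'https://example.com/search',
        method='POST',
        query_params={'page': '1'},  # new HttpQueryParams argument
        data={'q': 'crawlee'},       # new request-body argument
    )
    print(response.status_code)


asyncio.run(main())
```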
4 changes: 2 additions & 2 deletions src/crawlee/http_clients/_httpx.py
@@ -16,7 +16,7 @@
if TYPE_CHECKING:
from collections.abc import Iterable

from crawlee._types import HttpMethod
from crawlee._types import HttpMethod, HttpQueryParams
from crawlee.base_storage_client._models import Request
from crawlee.proxy_configuration import ProxyInfo
from crawlee.statistics import Statistics
@@ -166,7 +166,7 @@ async def send_request(
*,
method: HttpMethod = 'GET',
headers: HttpHeaders | None = None,
query_params: dict[str, Any] | None = None,
query_params: HttpQueryParams | None = None,
data: dict[str, Any] | None = None,
session: Session | None = None,
proxy_info: ProxyInfo | None = None,
26 changes: 13 additions & 13 deletions src/crawlee/http_clients/curl_impersonate.py
@@ -4,8 +4,9 @@

try:
from curl_cffi.requests import AsyncSession
from curl_cffi.requests.errors import RequestsError
from curl_cffi.requests.impersonate import BrowserType
from curl_cffi.requests.exceptions import ProxyError as CurlProxyError
from curl_cffi.requests.exceptions import RequestException as CurlRequestError
from curl_cffi.requests.impersonate import DEFAULT_CHROME as CURL_DEFAULT_CHROME
except ImportError as exc:
raise ImportError(
"To import anything from this subpackage, you need to install the 'curl-impersonate' extra."
@@ -24,7 +25,7 @@

from curl_cffi.requests import Response

from crawlee._types import HttpHeaders, HttpMethod
from crawlee._types import HttpHeaders, HttpMethod, HttpQueryParams
from crawlee.base_storage_client._models import Request
from crawlee.proxy_configuration import ProxyInfo
from crawlee.sessions import Session
@@ -116,14 +117,14 @@ async def crawl(
try:
response = await client.request(
url=request.url,
method=request.method.upper(), # curl-cffi requires uppercase method
method=request.method.upper(), # type: ignore # curl-cffi requires uppercase method
headers=request.headers,
params=request.query_params,
data=request.data,
data=request.payload,
cookies=session.cookies if session else None,
allow_redirects=True,
)
except RequestsError as exc:
except CurlRequestError as exc:
if self._is_proxy_error(exc):
raise ProxyError from exc
raise
@@ -150,7 +151,7 @@ async def send_request(
*,
method: HttpMethod = 'GET',
headers: HttpHeaders | None = None,
query_params: dict[str, Any] | None = None,
query_params: HttpQueryParams | None = None,
data: dict[str, Any] | None = None,
session: Session | None = None,
proxy_info: ProxyInfo | None = None,
@@ -161,14 +162,14 @@
try:
response = await client.request(
url=url,
method=method.upper(), # curl-cffi requires uppercase method
method=method.upper(), # type: ignore # curl-cffi requires uppercase method
headers=headers,
params=query_params,
data=data,
cookies=session.cookies if session else None,
allow_redirects=True,
)
except RequestsError as exc:
except CurlRequestError as exc:
if self._is_proxy_error(exc):
raise ProxyError from exc
raise
@@ -194,7 +195,7 @@ def _get_client(self, proxy_url: str | None) -> AsyncSession:
# are set as default options.
kwargs: dict[str, Any] = {
'proxy': proxy_url,
'impersonate': BrowserType.chrome,
'impersonate': CURL_DEFAULT_CHROME,
}

# Update the default kwargs with any additional user-provided kwargs.
@@ -206,13 +207,12 @@ def _get_client(self, proxy_url: str | None) -> AsyncSession:
return self._client_by_proxy_url[proxy_url]

@staticmethod
def _is_proxy_error(error: RequestsError) -> bool:
def _is_proxy_error(error: CurlRequestError) -> bool:
"""Helper to check whether the given error is a proxy-related error."""
if any(needle in str(error) for needle in ROTATE_PROXY_ERRORS):
return True

# Once https://github.com/yifeikong/curl_cffi/issues/361 is resolved, do it better.
if 'CONNECT tunnel failed' in str(error): # noqa: SIM103
if isinstance(error, CurlProxyError): # noqa: SIM103
return True

return False
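
A standalone sketch of the curl-cffi API surface adopted here, based on the imports shown above; the URL and error handling are illustrative:

```python
from __future__ import annotations

import asyncio

from curl_cffi.requests import AsyncSession
from curl_cffi.requests.exceptions import ProxyError, RequestException
from curl_cffi.requests.impersonate import DEFAULT_CHROME


async def fetch(url: str, proxy_url: str | None = None) -> int:
    # Chrome impersonation is now selected via the `DEFAULT_CHROME` constant
    # rather than the removed `BrowserType.chrome` enum member.
    async with AsyncSession(proxy=proxy_url, impersonate=DEFAULT_CHROME) as client:
        try:
            response = await client.request(url=url, method='GET', allow_redirects=True)
        except RequestException as exc:
            # Proxy failures are now a dedicated exception type rather than a message substring.
            if isinstance(exc, ProxyError):
                raise RuntimeError('proxy-related failure') from exc
            raise
        return response.status_code


asyncio.run(fetch('https://example.com'))
```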
4 changes: 3 additions & 1 deletion src/crawlee/memory_storage_client/_request_queue_client.py
@@ -504,9 +504,12 @@ async def _delete_request_file_from_storage(self, *, request_id: str, entity_dir
def _json_to_request(self, request_json: str | None) -> Request | None:
if request_json is None:
return None

request_dict = filter_out_none_values_recursively(json.loads(request_json))

if request_dict is None:
return None

return Request.model_validate(request_dict)

async def _create_internal_request(self, request: Request, forefront: bool | None) -> Request:
@@ -525,7 +528,6 @@ async def _create_internal_request(self, request: Request, forefront: bool | Non
retry_count=request.retry_count,
order_no=order_no,
json_=json_request,
user_data={},
)

def _calculate_order_no(self, request: Request, forefront: bool | None) -> Decimal | None:
Expand Down
8 changes: 6 additions & 2 deletions src/crawlee/playwright_crawler/_utils.py
@@ -1,8 +1,12 @@
from __future__ import annotations

import asyncio
from contextlib import suppress
from typing import TYPE_CHECKING

from playwright.async_api import Page
from playwright.async_api import Request as PlaywrightRequest
if TYPE_CHECKING:
from playwright.async_api import Page
from playwright.async_api import Request as PlaywrightRequest


async def infinite_scroll(page: Page) -> None:
9 changes: 7 additions & 2 deletions tests/unit/_utils/test_requests.py
@@ -1,9 +1,14 @@
from __future__ import annotations

from typing import TYPE_CHECKING

import pytest

from crawlee._utils.requests import compute_unique_key, normalize_url, unique_key_to_request_id

if TYPE_CHECKING:
from crawlee._types import HttpMethod, HttpPayload


def test_unique_key_to_request_id_length() -> None:
unique_key = 'exampleKey123'
@@ -101,8 +106,8 @@ def test_normalize_url(url: str, expected_output: str, *, keep_url_fragment: boo
)
def test_compute_unique_key(
url: str,
method: str,
payload: str | None,
method: HttpMethod,
payload: HttpPayload | None,
*,
keep_url_fragment: bool,
use_extended_unique_key: bool,
2 changes: 2 additions & 0 deletions tests/unit/fingerprint_suite/test_header_generator.py
@@ -1,3 +1,5 @@
from __future__ import annotations

from crawlee.fingerprint_suite import HeaderGenerator


8 changes: 4 additions & 4 deletions tests/unit/proxy_configuration/test_tiers.py
@@ -44,7 +44,7 @@ async def test_retrying_request_makes_tier_go_up() -> None:
config = ProxyConfiguration(tiered_proxy_urls=tiered_proxy_urls)

# Calling `new_proxy_info` with the same request most probably means it's being retried
request_1 = Request(url='http://some.domain/abc', unique_key='1', id='1', user_data={})
request_1 = Request(url='http://some.domain/abc', unique_key='1', id='1')

info = await config.new_proxy_info(None, request_1, None)
assert info is not None
@@ -59,7 +59,7 @@ async def test_retrying_request_makes_tier_go_up() -> None:
assert info.url == tiered_proxy_urls[2][0]

# Subsequent requests with the same domain should use the same tier
request_2 = Request(url='http://some.domain/xyz', unique_key='2', id='2', user_data={})
request_2 = Request(url='http://some.domain/xyz', unique_key='2', id='2')

info = await config.new_proxy_info(None, request_2, None)
assert info is not None
@@ -80,7 +80,7 @@ async def test_successful_request_makes_tier_go_down() -> None:

config = ProxyConfiguration(tiered_proxy_urls=tiered_proxy_urls)

request_1 = Request(url='http://some.domain/abc', unique_key='1', id='1', user_data={})
request_1 = Request(url='http://some.domain/abc', unique_key='1', id='1')

info = None
for tier in tiered_proxy_urls:
@@ -89,7 +89,7 @@ async def test_successful_request_makes_tier_go_down() -> None:
assert info.url == tier[0]

for i in range(100):
new_request = Request(url=f'http://some.domain/{i}', unique_key=str(i), id=str(i), user_data={})
new_request = Request(url=f'http://some.domain/{i}', unique_key=str(i), id=str(i))
info = await config.new_proxy_info('session_id', new_request, None)

assert info is not None