fix: adopt new version of curl-cffi (#543)

### Description

- Version 0.7.2 of `curl-cffi` introduces breaking changes.
- This update adopts the new version, adds type aliases for `Request` fields, and incorporates other minor changes from PR #542.

### Issues

- N/A (ad-hoc fix)

### Testing

- The current set of unit tests should cover the changes.

### Checklist

- [x] CI passed
vdusek authored Sep 25, 2024
1 parent 8a3d369 commit f6fcf48
Showing 15 changed files with 76 additions and 40 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -49,7 +49,7 @@ apify = { version = ">=2.0.0", optional = true }
beautifulsoup4 = { version = ">=4.12.0", optional = true }
colorama = ">=0.4.0"
cookiecutter = ">=2.6.0"
curl-cffi = { version = ">=0.7.0", optional = true }
curl-cffi = { version = ">=0.7.2", optional = true }
docutils = ">=0.21.0"
eval-type-backport = ">=0.2.0"
html5lib = { version = ">=1.0", optional = true }
19 changes: 11 additions & 8 deletions src/crawlee/_request.py
@@ -20,7 +20,7 @@
)
from typing_extensions import Self

from crawlee._types import EnqueueStrategy, HttpMethod
from crawlee._types import EnqueueStrategy, HttpMethod, HttpPayload, HttpQueryParams
from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id
from crawlee._utils.urls import extract_query_params, validate_http_url

@@ -117,14 +117,17 @@ class BaseRequestData(BaseModel):
"""

method: HttpMethod = 'GET'
"""HTTP request method."""

payload: str | None = None
headers: Annotated[dict[str, str], Field(default_factory=dict)] = {}
"""HTTP request headers."""

headers: Annotated[dict[str, str] | None, Field(default_factory=dict)] = None
query_params: Annotated[HttpQueryParams, Field(alias='queryParams', default_factory=dict)] = {}
"""URL query parameters."""

query_params: Annotated[dict[str, Any] | None, Field(default_factory=dict)] = None
payload: HttpPayload | None = None

data: Annotated[dict[str, Any] | None, Field(default_factory=dict)] = None
data: Annotated[dict[str, Any], Field(default_factory=dict)] = {}

user_data: Annotated[
dict[str, JsonValue], # Internally, the model contains `UserData`, this is just for convenience
@@ -139,7 +142,7 @@ class BaseRequestData(BaseModel):
exclude_defaults=True,
)
),
]
] = {}
"""Custom user data assigned to the request. Use this to save any request related data to the
request's scope, keeping them accessible on retries, failures etc.
"""
@@ -158,7 +161,7 @@ def from_url(
url: str,
*,
method: HttpMethod = 'GET',
payload: str | None = None,
payload: HttpPayload | None = None,
label: str | None = None,
unique_key: str | None = None,
id: str | None = None,
@@ -232,7 +235,7 @@ def from_url(
url: str,
*,
method: HttpMethod = 'GET',
payload: str | None = None,
payload: HttpPayload | None = None,
label: str | None = None,
unique_key: str | None = None,
id: str | None = None,
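
For context, a minimal sketch of the updated `from_url` signature in use. It assumes `Request` is re-exported from the top-level `crawlee` package; the URL and body are illustrative:

```python
from crawlee import Request

# `payload` is now typed as `HttpPayload` (str | bytes), so a raw bytes body is accepted.
request = Request.from_url(
    'https://example.com/api/items',
    method='POST',
    payload=b'{"name": "example"}',
)
```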
7 changes: 6 additions & 1 deletion src/crawlee/_types.py
@@ -1,6 +1,6 @@
from __future__ import annotations

from collections.abc import Coroutine, Iterator, Mapping, Sequence
from collections.abc import Mapping
from dataclasses import dataclass, field
from enum import Enum
from typing import TYPE_CHECKING, Any, Literal, Protocol, Union
@@ -10,6 +10,7 @@
if TYPE_CHECKING:
import logging
import re
from collections.abc import Coroutine, Iterator, Sequence

from crawlee import Glob
from crawlee._request import BaseRequestData, Request
@@ -26,6 +27,10 @@

HttpMethod: TypeAlias = Literal['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'CONNECT', 'OPTIONS', 'TRACE', 'PATCH']

HttpQueryParams: TypeAlias = dict[str, str]

HttpPayload: TypeAlias = Union[str, bytes]


class EnqueueStrategy(str, Enum):
"""Strategy for deciding which links should be followed and which ones should be ignored."""
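
A small sketch of the new aliases used as annotations in downstream code; the helper function itself is hypothetical:

```python
from __future__ import annotations

from urllib.parse import urlencode

from crawlee._types import HttpPayload, HttpQueryParams


def prepare_request_parts(url: str, query_params: HttpQueryParams, payload: HttpPayload) -> tuple[str, bytes]:
    """Attach query parameters to the URL and normalize the payload to bytes."""
    full_url = f'{url}?{urlencode(query_params)}' if query_params else url
    body = payload.encode() if isinstance(payload, str) else payload
    return full_url, body
```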
3 changes: 3 additions & 0 deletions src/crawlee/_utils/http.py
@@ -1,3 +1,6 @@
from __future__ import annotations


def is_status_code_error(value: int) -> bool:
"""Returns `True` for 4xx or 5xx status codes, `False` otherwise."""
return is_status_code_client_error(value) or is_status_code_server_error(value)
12 changes: 8 additions & 4 deletions src/crawlee/_utils/requests.py
@@ -4,10 +4,14 @@
from base64 import b64encode
from hashlib import sha256
from logging import getLogger
from typing import TYPE_CHECKING
from urllib.parse import parse_qsl, urlencode, urlparse

from crawlee._utils.crypto import compute_short_hash

if TYPE_CHECKING:
from crawlee._types import HttpMethod, HttpPayload

logger = getLogger(__name__)


@@ -82,22 +86,22 @@ def normalize_url(url: str, *, keep_url_fragment: bool = False) -> str:

def compute_unique_key(
url: str,
method: str = 'GET',
payload: str | bytes | None = None,
method: HttpMethod = 'GET',
payload: HttpPayload | None = None,
*,
keep_url_fragment: bool = False,
use_extended_unique_key: bool = False,
) -> str:
"""Computes a unique key for caching & deduplication of requests.
This function computes a unique key by normalizing the provided URL and method.
If 'use_extended_unique_key' is True and a payload is provided, the payload is hashed and
If `use_extended_unique_key` is True and a payload is provided, the payload is hashed and
included in the key. Otherwise, the unique key is just the normalized URL.
Args:
url: The request URL.
method: The HTTP method, defaults to 'GET'.
payload: The request payload, defaults to None.
payload: The data to be sent as the request body, defaults to None.
keep_url_fragment: A flag indicating whether to keep the URL fragment, defaults to False.
use_extended_unique_key: A flag indicating whether to include a hashed payload in the key, defaults to False.
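
A brief sketch of the documented behaviour, assuming the signature shown above; the example values are illustrative:

```python
from crawlee._utils.requests import compute_unique_key

url = 'https://example.com/search#results'

# Without the extended key, the unique key is just the normalized URL.
plain_key = compute_unique_key(url, method='POST', payload=b'{"q": "a"}')

# With `use_extended_unique_key=True`, the hashed payload is folded into the key,
# so POSTs to the same URL with different bodies no longer collapse into one request.
extended_key = compute_unique_key(
    url,
    method='POST',
    payload=b'{"q": "a"}',
    use_extended_unique_key=True,
)
```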
2 changes: 2 additions & 0 deletions src/crawlee/fingerprint_suite/_consts.py
@@ -1,3 +1,5 @@
from __future__ import annotations

# ruff: noqa: E501

COMMON_ACCEPT = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7'
8 changes: 6 additions & 2 deletions src/crawlee/http_clients/_base.py
@@ -2,15 +2,15 @@

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import TYPE_CHECKING, Protocol
from typing import TYPE_CHECKING, Any, Protocol

from crawlee._utils.http import is_status_code_error
from crawlee.errors import HttpStatusCodeError

if TYPE_CHECKING:
from collections.abc import Iterable

from crawlee._types import HttpHeaders, HttpMethod
from crawlee._types import HttpHeaders, HttpMethod, HttpQueryParams
from crawlee.base_storage_client._models import Request
from crawlee.proxy_configuration import ProxyInfo
from crawlee.sessions import Session
@@ -114,6 +114,8 @@ async def send_request(
*,
method: HttpMethod = 'GET',
headers: HttpHeaders | None = None,
query_params: HttpQueryParams | None = None,
data: dict[str, Any] | None = None,
session: Session | None = None,
proxy_info: ProxyInfo | None = None,
) -> HttpResponse:
@@ -125,6 +127,8 @@
url: The URL to send the request to.
method: The HTTP method to use.
headers: The headers to include in the request.
query_params: The query parameters to include in the request.
data: The data to be sent as the request body.
session: The session associated with the request.
proxy_info: The information about the proxy to be used.
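
A hedged usage sketch of the extended `send_request` signature. The concrete client class and module path are assumptions for illustration, since this diff only shows the abstract base:

```python
import asyncio

from crawlee.http_clients import HttpxHttpClient  # class/module names assumed, not shown in this diff


async def main() -> None:
    client = HttpxHttpClient()
    response = await client.send_request(
        'https://example.com/search',
        method='POST',
        query_params={'page': '1'},  # new HttpQueryParams argument
        data={'q': 'crawlee'},       # new request-body argument
    )
    print(response.status_code)


asyncio.run(main())
```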
4 changes: 2 additions & 2 deletions src/crawlee/http_clients/_httpx.py
@@ -16,7 +16,7 @@
if TYPE_CHECKING:
from collections.abc import Iterable

from crawlee._types import HttpMethod
from crawlee._types import HttpMethod, HttpQueryParams
from crawlee.base_storage_client._models import Request
from crawlee.proxy_configuration import ProxyInfo
from crawlee.statistics import Statistics
@@ -166,7 +166,7 @@ async def send_request(
*,
method: HttpMethod = 'GET',
headers: HttpHeaders | None = None,
query_params: dict[str, Any] | None = None,
query_params: HttpQueryParams | None = None,
data: dict[str, Any] | None = None,
session: Session | None = None,
proxy_info: ProxyInfo | None = None,
26 changes: 13 additions & 13 deletions src/crawlee/http_clients/curl_impersonate.py
@@ -4,8 +4,9 @@

try:
from curl_cffi.requests import AsyncSession
from curl_cffi.requests.errors import RequestsError
from curl_cffi.requests.impersonate import BrowserType
from curl_cffi.requests.exceptions import ProxyError as CurlProxyError
from curl_cffi.requests.exceptions import RequestException as CurlRequestError
from curl_cffi.requests.impersonate import DEFAULT_CHROME as CURL_DEFAULT_CHROME
except ImportError as exc:
raise ImportError(
"To import anything from this subpackage, you need to install the 'curl-impersonate' extra."
@@ -24,7 +25,7 @@

from curl_cffi.requests import Response

from crawlee._types import HttpHeaders, HttpMethod
from crawlee._types import HttpHeaders, HttpMethod, HttpQueryParams
from crawlee.base_storage_client._models import Request
from crawlee.proxy_configuration import ProxyInfo
from crawlee.sessions import Session
@@ -116,14 +117,14 @@ async def crawl(
try:
response = await client.request(
url=request.url,
method=request.method.upper(), # curl-cffi requires uppercase method
method=request.method.upper(), # type: ignore # curl-cffi requires uppercase method
headers=request.headers,
params=request.query_params,
data=request.data,
data=request.payload,
cookies=session.cookies if session else None,
allow_redirects=True,
)
except RequestsError as exc:
except CurlRequestError as exc:
if self._is_proxy_error(exc):
raise ProxyError from exc
raise
@@ -150,7 +151,7 @@ async def send_request(
*,
method: HttpMethod = 'GET',
headers: HttpHeaders | None = None,
query_params: dict[str, Any] | None = None,
query_params: HttpQueryParams | None = None,
data: dict[str, Any] | None = None,
session: Session | None = None,
proxy_info: ProxyInfo | None = None,
@@ -161,14 +162,14 @@
try:
response = await client.request(
url=url,
method=method.upper(), # curl-cffi requires uppercase method
method=method.upper(), # type: ignore # curl-cffi requires uppercase method
headers=headers,
params=query_params,
data=data,
cookies=session.cookies if session else None,
allow_redirects=True,
)
except RequestsError as exc:
except CurlRequestError as exc:
if self._is_proxy_error(exc):
raise ProxyError from exc
raise
@@ -194,7 +195,7 @@ def _get_client(self, proxy_url: str | None) -> AsyncSession:
# are set as default options.
kwargs: dict[str, Any] = {
'proxy': proxy_url,
'impersonate': BrowserType.chrome,
'impersonate': CURL_DEFAULT_CHROME,
}

# Update the default kwargs with any additional user-provided kwargs.
@@ -206,13 +207,12 @@ def _get_client(self, proxy_url: str | None) -> AsyncSession:
return self._client_by_proxy_url[proxy_url]

@staticmethod
def _is_proxy_error(error: RequestsError) -> bool:
def _is_proxy_error(error: CurlRequestError) -> bool:
"""Helper to check whether the given error is a proxy-related error."""
if any(needle in str(error) for needle in ROTATE_PROXY_ERRORS):
return True

# Once https://github.com/yifeikong/curl_cffi/issues/361 is resolved, do it better.
if 'CONNECT tunnel failed' in str(error): # noqa: SIM103
if isinstance(error, CurlProxyError): # noqa: SIM103
return True

return False
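
A standalone sketch of the curl-cffi API surface adopted here, based on the imports shown above; the URL and error handling are illustrative:

```python
from __future__ import annotations

import asyncio

from curl_cffi.requests import AsyncSession
from curl_cffi.requests.exceptions import ProxyError, RequestException
from curl_cffi.requests.impersonate import DEFAULT_CHROME


async def fetch(url: str, proxy_url: str | None = None) -> int:
    # Chrome impersonation is now selected via the `DEFAULT_CHROME` constant
    # rather than the removed `BrowserType.chrome` enum member.
    async with AsyncSession(proxy=proxy_url, impersonate=DEFAULT_CHROME) as client:
        try:
            response = await client.request(url=url, method='GET', allow_redirects=True)
        except RequestException as exc:
            # Proxy failures are now a dedicated exception type rather than a message substring.
            if isinstance(exc, ProxyError):
                raise RuntimeError('proxy-related failure') from exc
            raise
        return response.status_code


asyncio.run(fetch('https://example.com'))
```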
4 changes: 3 additions & 1 deletion src/crawlee/memory_storage_client/_request_queue_client.py
@@ -504,9 +504,12 @@ async def _delete_request_file_from_storage(self, *, request_id: str, entity_dir
def _json_to_request(self, request_json: str | None) -> Request | None:
if request_json is None:
return None

request_dict = filter_out_none_values_recursively(json.loads(request_json))

if request_dict is None:
return None

return Request.model_validate(request_dict)

async def _create_internal_request(self, request: Request, forefront: bool | None) -> Request:
@@ -525,7 +528,6 @@ async def _create_internal_request(self, request: Request, forefront: bool | Non
retry_count=request.retry_count,
order_no=order_no,
json_=json_request,
user_data={},
)

def _calculate_order_no(self, request: Request, forefront: bool | None) -> Decimal | None:
Expand Down
8 changes: 6 additions & 2 deletions src/crawlee/playwright_crawler/_utils.py
@@ -1,8 +1,12 @@
from __future__ import annotations

import asyncio
from contextlib import suppress
from typing import TYPE_CHECKING

from playwright.async_api import Page
from playwright.async_api import Request as PlaywrightRequest
if TYPE_CHECKING:
from playwright.async_api import Page
from playwright.async_api import Request as PlaywrightRequest


async def infinite_scroll(page: Page) -> None:
9 changes: 7 additions & 2 deletions tests/unit/_utils/test_requests.py
@@ -1,9 +1,14 @@
from __future__ import annotations

from typing import TYPE_CHECKING

import pytest

from crawlee._utils.requests import compute_unique_key, normalize_url, unique_key_to_request_id

if TYPE_CHECKING:
from crawlee._types import HttpMethod, HttpPayload


def test_unique_key_to_request_id_length() -> None:
unique_key = 'exampleKey123'
@@ -101,8 +106,8 @@ def test_normalize_url(url: str, expected_output: str, *, keep_url_fragment: boo
)
def test_compute_unique_key(
url: str,
method: str,
payload: str | None,
method: HttpMethod,
payload: HttpPayload | None,
*,
keep_url_fragment: bool,
use_extended_unique_key: bool,
2 changes: 2 additions & 0 deletions tests/unit/fingerprint_suite/test_header_generator.py
@@ -1,3 +1,5 @@
from __future__ import annotations

from crawlee.fingerprint_suite import HeaderGenerator


8 changes: 4 additions & 4 deletions tests/unit/proxy_configuration/test_tiers.py
@@ -44,7 +44,7 @@ async def test_retrying_request_makes_tier_go_up() -> None:
config = ProxyConfiguration(tiered_proxy_urls=tiered_proxy_urls)

# Calling `new_proxy_info` with the same request most probably means it's being retried
request_1 = Request(url='http://some.domain/abc', unique_key='1', id='1', user_data={})
request_1 = Request(url='http://some.domain/abc', unique_key='1', id='1')

info = await config.new_proxy_info(None, request_1, None)
assert info is not None
@@ -59,7 +59,7 @@ async def test_retrying_request_makes_tier_go_up() -> None:
assert info.url == tiered_proxy_urls[2][0]

# Subsequent requests with the same domain should use the same tier
request_2 = Request(url='http://some.domain/xyz', unique_key='2', id='2', user_data={})
request_2 = Request(url='http://some.domain/xyz', unique_key='2', id='2')

info = await config.new_proxy_info(None, request_2, None)
assert info is not None
@@ -80,7 +80,7 @@ async def test_successful_request_makes_tier_go_down() -> None:

config = ProxyConfiguration(tiered_proxy_urls=tiered_proxy_urls)

request_1 = Request(url='http://some.domain/abc', unique_key='1', id='1', user_data={})
request_1 = Request(url='http://some.domain/abc', unique_key='1', id='1')

info = None
for tier in tiered_proxy_urls:
@@ -89,7 +89,7 @@ async def test_successful_request_makes_tier_go_down() -> None:
assert info.url == tier[0]

for i in range(100):
new_request = Request(url=f'http://some.domain/{i}', unique_key=str(i), id=str(i), user_data={})
new_request = Request(url=f'http://some.domain/{i}', unique_key=str(i), id=str(i))
info = await config.new_proxy_info('session_id', new_request, None)

assert info is not None