fix!: merge payload and data fields of Request #542

Open: wants to merge 4 commits into base: master
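
In practice, the user-facing change is that request bodies are no longer passed as a dict via `data`, but as raw bytes via `payload`. A rough before/after sketch, mirroring the updated docs examples below:

```python
# Sketch of the migration this PR implies (based on the docs changes below).
import json

from crawlee import Request

# Before: a dict could be passed via the now-removed `data` field.
# request = Request.from_url('https://httpbin.org/post', method='POST', data={'key': 'value'})

# After: the body is always raw bytes passed via `payload`.
request = Request.from_url(
    url='https://httpbin.org/post',
    method='POST',
    payload=json.dumps({'key': 'value'}).encode(),
)
```
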
21 changes: 12 additions & 9 deletions docs/examples/code/fill_and_submit_web_form_crawler.py
@@ -1,4 +1,5 @@
import asyncio
import json

from crawlee import Request
from crawlee.http_crawler import HttpCrawler, HttpCrawlingContext
@@ -18,15 +19,17 @@ async def request_handler(context: HttpCrawlingContext) -> None:
request = Request.from_url(
url='https://httpbin.org/post',
method='POST',
data={
'custname': 'John Doe',
'custtel': '1234567890',
'custemail': '[email protected]',
'size': 'large',
'topping': ['bacon', 'cheese', 'mushroom'],
'delivery': '13:00',
'comments': 'Please ring the doorbell upon arrival.',
},
payload=json.dumps(
{
'custname': 'John Doe',
'custtel': '1234567890',
'custemail': '[email protected]',
'size': 'large',
'topping': ['bacon', 'cheese', 'mushroom'],
'delivery': '13:00',
'comments': 'Please ring the doorbell upon arrival.',
}
).encode(),
)

# Run the crawler with the initial list of requests.
22 changes: 13 additions & 9 deletions docs/examples/code/fill_and_submit_web_form_request.py
@@ -1,16 +1,20 @@
import json

from crawlee import Request

# Prepare a POST request to the form endpoint.
request = Request.from_url(
url='https://httpbin.org/post',
method='POST',
data={
'custname': 'John Doe',
'custtel': '1234567890',
'custemail': '[email protected]',
'size': 'large',
'topping': ['bacon', 'cheese', 'mushroom'],
'delivery': '13:00',
'comments': 'Please ring the doorbell upon arrival.',
},
payload=json.dumps(
{
'custname': 'John Doe',
'custtel': '1234567890',
'custemail': '[email protected]',
'size': 'large',
'topping': ['bacon', 'cheese', 'mushroom'],
'delivery': '13:00',
'comments': 'Please ring the doorbell upon arrival.',
}
).encode(),
)
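
Note that the old example let the HTTP client form-encode the dict, while the new one sends a JSON body. If a caller still wants classic form encoding under the bytes-only `payload`, it would have to encode the body itself; a minimal sketch (illustrative, not part of this PR):

```python
# Illustrative sketch: sending URL-encoded form data now that `payload` takes raw bytes.
from urllib.parse import urlencode

from crawlee import Request

form = {'custname': 'John Doe', 'size': 'large'}
request = Request.from_url(
    url='https://httpbin.org/post',
    method='POST',
    # The content type would need to be set via `headers` (omitted here), since the
    # client no longer form-encodes a dict on the caller's behalf.
    payload=urlencode(form).encode(),
)
```
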
2 changes: 1 addition & 1 deletion docs/examples/fill_and_submit_web_form.mdx
@@ -46,7 +46,7 @@ Now, let's create a POST request with the form fields and their values using the
{RequestExample}
</CodeBlock>

Alternatively, you can send form data as URL parameters using the `query_params` argument. It depends on the form and how it is implemented. However, sending the data as a POST request body using the `data` parameter is generally a better approach.
Alternatively, you can send form data as URL parameters using the `query_params` argument. It depends on the form and how it is implemented. However, sending the data as a POST request body using the `payload` parameter is generally a better approach.

## Implementing the crawler

58 changes: 54 additions & 4 deletions src/crawlee/_request.py
@@ -127,15 +127,17 @@ class BaseRequestData(BaseModel):
method: HttpMethod = 'GET'
"""HTTP request method."""

headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders())] = HttpHeaders()
headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders)] = HttpHeaders()
"""HTTP request headers."""

query_params: Annotated[HttpQueryParams, Field(alias='queryParams', default_factory=dict)] = {}
Collaborator review comment:
We should probably get rid of this one, or make it a property that changes the url field. Unfortunately, Apify API does not accept queryParams.

"""URL query parameters."""

payload: HttpPayload | None = None

data: Annotated[dict[str, Any], Field(default_factory=dict)] = {}
payload: Annotated[
HttpPayload | None,
PlainValidator(lambda value: None if value is None else str(value).encode('utf-8')),
] = None
"""HTTP request payload."""

user_data: Annotated[
dict[str, JsonSerializable], # Internally, the model contains `UserData`, this is just for convenience
@@ -169,6 +171,8 @@ def from_url(
url: str,
*,
method: HttpMethod = 'GET',
headers: HttpHeaders | None = None,
query_params: HttpQueryParams | None = None,
payload: HttpPayload | None = None,
label: str | None = None,
unique_key: str | None = None,
@@ -178,9 +182,13 @@
**kwargs: Any,
) -> Self:
"""Create a new `BaseRequestData` instance from a URL. See `Request.from_url` for more details."""
headers = headers or HttpHeaders()
query_params = query_params or {}

unique_key = unique_key or compute_unique_key(
url,
method=method,
headers=headers,
payload=payload,
keep_url_fragment=keep_url_fragment,
use_extended_unique_key=use_extended_unique_key,
Expand All @@ -193,6 +201,8 @@ def from_url(
unique_key=unique_key,
id=id,
method=method,
headers=headers,
query_params=query_params,
payload=payload,
**kwargs,
)
@@ -243,6 +253,8 @@ def from_url(
url: str,
*,
method: HttpMethod = 'GET',
headers: HttpHeaders | None = None,
query_params: HttpQueryParams | None = None,
payload: HttpPayload | None = None,
label: str | None = None,
unique_key: str | None = None,
@@ -261,6 +273,8 @@
Args:
url: The URL of the request.
method: The HTTP method of the request.
headers: The HTTP headers of the request.
query_params: The query parameters of the URL.
payload: The data to be sent as the request body. Typically used with 'POST' or 'PUT' requests.
label: A custom label to differentiate between request types. This is stored in `user_data`, and it is
used for request routing (different requests go to different handlers).
@@ -274,9 +288,13 @@
computation. This is only relevant when `unique_key` is not provided.
**kwargs: Additional request properties.
"""
headers = headers or HttpHeaders()
query_params = query_params or {}

unique_key = unique_key or compute_unique_key(
url,
method=method,
headers=headers,
payload=payload,
keep_url_fragment=keep_url_fragment,
use_extended_unique_key=use_extended_unique_key,
@@ -289,6 +307,8 @@
unique_key=unique_key,
id=id,
method=method,
headers=headers,
query_params=query_params,
payload=payload,
**kwargs,
)
@@ -377,6 +397,36 @@ def forefront(self) -> bool:
def forefront(self, new_value: bool) -> None:
self.crawlee_data.forefront = new_value

def __eq__(self, other: object) -> bool:
"""Compare all relevant fields of the `Request` class, excluding deprecated fields `json_` and `order_no`.
TODO: Remove this method once the issue is resolved.
https://github.com/apify/crawlee-python/issues/94
"""
if isinstance(other, Request):
return (
self.url == other.url
and self.unique_key == other.unique_key
and self.method == other.method
and self.headers == other.headers
and self.query_params == other.query_params
and self.payload == other.payload
and self.user_data == other.user_data
and self.retry_count == other.retry_count
and self.no_retry == other.no_retry
and self.loaded_url == other.loaded_url
and self.handled_at == other.handled_at
and self.id == other.id
and self.label == other.label
and self.state == other.state
and self.max_retries == other.max_retries
and self.session_rotation_count == other.session_rotation_count
and self.enqueue_strategy == other.enqueue_strategy
and self.last_proxy_tier == other.last_proxy_tier
and self.forefront == other.forefront
)
return NotImplemented


class RequestWithLock(Request):
"""A crawling request with information about locks."""
2 changes: 1 addition & 1 deletion src/crawlee/_types.py
@@ -52,7 +52,7 @@

HttpQueryParams: TypeAlias = dict[str, str]

HttpPayload: TypeAlias = Union[str, bytes]
HttpPayload: TypeAlias = bytes


def _normalize_headers(headers: Mapping[str, str]) -> dict[str, str]:
8 changes: 1 addition & 7 deletions src/crawlee/_utils/requests.py
@@ -142,13 +142,7 @@ def compute_unique_key(


def _get_payload_hash(payload: HttpPayload | None) -> str:
if payload is None:
payload_in_bytes = b''
elif isinstance(payload, str):
payload_in_bytes = payload.encode('utf-8')
else:
payload_in_bytes = payload

payload_in_bytes = b'' if payload is None else payload
return compute_short_hash(payload_in_bytes)


8 changes: 4 additions & 4 deletions src/crawlee/http_clients/_base.py
@@ -2,15 +2,15 @@

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Protocol
from typing import TYPE_CHECKING, Protocol

from crawlee._utils.http import is_status_code_error
from crawlee.errors import HttpStatusCodeError

if TYPE_CHECKING:
from collections.abc import Iterable

from crawlee._types import HttpHeaders, HttpMethod, HttpQueryParams
from crawlee._types import HttpHeaders, HttpMethod, HttpPayload, HttpQueryParams
from crawlee.base_storage_client._models import Request
from crawlee.proxy_configuration import ProxyInfo
from crawlee.sessions import Session
@@ -115,7 +115,7 @@ async def send_request(
method: HttpMethod = 'GET',
headers: HttpHeaders | None = None,
query_params: HttpQueryParams | None = None,
data: dict[str, Any] | None = None,
payload: HttpPayload | None = None,
session: Session | None = None,
proxy_info: ProxyInfo | None = None,
) -> HttpResponse:
@@ -128,7 +128,7 @@
method: The HTTP method to use.
headers: The headers to include in the request.
query_params: The query parameters to include in the request.
data: The data to be sent as the request body.
payload: The data to be sent as the request body.
session: The session associated with the request.
proxy_info: The information about the proxy to be used.

8 changes: 4 additions & 4 deletions src/crawlee/http_clients/_httpx.py
@@ -16,7 +16,7 @@
if TYPE_CHECKING:
from collections.abc import Iterable

from crawlee._types import HttpMethod, HttpQueryParams
from crawlee._types import HttpMethod, HttpPayload, HttpQueryParams
from crawlee.base_storage_client._models import Request
from crawlee.proxy_configuration import ProxyInfo
from crawlee.statistics import Statistics
@@ -132,7 +132,7 @@ async def crawl(
method=request.method,
headers=headers,
params=request.query_params,
data=request.data,
content=request.payload,
cookies=session.cookies if session else None,
extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
)
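
The switch from `data=` to `content=` matches httpx semantics: `data=` form-encodes a dict, while `content=` sends the given bytes verbatim, which is what the new bytes-only `HttpPayload` represents. A small illustration (httpx behavior, not part of the diff):

```python
# Illustrative only: httpx form-encodes `data=` dicts but sends `content=` bytes as-is.
import httpx

form_request = httpx.Request('POST', 'https://httpbin.org/post', data={'a': '1'})
raw_request = httpx.Request('POST', 'https://httpbin.org/post', content=b'{"a": 1}')

print(form_request.content)  # b'a=1' (application/x-www-form-urlencoded)
print(raw_request.content)   # b'{"a": 1}'
```
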
@@ -167,7 +167,7 @@ async def send_request(
method: HttpMethod = 'GET',
headers: HttpHeaders | None = None,
query_params: HttpQueryParams | None = None,
data: dict[str, Any] | None = None,
payload: HttpPayload | None = None,
session: Session | None = None,
proxy_info: ProxyInfo | None = None,
) -> HttpResponse:
Expand All @@ -179,7 +179,7 @@ async def send_request(
method=method,
headers=dict(headers) if headers else None,
params=query_params,
data=data,
content=payload,
extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
)

6 changes: 3 additions & 3 deletions src/crawlee/http_clients/curl_impersonate.py
@@ -16,7 +16,7 @@
from curl_cffi.const import CurlHttpVersion
from typing_extensions import override

from crawlee._types import HttpHeaders
from crawlee._types import HttpHeaders, HttpPayload
from crawlee._utils.blocked import ROTATE_PROXY_ERRORS
from crawlee.errors import ProxyError
from crawlee.http_clients import BaseHttpClient, HttpCrawlingResult, HttpResponse
@@ -153,7 +153,7 @@ async def send_request(
method: HttpMethod = 'GET',
headers: HttpHeaders | None = None,
query_params: HttpQueryParams | None = None,
data: dict[str, Any] | None = None,
payload: HttpPayload | None = None,
session: Session | None = None,
proxy_info: ProxyInfo | None = None,
) -> HttpResponse:
@@ -166,7 +166,7 @@
method=method.upper(), # type: ignore # curl-cffi requires uppercase method
headers=dict(headers) if headers else None,
params=query_params,
data=data,
data=payload,
cookies=session.cookies if session else None,
allow_redirects=True,
)
22 changes: 12 additions & 10 deletions src/crawlee/memory_storage_client/_request_queue_client.py
@@ -268,7 +268,7 @@ async def add_request(
persist_storage=self._memory_storage_client.persist_storage,
)

# We return wasAlreadyHandled is false even though the request may have been added as handled,
# We return was_already_handled=False even though the request may have been added as handled,
# because that's how API behaves.
return ProcessedRequest(
id=request_model.id,
@@ -519,15 +519,17 @@ async def _create_internal_request(self, request: Request, forefront: bool | Non
if request.id is not None and request.id != id:
raise ValueError('Request ID does not match its unique_key.')

json_request = await json_dumps({**(request.model_dump()), 'id': id})
request_kwargs = {
**(request.model_dump()),
'id': id,
'order_no': order_no,
}

del request_kwargs['json_']

return Request(
url=request.url,
unique_key=request.unique_key,
id=id,
method=request.method,
retry_count=request.retry_count,
order_no=order_no,
json_=json_request,
**request_kwargs,
json_=await json_dumps(request_kwargs),
)

def _calculate_order_no(self, request: Request, forefront: bool | None) -> Decimal | None:
@@ -538,7 +540,7 @@ def _calculate_order_no(self, request: Request, forefront: bool | None) -> Decim
timestamp = Decimal(datetime.now(timezone.utc).timestamp()) * 1000
timestamp = round(timestamp, 6)

# Make sure that this timestamp was not used yet, so that we have unique orderNos
# Make sure that this timestamp was not used yet, so that we have unique order_nos
if timestamp <= self._last_used_timestamp:
timestamp = self._last_used_timestamp + Decimal(0.000001)

8 changes: 2 additions & 6 deletions tests/unit/_utils/test_requests.py
@@ -91,7 +91,7 @@ def test_compute_unique_key_handles_fragments() -> None:

def test_compute_unique_key_handles_payload() -> None:
url = 'https://crawlee.dev'
payload = '{"key": "value"}'
payload = b'{"key": "value"}'

# Payload without extended unique key
uk = compute_unique_key(url, method='POST', payload=payload, use_extended_unique_key=False)
@@ -101,12 +101,8 @@ def test_compute_unique_key_handles_payload() -> None:
uk = compute_unique_key(url, method='POST', payload=None, use_extended_unique_key=True)
assert uk == 'POST|e3b0c442|e3b0c442|https://crawlee.dev'

# Extended unique key and payload is string
uk = compute_unique_key(url, method='POST', payload=payload, use_extended_unique_key=True)
assert uk == 'POST|e3b0c442|9724c1e2|https://crawlee.dev'

# Extended unique key and payload is bytes
uk = compute_unique_key(url, method='POST', payload=payload.encode(), use_extended_unique_key=True)
uk = compute_unique_key(url, method='POST', payload=payload, use_extended_unique_key=True)
assert uk == 'POST|e3b0c442|9724c1e2|https://crawlee.dev'
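
As a reference point for the expected values (an inference from the test vectors, not stated in the diff): the 8-character segments look like truncated SHA-256 hex digests; for example, the SHA-256 of an empty byte string starts with 'e3b0c442', which matches the empty-headers and empty-payload segments above:

```python
# Sketch inferring the short-hash scheme from the expected unique keys above.
import hashlib

def short_hash(data: bytes) -> str:
    return hashlib.sha256(data).hexdigest()[:8]

print(short_hash(b''))                  # 'e3b0c442' - matches the empty segments above
print(short_hash(b'{"key": "value"}'))  # expected to be '9724c1e2' per the test above
```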

