Skip to content

Commit

Permalink
feat!: Add headers in unique key computation (#609)
Browse files Browse the repository at this point in the history
### Description

<!-- The purpose of the PR, list of the changes, ... -->

- Adds headers in unique key computation

### Issues

<!-- If applicable, reference any related GitHub issues -->

- Closes: #548 

### Testing

<!-- Describe the testing process for these changes -->

- Added tests in `tests/unit/_utils/test_requests.py`.

### Checklist

- [x] CI passed

---------

Co-authored-by: Vlada Dusek <[email protected]>
  • Loading branch information
Prathamesh010 and vdusek authored Oct 23, 2024
1 parent c43eda2 commit 6c4746f
Show file tree
Hide file tree
Showing 2 changed files with 101 additions and 30 deletions.
58 changes: 41 additions & 17 deletions src/crawlee/_utils/requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from crawlee._utils.crypto import compute_short_hash

if TYPE_CHECKING:
from crawlee._types import HttpMethod, HttpPayload
from crawlee._types import HttpHeaders, HttpMethod, HttpPayload

logger = getLogger(__name__)

Expand Down Expand Up @@ -87,55 +87,79 @@ def normalize_url(url: str, *, keep_url_fragment: bool = False) -> str:
def compute_unique_key(
    url: str,
    method: HttpMethod = 'GET',
    headers: HttpHeaders | None = None,
    payload: HttpPayload | None = None,
    *,
    keep_url_fragment: bool = False,
    use_extended_unique_key: bool = False,
) -> str:
    """Compute a unique key for caching & deduplication of requests.

    By default the unique key is just the normalized URL. If `use_extended_unique_key` is True, the key
    is built from four pipe-separated parts: the upper-cased HTTP method, a hash of the whitelisted HTTP
    headers, a hash of the payload, and the normalized URL. Missing headers or a missing payload are hashed
    as empty input, so the extended key always has the same shape.

    If the URL cannot be normalized, a warning is logged and the URL is used as provided.

    Args:
        url: The request URL.
        method: The HTTP method, defaults to 'GET'.
        headers: The HTTP headers, defaults to None. Only used when `use_extended_unique_key` is True.
        payload: The data to be sent as the request body, defaults to None.
        keep_url_fragment: A flag indicating whether to keep the URL fragment, defaults to False.
        use_extended_unique_key: A flag indicating whether to include the method, hashed headers and hashed
            payload in the key, defaults to False.

    Returns:
        A string representing the unique key for the request.
    """
    # Normalize the URL. This is best-effort: on failure fall back to the raw URL instead of raising.
    try:
        normalized_url = normalize_url(url, keep_url_fragment=keep_url_fragment)
    except Exception as exc:
        logger.warning(f'Failed to normalize URL: {exc}')
        normalized_url = url

    # Normalize the method.
    normalized_method = method.upper()

    # Compute and return the extended unique key if required.
    if use_extended_unique_key:
        payload_hash = _get_payload_hash(payload)
        headers_hash = _get_headers_hash(headers)

        # Return the extended unique key. Use pipe as a separator of the different parts of the unique key.
        return f'{normalized_method}|{headers_hash}|{payload_hash}|{normalized_url}'

    # Log information if there is a non-GET request with a payload.
    if normalized_method != 'GET' and payload:
        logger.info(
            f'{normalized_method} request with a payload detected. By default, requests to the same URL with '
            'different methods or payloads will be deduplicated. Use "use_extended_unique_key" to include payload '
            'and headers in the unique key and avoid deduplication in these cases.'
        )

    # Return the normalized URL as the unique key.
    return normalized_url


def _get_payload_hash(payload: HttpPayload | None) -> str:
    """Return the short hash of the request payload.

    A string payload is encoded as UTF-8 before hashing, a missing payload is hashed as empty bytes,
    and any other payload is passed to the hash function unchanged.
    """
    if isinstance(payload, str):
        raw_payload = payload.encode('utf-8')
    elif payload is None:
        raw_payload = b''
    else:
        raw_payload = payload

    return compute_short_hash(raw_payload)


def _get_headers_hash(headers: HttpHeaders | None) -> str:
    """Return the short hash of the whitelisted HTTP headers.

    Only headers whose name is on the whitelist take part in the hash; all other headers are ignored.
    Header names are matched case-insensitively and the selected headers are sorted before hashing, so
    the result does not depend on the casing of the names or on the order in which they were provided.
    Missing headers are hashed as empty bytes.

    Args:
        headers: The HTTP headers, or None when the request has no headers.

    Returns:
        The short hash of the normalized, whitelisted headers.
    """
    # HTTP headers which will be included in the hash computation.
    whitelisted_headers = {'accept', 'accept-language', 'authorization', 'content-type'}

    if headers is None:
        normalized_headers = b''
    else:
        # HTTP header names are case-insensitive, so compare and emit them lower-cased. Sorting makes the
        # hash independent of the mapping's iteration order.
        filtered_headers = sorted(
            (key.lower(), value) for key, value in headers.items() if key.lower() in whitelisted_headers
        )
        normalized_headers = '|'.join(f'{k}:{v}' for k, v in filtered_headers).encode('utf-8')

    return compute_short_hash(normalized_headers)
73 changes: 60 additions & 13 deletions tests/unit/_utils/test_requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import pytest

from crawlee._types import HttpHeaders
from crawlee._utils.requests import compute_unique_key, normalize_url, unique_key_to_request_id

if TYPE_CHECKING:
Expand Down Expand Up @@ -78,35 +79,80 @@ def test_normalize_url(url: str, expected_output: str, *, keep_url_fragment: boo


@pytest.mark.parametrize(
('url', 'method', 'payload', 'keep_url_fragment', 'use_extended_unique_key', 'expected_output'),
('url', 'method', 'headers', 'payload', 'keep_url_fragment', 'use_extended_unique_key', 'expected_output'),
[
('http://example.com', 'GET', None, False, False, 'http://example.com'),
('http://example.com', 'POST', None, False, False, 'http://example.com'),
('http://example.com', 'GET', 'data', False, False, 'http://example.com'),
('http://example.com', 'GET', 'data', False, True, 'GET(3a6eb079):http://example.com'),
('http://example.com', 'POST', 'data', False, True, 'POST(3a6eb079):http://example.com'),
('http://example.com#fragment', 'GET', None, True, False, 'http://example.com#fragment'),
('http://example.com#fragment', 'GET', None, False, False, 'http://example.com'),
('http://example.com', 'DELETE', 'test', False, True, 'DELETE(9f86d081):http://example.com'),
('https://example.com?utm_content=test', 'GET', None, False, False, 'https://example.com'),
('https://example.com?utm_content=test', 'GET', None, True, False, 'https://example.com'),
('http://example.com', 'GET', None, None, False, False, 'http://example.com'),
('http://example.com', 'POST', None, None, False, False, 'http://example.com'),
('http://example.com', 'GET', None, 'data', False, False, 'http://example.com'),
(
'http://example.com',
'GET',
None,
'data',
False,
True,
'GET|e3b0c442|3a6eb079|http://example.com',
),
(
'http://example.com',
'POST',
HttpHeaders({'Content-Type': 'application/json'}),
'data',
False,
True,
'POST|60d83e70|3a6eb079|http://example.com',
),
(
'http://example.com',
'POST',
HttpHeaders({'Content-Type': 'application/json', 'Custom-Header': 'should be ignored'}),
'data',
False,
True,
'POST|60d83e70|3a6eb079|http://example.com',
),
('http://example.com#fragment', 'GET', None, None, True, False, 'http://example.com#fragment'),
('http://example.com#fragment', 'GET', None, None, False, False, 'http://example.com'),
(
'http://example.com',
'DELETE',
None,
'test',
False,
True,
'DELETE|e3b0c442|9f86d081|http://example.com',
),
('https://example.com?utm_content=test', 'GET', None, None, False, False, 'https://example.com'),
('https://example.com?utm_content=test', 'GET', None, None, True, False, 'https://example.com'),
(
'http://example.com',
'GET',
HttpHeaders({'Accept': 'text/html'}),
None,
False,
True,
'GET|f1614162|e3b0c442|http://example.com',
),
],
ids=[
'simple_get',
'simple_post',
'get_with_payload',
'get_with_payload_extended',
'post_with_payload_extended',
'post_with_payload_and_headers',
'get_with_fragment',
'get_remove_fragment',
'delete_with_payload_extended',
'get_remove_utm',
'get_keep_utm_fragment',
'get_with_headers_extended',
],
)
def test_compute_unique_key(
url: str,
method: HttpMethod,
headers: HttpHeaders | None,
payload: HttpPayload | None,
*,
keep_url_fragment: bool,
Expand All @@ -115,8 +161,9 @@ def test_compute_unique_key(
) -> None:
output = compute_unique_key(
url,
method,
payload,
method=method,
headers=headers,
payload=payload,
keep_url_fragment=keep_url_fragment,
use_extended_unique_key=use_extended_unique_key,
)
Expand Down

0 comments on commit 6c4746f

Please sign in to comment.