Skip to content

Commit

Permalink
fix: use strip in headers normalization
Browse files Browse the repository at this point in the history
  • Loading branch information
vdusek committed Oct 23, 2024
1 parent ee3d14b commit 060ed1e
Show file tree
Hide file tree
Showing 3 changed files with 99 additions and 98 deletions.
4 changes: 2 additions & 2 deletions src/crawlee/_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@


def _normalize_headers(headers: Mapping[str, str]) -> dict[str, str]:
    """Converts all header keys to lowercase, strips whitespace, and returns them sorted by key.

    Surrounding whitespace is stripped from both the header names and the header values. If several input
    names normalize to the same key, the value that comes last in the iteration order of `headers` is kept.

    Args:
        headers: The header names mapped to their values.

    Returns:
        A new dictionary with normalized names and values, inserted in ascending order of the normalized name.
    """
    normalized_headers = {k.lower().strip(): v.strip() for k, v in headers.items()}
    # Sorting the items makes the resulting dictionary's iteration order independent of the input order.
    sorted_headers = sorted(normalized_headers.items())
    return dict(sorted_headers)

Expand Down
10 changes: 5 additions & 5 deletions src/crawlee/_utils/requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,11 +102,11 @@ def compute_unique_key(
Args:
url: The request URL.
method: The HTTP method, defaults to 'GET'.
headers: The HTTP headers, defaults to None.
payload: The data to be sent as the request body, defaults to None.
keep_url_fragment: A flag indicating whether to keep the URL fragment, defaults to False.
use_extended_unique_key: A flag indicating whether to include a hashed payload in the key, defaults to False.
method: The HTTP method.
headers: The HTTP headers.
payload: The data to be sent as the request body.
keep_url_fragment: A flag indicating whether to keep the URL fragment.
use_extended_unique_key: A flag indicating whether to include a hashed payload in the key.
Returns:
A string representing the unique key for the request.
Expand Down
183 changes: 92 additions & 91 deletions tests/unit/_utils/test_requests.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,10 @@
from __future__ import annotations

from typing import TYPE_CHECKING

import pytest

from crawlee._types import HttpHeaders
from crawlee._utils.requests import compute_unique_key, normalize_url, unique_key_to_request_id

if TYPE_CHECKING:
from crawlee._types import HttpMethod, HttpPayload


def test_unique_key_to_request_id_length() -> None:
unique_key = 'exampleKey123'
Expand Down Expand Up @@ -41,7 +36,7 @@ def test_unique_key_to_request_id_consistency() -> None:
'url_unsafe_characters',
],
)
def test_unique_key_to_request_id_known_values(unique_key: str, expected_request_id: str) -> None:
def test_unique_key_to_request_id_matches_known_values(unique_key: str, expected_request_id: str) -> None:
request_id = unique_key_to_request_id(unique_key)
assert request_id == expected_request_id, f'Unique key "{unique_key}" should produce the expected request ID.'

Expand Down Expand Up @@ -78,93 +73,99 @@ def test_normalize_url(url: str, expected_output: str, *, keep_url_fragment: boo
assert output == expected_output


@pytest.mark.parametrize(
('url', 'method', 'headers', 'payload', 'keep_url_fragment', 'use_extended_unique_key', 'expected_output'),
[
('http://example.com', 'GET', None, None, False, False, 'http://example.com'),
('http://example.com', 'POST', None, None, False, False, 'http://example.com'),
('http://example.com', 'GET', None, 'data', False, False, 'http://example.com'),
(
'http://example.com',
'GET',
None,
'data',
False,
True,
'GET|e3b0c442|3a6eb079|http://example.com',
),
(
'http://example.com',
'POST',
HttpHeaders({'Content-Type': 'application/json'}),
'data',
False,
True,
'POST|60d83e70|3a6eb079|http://example.com',
),
(
'http://example.com',
'POST',
HttpHeaders({'Content-Type': 'application/json', 'Custom-Header': 'should be ignored'}),
'data',
False,
True,
'POST|60d83e70|3a6eb079|http://example.com',
),
('http://example.com#fragment', 'GET', None, None, True, False, 'http://example.com#fragment'),
('http://example.com#fragment', 'GET', None, None, False, False, 'http://example.com'),
(
'http://example.com',
'DELETE',
None,
'test',
False,
True,
'DELETE|e3b0c442|9f86d081|http://example.com',
),
('https://example.com?utm_content=test', 'GET', None, None, False, False, 'https://example.com'),
('https://example.com?utm_content=test', 'GET', None, None, True, False, 'https://example.com'),
(
'http://example.com',
'GET',
HttpHeaders({'Accept': 'text/html'}),
None,
False,
True,
'GET|f1614162|e3b0c442|http://example.com',
),
],
ids=[
'simple_get',
'simple_post',
'get_with_payload',
'get_with_payload_extended',
'post_with_payload_extended',
'post_with_payload_and_headers',
'get_with_fragment',
'get_remove_fragment',
'delete_with_payload_extended',
'get_remove_utm',
'get_keep_utm_fragment',
'get_with_headers_extended',
],
)
def test_compute_unique_key(
url: str,
method: HttpMethod,
headers: HttpHeaders | None,
payload: HttpPayload | None,
*,
keep_url_fragment: bool,
use_extended_unique_key: bool,
expected_output: str,
) -> None:
output = compute_unique_key(
def test_compute_unique_key_basic() -> None:
    """Checks that the default unique key equals the URL for both GET and POST requests."""
    target = 'https://crawlee.dev'
    get_key = compute_unique_key(target, method='GET')
    post_key = compute_unique_key(target, method='POST')
    assert target == get_key == post_key


def test_compute_unique_key_handles_fragments() -> None:
    """Checks that the URL fragment is kept or dropped according to `keep_url_fragment`."""
    target = 'https://crawlee.dev#fragment'

    # Fragment requested: the key is expected to be the URL as given.
    kept = compute_unique_key(target, keep_url_fragment=True)
    assert kept == target

    # Fragment not requested: the key is expected to be the URL without it.
    dropped = compute_unique_key(target, 'GET', keep_url_fragment=False)
    assert dropped == 'https://crawlee.dev'


def test_compute_unique_key_handles_payload() -> None:
    """Checks how the payload contributes to the unique key, with and without the extended form."""
    target = 'https://crawlee.dev'
    body = '{"key": "value"}'

    # Without the extended unique key, the payload is expected to have no effect.
    plain_key = compute_unique_key(target, method='POST', payload=body, use_extended_unique_key=False)
    assert plain_key == target

    # Extended unique key with no payload at all.
    no_payload_key = compute_unique_key(target, method='POST', payload=None, use_extended_unique_key=True)
    assert no_payload_key == 'POST|e3b0c442|e3b0c442|https://crawlee.dev'

    # Extended unique key with the payload given as a string.
    str_payload_key = compute_unique_key(target, method='POST', payload=body, use_extended_unique_key=True)
    assert str_payload_key == 'POST|e3b0c442|9724c1e2|https://crawlee.dev'

    # Extended unique key with the same payload given as bytes; the expected key is identical.
    bytes_payload_key = compute_unique_key(
        target,
        method='POST',
        payload=body.encode(),
        use_extended_unique_key=True,
    )
    assert bytes_payload_key == 'POST|e3b0c442|9724c1e2|https://crawlee.dev'


def test_compute_unique_key_handles_headers() -> None:
    """Checks how headers contribute to the unique key, with and without the extended form."""
    target = 'https://crawlee.dev'
    expected_extended_key = 'GET|4e1a2cf6|e3b0c442|https://crawlee.dev'

    base_headers = HttpHeaders({'Accept': '*/*', 'Content-Type': 'application/json'})

    # Without the extended unique key, headers are expected to have no effect.
    plain_key = compute_unique_key(target, headers=base_headers, use_extended_unique_key=False)
    assert plain_key == target

    extended_key = compute_unique_key(target, headers=base_headers, use_extended_unique_key=True)
    assert extended_key == expected_extended_key

    # Accept-Encoding header should not be included, so the key is expected to stay the same.
    headers_with_encoding = HttpHeaders(
        {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Content-Type': 'application/json'}
    )
    extended_key = compute_unique_key(target, headers=headers_with_encoding, use_extended_unique_key=True)
    assert extended_key == expected_extended_key


def test_compute_unique_key_complex() -> None:
    """Checks the unique key when method, headers and payload are all supplied together."""
    url = 'https://crawlee.dev'
    headers = HttpHeaders({'Accept': '*/*', 'Content-Type': 'application/json'})
    payload = b'{"key": "value"}'

    # Without the extended unique key, only the URL is expected in the result.
    uk = compute_unique_key(
        url,
        method='POST',
        headers=headers,
        payload=payload,
        use_extended_unique_key=False,
    )
    assert uk == url

    # Each keyword is passed exactly once; a repeated keyword argument is a SyntaxError.
    extended_uk = compute_unique_key(
        url,
        method='POST',
        headers=headers,
        payload=payload,
        use_extended_unique_key=True,
    )
    assert extended_uk == 'POST|4e1a2cf6|9724c1e2|https://crawlee.dev'


def test_compute_unique_key_post_with_none_payload() -> None:
    """Checks the extended unique key of a POST request that has no payload."""
    target = 'https://crawlee.dev'
    actual = compute_unique_key(target, 'POST', payload=None, use_extended_unique_key=True)
    assert actual == 'POST|e3b0c442|e3b0c442|https://crawlee.dev'


def test_compute_unique_key_with_whitespace_in_headers() -> None:
    """Checks that whitespace around a header value does not change the extended unique key."""
    target = 'https://crawlee.dev'
    expected_key = 'GET|60d83e70|e3b0c442|https://crawlee.dev'

    clean_headers = HttpHeaders({'Content-Type': 'application/json'})
    padded_headers = HttpHeaders({'Content-Type': ' application/json '})

    # Both header variants are expected to produce exactly the same key.
    for candidate in (clean_headers, padded_headers):
        key = compute_unique_key(target, headers=candidate, use_extended_unique_key=True)
        assert key == expected_key

0 comments on commit 060ed1e

Please sign in to comment.