diff --git a/tests/test_url.py b/tests/test_url.py
index 875a7ade..0b401224 100644
--- a/tests/test_url.py
+++ b/tests/test_url.py
@@ -849,6 +849,10 @@ def test_url_query_parameter(self):
         self.assertEqual(
             url_query_parameter("product.html?id=", "id", keep_blank_values=1), ""
         )
+        self.assertEqual(
+            url_query_parameter("product.html?id=200;foo=bar", "id", separator=';'),
+            '200',
+        )
 
     def test_url_query_parameter_2(self):
         """
@@ -958,6 +962,14 @@ def test_add_or_replace_parameter_fail(self):
             "http://domain/test?arg1=v3&arg2=v2",
         )
 
+    @pytest.mark.xfail(reason="https://github.com/scrapy/w3lib/issues/164")
+    def test_add_or_replace_parameter_semicolon(self):
+        url = 'http://domain/test?arg1=v1;arg2=v2;arg3=v3'
+        self.assertEqual(add_or_replace_parameter(url, 'arg4', 'v4', separator=';'),
+                         'http://domain/test?arg1=v1;arg2=v2;arg3=v3;arg4=v4')
+        self.assertEqual(add_or_replace_parameter(url, 'arg3', 'nv3', separator=';'),
+                         'http://domain/test?arg1=v1;arg2=v2;arg3=nv3')
+
     def test_add_or_replace_parameters(self):
         url = "http://domain/test"
         self.assertEqual(
@@ -1157,6 +1169,11 @@ def test_typical_usage(self):
             "http://www.example.com/do?a=1",
         )
 
+    @pytest.mark.xfail(reason="https://github.com/scrapy/w3lib/issues/164")
+    def test_typical_usage_semicolon(self):
+        self.assertEqual(canonicalize_url("http://www.example.com/do?c=1;b=2;a=3", query_separator=';'),
+                         "http://www.example.com/do?a=3;b=2;c=1")
+
     def test_port_number(self):
         self.assertEqual(
             canonicalize_url("http://www.example.com:8888/do?a=1&b=2&c=3"),
diff --git a/w3lib/url.py b/w3lib/url.py
index 1d2658f1..7b9dce7a 100644
--- a/w3lib/url.py
+++ b/w3lib/url.py
@@ -8,6 +8,7 @@
 import posixpath
 import re
 import string
+from inspect import getfullargspec
 from typing import (
     cast,
     Callable,
@@ -20,8 +21,8 @@
     Union,
 )
 from urllib.parse import (
-    parse_qs,
-    parse_qsl,
+    parse_qs as _parse_qs,
+    parse_qsl as _parse_qsl,
     ParseResult,
     quote,
     unquote_to_bytes,
@@ -41,6 +42,33 @@
 from ._url import _SPECIAL_SCHEMES
 
 
+# Not every supported Python release accepts a ``separator`` argument in
+# urllib.parse.parse_qs()/parse_qsl().  Detect once, at import time, whether
+# the running interpreter does, so the wrappers below can adapt.
+_REMOVE_SEPARATOR = 'separator' not in getfullargspec(_parse_qs).args
+
+
+def _handle_separator(func, *args, **kwargs):
+    """Call *func*, dropping ``separator`` when the stdlib cannot accept it.
+
+    NOTE: on such interpreters a caller-supplied separator is silently
+    ignored and the stdlib's own splitting rules apply instead.
+    """
+    if _REMOVE_SEPARATOR:
+        kwargs.pop('separator', None)
+    return func(*args, **kwargs)
+
+
+def parse_qs(*args, **kwargs):
+    """Wrap urllib.parse.parse_qs, tolerating a missing ``separator`` parameter."""
+    return _handle_separator(_parse_qs, *args, **kwargs)
+
+
+def parse_qsl(*args, **kwargs):
+    """Wrap urllib.parse.parse_qsl, tolerating a missing ``separator`` parameter."""
+    return _handle_separator(_parse_qsl, *args, **kwargs)
+
+
 # error handling function for bytes-to-Unicode decoding errors with URLs
 def _quote_byte(error: UnicodeError) -> Tuple[str, int]:
     error = cast(AnyUnicodeError, error)
@@ -200,6 +228,8 @@ def url_query_parameter(
     parameter: str,
     default: Optional[str] = None,
     keep_blank_values: Union[bool, int] = 0,
+    *,
+    separator: str = '&',
 ) -> Optional[str]:
     """Return the value of a url parameter, given the url and parameter name
 
@@ -230,7 +260,9 @@ def url_query_parameter(
     """
 
     queryparams = parse_qs(
-        urlsplit(str(url))[3], keep_blank_values=bool(keep_blank_values)
+        urlsplit(str(url))[3],
+        keep_blank_values=bool(keep_blank_values),
+        separator=separator,
     )
     if parameter in queryparams:
         return queryparams[parameter][0]
@@ -305,9 +337,13 @@ def url_query_cleaner(
     return url
 
 
-def _add_or_replace_parameters(url: str, params: Dict[str, str]) -> str:
+def _add_or_replace_parameters(url: str, params: Dict[str, str], *, separator: str = '&') -> str:
     parsed = urlsplit(url)
-    current_args = parse_qsl(parsed.query, keep_blank_values=True)
+    current_args = parse_qsl(
+        parsed.query,
+        keep_blank_values=True,
+        separator=separator,
+    )
     new_args = []
     seen_params = set()
 
@@ -327,7 +363,7 @@ def _add_or_replace_parameters(url: str, params: Dict[str, str]) -> str:
     return urlunsplit(parsed._replace(query=query))
 
 
-def add_or_replace_parameter(url: str, name: str, new_value: str) -> str:
+def add_or_replace_parameter(url: str, name: str, new_value: str, *, separator: str = '&') -> str:
     """Add or remove a parameter to a given url
 
     >>> import w3lib.url
@@ -340,10 +376,10 @@ def add_or_replace_parameter(url: str, name: str, new_value: str) -> str:
     >>>
 
     """
-    return _add_or_replace_parameters(url, {name: new_value})
+    return _add_or_replace_parameters(url, {name: new_value}, separator=separator)
 
 
-def add_or_replace_parameters(url: str, new_parameters: Dict[str, str]) -> str:
+def add_or_replace_parameters(url: str, new_parameters: Dict[str, str], *, separator: str = '&') -> str:
     """Add or remove a parameters to a given url
 
     >>> import w3lib.url
@@ -355,7 +391,7 @@ def add_or_replace_parameters(url: str, new_parameters: Dict[str, str]) -> str:
     >>>
 
     """
-    return _add_or_replace_parameters(url, new_parameters)
+    return _add_or_replace_parameters(url, new_parameters, separator=separator)
 
 
 def path_to_file_uri(path: str) -> str:
@@ -528,6 +564,8 @@ def canonicalize_url(
     keep_blank_values: bool = True,
     keep_fragments: bool = False,
     encoding: Optional[str] = None,
+    *,
+    query_separator: str = '&',
 ) -> str:
     r"""Canonicalize the given url by applying the following procedures:
 
@@ -600,7 +638,11 @@ def canonicalize_url(
     # Similar considerations apply to query parts. The functionality of
     # IRIs (namely, to be able to include non-ASCII characters) can only be
     # used if the query part is encoded in UTF-8.
-    keyvals = parse_qsl_to_bytes(query, keep_blank_values)
+    keyvals = parse_qsl_to_bytes(
+        query,
+        keep_blank_values,
+        separator=query_separator,
+    )
     keyvals.sort()
     query = urlencode(keyvals)
 
@@ -642,7 +684,10 @@ def parse_url(
 
 
 def parse_qsl_to_bytes(
-    qs: str, keep_blank_values: bool = False
+    qs: str,
+    keep_blank_values: bool = False,
+    *,
+    separator: str = '&',
 ) -> List[Tuple[bytes, bytes]]:
     """Parse a query given as a string argument.
 
@@ -665,7 +710,7 @@ def parse_qsl_to_bytes(
     # with unquote_to_bytes(s)
     coerce_args = cast(Callable[..., Tuple[str, Callable[..., bytes]]], _coerce_args)
     qs, _coerce_result = coerce_args(qs)
-    pairs = [s2 for s1 in qs.split("&") for s2 in s1.split(";")]
+    pairs = qs.split(separator)
     r = []
     for name_value in pairs:
         if not name_value: