diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 2a9acf13..00000000 --- a/setup.cfg +++ /dev/null @@ -1,2 +0,0 @@ -[bdist_wheel] -universal = 1 diff --git a/setup.py b/setup.py index e0e8fe4d..9ebff734 100644 --- a/setup.py +++ b/setup.py @@ -13,6 +13,7 @@ include_package_data=True, zip_safe=False, platforms=["Any"], + python_requires=">=3.6", classifiers=[ "Development Status :: 5 - Production/Stable", "License :: OSI Approved :: BSD License", diff --git a/tests/test_encoding.py b/tests/test_encoding.py index 33d7f110..dfda2032 100644 --- a/tests/test_encoding.py +++ b/tests/test_encoding.py @@ -149,7 +149,7 @@ def _assert_encoding(self, content_type, body, expected_encoding, expected_unico else: self.assertTrue( body_unicode in expected_unicode, - "%s is not in %s" % (body_unicode, expected_unicode), + f"{body_unicode} is not in {expected_unicode}", ) def test_content_type_and_conversion(self): diff --git a/tests/test_html.py b/tests/test_html.py index f6ca90d2..d61a15ca 100644 --- a/tests/test_html.py +++ b/tests/test_html.py @@ -124,7 +124,7 @@ def test_missing_semicolon(self): ): self.assertEqual(replace_entities(entity, encoding="cp1252"), result) self.assertEqual( - replace_entities("x%sy" % entity, encoding="cp1252"), "x%sy" % result + replace_entities(f"x{entity}y", encoding="cp1252"), f"x{result}y" ) def test_encoding(self): diff --git a/tests/test_url.py b/tests/test_url.py index fe9ee999..f721bd62 100644 --- a/tests/test_url.py +++ b/tests/test_url.py @@ -266,12 +266,8 @@ def test_safe_url_idna_encoding_failure(self): # DNS label too long self.assertEqual( - safe_url_string( - "http://www.{label}.com/résumé?q=résumé".format(label="example" * 11) - ), - "http://www.{label}.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9".format( - label="example" * 11 - ), + safe_url_string(f"http://www.{'example' * 11}.com/résumé?q=résumé"), + f"http://www.{'example' * 11}.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9", ) def test_safe_url_port_number(self): @@ 
-971,12 +967,8 @@ def test_canonicalize_url_idna_exceptions(self): # DNS label too long self.assertEqual( - canonicalize_url( - "http://www.{label}.com/résumé?q=résumé".format(label="example" * 11) - ), - "http://www.{label}.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9".format( - label="example" * 11 - ), + canonicalize_url(f"http://www.{'example' * 11}.com/résumé?q=résumé"), + f"http://www.{'example' * 11}.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9", ) def test_preserve_nonfragment_hash(self): @@ -1033,7 +1025,7 @@ def test_bytes_uri(self): def test_unicode_uri(self): result = parse_data_uri("data:,é") - self.assertEqual(result.data, "é".encode("utf-8")) + self.assertEqual(result.data, "é".encode()) def test_default_mediatype(self): result = parse_data_uri("data:;charset=iso-8859-7,%be%d3%be") diff --git a/tox.ini b/tox.ini index a7c9ad2b..3e76fbdd 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. [tox] -envlist = py27, pypy, py35, py36, py37, py38, py39, py310, pypy3, docs, security, flake8, pylint, black +envlist = py36, py37, py38, py39, py310, pypy3, docs, security, flake8, pylint, black [testenv] deps = diff --git a/w3lib/encoding.py b/w3lib/encoding.py index 32252105..86b678be 100644 --- a/w3lib/encoding.py +++ b/w3lib/encoding.py @@ -2,10 +2,9 @@ Functions for handling encoding of web pages """ import re, codecs, encodings -from sys import version_info from typing import Callable, Match, Optional, Tuple, Union, cast from w3lib._types import AnyUnicodeError, StrOrBytes -from w3lib.util import to_native_str +import w3lib.util _HEADER_ENCODING_RE = re.compile(r"charset=([\w-]+)", re.I) @@ -46,6 +45,7 @@ def http_content_type_encoding(content_type: Optional[str]) -> Optional[str]: _XML_ENCODING_RE = _TEMPLATE % ("encoding", r"(?P<xmlcharset>[\w-]+)") # check for meta tags, or xml decl.
and stop search if a body tag is encountered +# pylint: disable=consider-using-f-string _BODY_ENCODING_PATTERN = ( r"<\s*(?:meta%s(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)" % (_SKIP_ATTRS, _HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE) @@ -93,7 +93,7 @@ def html_body_declared_encoding(html_body_str: StrOrBytes) -> Optional[str]: or match.group("xmlcharset") ) if encoding: - return resolve_encoding(to_native_str(encoding)) + return resolve_encoding(w3lib.util.to_unicode(encoding)) return None @@ -163,7 +163,7 @@ def resolve_encoding(encoding_alias: str) -> Optional[str]: (codecs.BOM_UTF16_LE, "utf-16-le"), (codecs.BOM_UTF8, "utf-8"), ] -_FIRST_CHARS = set(c[0] for (c, _) in _BOM_TABLE) +_FIRST_CHARS = {c[0] for (c, _) in _BOM_TABLE} def read_bom(data: bytes) -> Union[Tuple[None, None], Tuple[str, bytes]]: @@ -208,9 +208,7 @@ def to_unicode(data_str: bytes, encoding: str) -> str: Characters that cannot be converted will be converted to ``\\ufffd`` (the unicode replacement character). 
""" - return data_str.decode( - encoding, "replace" if version_info[0:2] >= (3, 3) else "w3lib_replace" - ) + return data_str.decode(encoding, "replace") def html_to_unicode( diff --git a/w3lib/html.py b/w3lib/html.py index 634d90f5..8c5c32de 100644 --- a/w3lib/html.py +++ b/w3lib/html.py @@ -228,9 +228,7 @@ def remove_tags_with_content( utext = to_unicode(text, encoding) if which_ones: - tags = "|".join( - [r"<%s\b.*?|<%s\s*/>" % (tag, tag, tag) for tag in which_ones] - ) + tags = "|".join([fr"<{tag}\b.*?|<{tag}\s*/>" for tag in which_ones]) retags = re.compile(tags, re.DOTALL | re.IGNORECASE) utext = retags.sub("", utext) return utext diff --git a/w3lib/http.py b/w3lib/http.py index 4ea31fad..e14e4345 100644 --- a/w3lib/http.py +++ b/w3lib/http.py @@ -1,6 +1,6 @@ from base64 import urlsafe_b64encode from typing import Any, List, MutableMapping, Optional, AnyStr, Sequence, Union, Mapping -from w3lib.util import to_bytes, to_native_str +from w3lib.util import to_bytes, to_unicode HeadersDictInput = Mapping[bytes, Union[Any, Sequence]] HeadersDictOutput = MutableMapping[bytes, List[bytes]] @@ -97,7 +97,7 @@ def basic_auth_header( """ - auth = "%s:%s" % (to_native_str(username), to_native_str(password)) + auth = f"{to_unicode(username)}:{to_unicode(password)}" # XXX: RFC 2617 doesn't define encoding, but ISO-8859-1 # seems to be the most widely used encoding here. See also: # http://greenbytes.de/tech/webdav/draft-ietf-httpauth-basicauth-enc-latest.html diff --git a/w3lib/url.py b/w3lib/url.py index 71398516..0592a8bf 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -319,7 +319,7 @@ def path_to_file_uri(path: str) -> str: x = pathname2url(os.path.abspath(path)) if os.name == "nt": x = x.replace("|", ":") # http://bugs.python.org/issue5861 - return "file:///%s" % x.lstrip("/") + return f"file:///{x.lstrip('/')}" def file_uri_to_path(uri: str) -> str: @@ -344,6 +344,7 @@ def any_to_uri(uri_or_path: str) -> str: _char = set(map(chr, range(127))) # RFC 2045 token. 
+# pylint: disable=consider-using-f-string _token = r"[{}]+".format( re.escape( "".join( @@ -359,6 +360,7 @@ def any_to_uri(uri_or_path: str) -> str: ) # RFC 822 quoted-string, without surrounding quotation marks. +# pylint: disable=consider-using-f-string _quoted_string = r"(?:[{}]|(?:\\[{}]))*".format( re.escape("".join(_char - {'"', "\\", "\r"})), re.escape("".join(_char)) )