Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove chardet/charset-normalizer. #7589

Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGES/7561.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Replace automatic character set detection with a `fallback_charset_resolver` parameter
in `ClientSession` to allow user-supplied character set detection functions.
1 change: 1 addition & 0 deletions CONTRIBUTORS.txt
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ Jesus Cea
Jian Zeng
Jinkyu Yi
Joel Watts
John Parton
Jon Nabozny
Jonas Krüger Svensson
Jonas Obrist
Expand Down
26 changes: 26 additions & 0 deletions aiohttp/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,11 @@
from .tracing import Trace, TraceConfig
from .typedefs import Final, JSONEncoder, LooseCookies, LooseHeaders, StrOrURL

try:
import cchardet as chardet
except ImportError: # pragma: no cover
import charset_normalizer as chardet # type: ignore[no-redef]

__all__ = (
# client_exceptions
"ClientConnectionError",
Expand Down Expand Up @@ -159,6 +164,22 @@ class ClientTimeout:
DEFAULT_TIMEOUT: Final[ClientTimeout] = ClientTimeout(total=5 * 60)

_RetType = TypeVar("_RetType")
_CharsetResolver = Callable[[ClientResponse, bytes], str]


def _default_fallback_charset_resolver(response: ClientResponse, body: bytes) -> str:
    """Guess the charset of *body* using the imported ``chardet`` module.

    This is the default value of ``ClientSession``'s
    ``fallback_charset_resolver`` parameter.

    :param response: the response being decoded; not used here, accepted
        only so the function matches the ``_CharsetResolver`` signature.
    :param body: the raw response payload passed to ``chardet.detect``.
    :return: the detected encoding name, or ``"utf-8"`` when detection
        yields nothing.

    A :class:`DeprecationWarning` is emitted whenever the result is anything
    other than the literal string ``"utf-8"``.
    """
    ret: str = chardet.detect(body)["encoding"] or "utf-8"

    # Only warn when detection changed the outcome relative to a plain
    # "utf-8" fallback.
    if ret != "utf-8":
        warnings.warn(
            "Automatic character set detection is deprecated, use "
            "fallback_charset_resolver instead.",
            DeprecationWarning,
            stacklevel=2,
        )

    return ret


class ClientSession:
Expand Down Expand Up @@ -220,6 +241,9 @@ def __init__(
requote_redirect_url: bool = True,
trace_configs: Optional[List[TraceConfig]] = None,
read_bufsize: int = 2**16,
fallback_charset_resolver: _CharsetResolver = (
_default_fallback_charset_resolver
),
) -> None:
if loop is None:
if connector is not None:
Expand Down Expand Up @@ -313,6 +337,8 @@ def __init__(
for trace_config in self._trace_configs:
trace_config.freeze()

self._resolve_charset = fallback_charset_resolver

def __init_subclass__(cls: Type["ClientSession"]) -> None:
warnings.warn(
"Inheritance class {} from ClientSession "
Expand Down
55 changes: 28 additions & 27 deletions aiohttp/client_reqrep.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import asyncio
import codecs
import contextlib
import functools
import io
import re
Expand All @@ -12,6 +13,7 @@
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Iterable,
List,
Expand Down Expand Up @@ -66,11 +68,6 @@
ssl = None # type: ignore[assignment]
SSLContext = object # type: ignore[misc,assignment]

try:
import cchardet as chardet
except ImportError: # pragma: no cover
import charset_normalizer as chardet # type: ignore[no-redef]


__all__ = ("ClientRequest", "ClientResponse", "RequestInfo", "Fingerprint")

Expand Down Expand Up @@ -722,8 +719,8 @@ class ClientResponse(HeadersMixin):
_raw_headers: RawHeaders = None # type: ignore[assignment] # Response raw headers

_connection = None # current connection
_source_traceback = None
# setted up by ClientRequest after ClientResponse object creation
_source_traceback: Optional[traceback.StackSummary] = None
# set up by ClientRequest after ClientResponse object creation
# post-init stage allows to not change ctor signature
_closed = True # to allow __del__ for non-initialized properly response
_released = False
Expand Down Expand Up @@ -760,6 +757,15 @@ def __init__(
self._loop = loop
# store a reference to session #1985
self._session: Optional[ClientSession] = session
# Save reference to _resolve_charset, so that get_encoding() will still
# work after the response has finished reading the body.
if session is None:
# TODO: Fix session=None in tests (see ClientRequest.__init__).
self._resolve_charset: Callable[
["ClientResponse", bytes], str
] = lambda *_: "utf-8"
else:
self._resolve_charset = session._resolve_charset
if loop.get_debug():
self._source_traceback = traceback.extract_stack(sys._getframe(1))

Expand Down Expand Up @@ -1053,27 +1059,22 @@ def get_encoding(self) -> str:

encoding = mimetype.parameters.get("charset")
if encoding:
try:
codecs.lookup(encoding)
except LookupError:
encoding = None
if not encoding:
if mimetype.type == "application" and (
mimetype.subtype == "json" or mimetype.subtype == "rdap"
):
# RFC 7159 states that the default encoding is UTF-8.
# RFC 7483 defines application/rdap+json
encoding = "utf-8"
elif self._body is None:
raise RuntimeError(
"Cannot guess the encoding of " "a not yet read body"
)
else:
encoding = chardet.detect(self._body)["encoding"]
if not encoding:
encoding = "utf-8"
with contextlib.suppress(LookupError):
return codecs.lookup(encoding).name

if mimetype.type == "application" and (
mimetype.subtype == "json" or mimetype.subtype == "rdap"
):
# RFC 7159 states that the default encoding is UTF-8.
# RFC 7483 defines application/rdap+json
return "utf-8"

if self._body is None:
raise RuntimeError(
"Cannot compute fallback encoding of a not yet read body"
)

return encoding
return self._resolve_charset(self, self._body)

async def text(self, encoding: Optional[str] = None, errors: str = "strict") -> str:
"""Read response payload and decode."""
Expand Down
30 changes: 30 additions & 0 deletions docs/client_advanced.rst
Original file line number Diff line number Diff line change
Expand Up @@ -640,3 +640,33 @@ are changed so that aiohttp itself can wait on the underlying
connection to close. Please follow issue `#1925
<https://github.com/aio-libs/aiohttp/issues/1925>`_ for the progress
on this.


Character Set Detection
-----------------------

If you encounter a :exc:`UnicodeDecodeError` when using :meth:`ClientResponse.text()`
this may be because the response does not include the charset needed
to decode the body.
Dreamsorcerer marked this conversation as resolved.
Show resolved Hide resolved

If you know the correct encoding for a request, you can simply specify
the encoding as a parameter (e.g. ``resp.text("windows-1252")``).

Alternatively, :class:`ClientSession` accepts a ``fallback_charset_resolver`` parameter which
can be used to introduce charset guessing functionality. When a charset is not found
Dreamsorcerer marked this conversation as resolved.
Show resolved Hide resolved
in the Content-Type header, this function will be called to get the charset encoding. For
example, this can be used with the ``chardetng_py`` library::

from chardetng_py import detect

def charset_resolver(resp: ClientResponse, body: bytes) -> str:
tld = resp.url.host.rsplit(".", maxsplit=1)[-1]
return detect(body, allow_utf8=True, tld=tld)

ClientSession(fallback_charset_resolver=charset_resolver)

Or, if ``chardetng_py`` doesn't work for you, then ``charset-normalizer`` is another option::

from charset_normalizer import detect

ClientSession(fallback_charset_resolver=lambda r, b: detect(b)["encoding"] or "utf-8")
50 changes: 22 additions & 28 deletions docs/client_reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ The client session supports the context manager protocol for self closing.
read_bufsize=2**16, \
requote_redirect_url=False, \
trust_env=False, \
trace_configs=None)
trace_configs=None, \
fallback_charset_resolver=lambda r, b: "utf-8")
Dreamsorcerer marked this conversation as resolved.
Show resolved Hide resolved

The class for creating client sessions and making requests.

Expand Down Expand Up @@ -200,6 +201,16 @@ The client session supports the context manager protocol for self closing.
disabling. See :ref:`aiohttp-client-tracing-reference` for
more information.

:param Callable[[ClientResponse,bytes],str] fallback_charset_resolver:
A :term:`callable` that accepts a :class:`ClientResponse` and the
:class:`bytes` contents, and returns a :class:`str` which will be used as
the encoding parameter to :meth:`bytes.decode()`.

This function will be called when the charset is not known (e.g. not specified in the
Content-Type header). The default function simply defaults to ``utf-8``.
Dreamsorcerer marked this conversation as resolved.
Show resolved Hide resolved

.. versionadded:: 3.8.6

.. attribute:: closed

``True`` if the session has been closed, ``False`` otherwise.
Expand Down Expand Up @@ -1400,12 +1411,8 @@ Response object
Read response's body and return decoded :class:`str` using
specified *encoding* parameter.

If *encoding* is ``None`` content encoding is autocalculated
using ``Content-Type`` HTTP header and *charset-normalizer* tool if the
header is not provided by server.

:term:`cchardet` is used with fallback to :term:`charset-normalizer` if
*cchardet* is not available.
If *encoding* is ``None`` content encoding is determined from the
Content-Type header, or using the ``fallback_charset_resolver`` function.

Close underlying connection if data reading gets an error,
release connection otherwise.
Expand All @@ -1414,10 +1421,9 @@ Response object
``None`` for encoding autodetection
(default).

:return str: decoded *BODY*

:raise LookupError: if the encoding detected by cchardet is
unknown by Python (e.g. VISCII).
:raises: :exc:`UnicodeDecodeError` if decoding fails. See also
:meth:`get_encoding`.
Dreamsorcerer marked this conversation as resolved.
Show resolved Hide resolved

.. note::

Expand All @@ -1430,18 +1436,14 @@ Response object

await resp.text('ISO-8859-1')

.. comethod:: json(*, encoding=None, loads=json.loads, \
.. method:: json(*, encoding=None, loads=json.loads, \
content_type='application/json')
Dreamsorcerer marked this conversation as resolved.
Show resolved Hide resolved

Read response's body as *JSON*, return :class:`dict` using
specified *encoding* and *loader*. If data is not yet available
a ``read`` call will be done,
a ``read`` call will be done.

If *encoding* is ``None`` content encoding is autocalculated
using :term:`cchardet` or :term:`charset-normalizer` as fallback if
*cchardet* is not available.

if response's `content-type` does not match `content_type` parameter
If response's `content-type` does not match `content_type` parameter
:exc:`aiohttp.ContentTypeError` is raised.
To disable content type check pass ``None`` value.

Expand Down Expand Up @@ -1473,17 +1475,9 @@ Response object

.. method:: get_encoding()

Automatically detect content encoding using ``charset`` info in
``Content-Type`` HTTP header. If this info is not exists or there
are no appropriate codecs for encoding then :term:`cchardet` /
:term:`charset-normalizer` is used.

Beware that it is not always safe to use the result of this function to
decode a response. Some encodings detected by cchardet are not known by
Python (e.g. VISCII). *charset-normalizer* is not concerned by that issue.

:raise RuntimeError: if called before the body has been read,
for :term:`cchardet` usage
Retrieve content encoding using ``charset`` info in ``Content-Type`` HTTP header.
If no charset is present or the charset is not understood by Python, the
``fallback_charset_resolver`` function associated with the ``ClientSession`` is called.

.. versionadded:: 3.0

Expand Down
8 changes: 0 additions & 8 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -162,14 +162,6 @@ Dependencies
- *charset-normalizer*
- *multidict*
- *yarl*
- *Optional* :term:`cchardet` as faster replacement for
:term:`charset-normalizer`.

Install it explicitly via:

.. code-block:: bash

$ pip install cchardet

- *Optional* :term:`aiodns` for fast DNS resolving. The
library is highly recommended.
Expand Down
45 changes: 10 additions & 35 deletions tests/test_client_response.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import gc
import sys
from typing import Any
from unittest import mock

import pytest
Expand Down Expand Up @@ -440,7 +441,11 @@ def side_effect(*args, **kwargs):
assert not response.get_encoding.called


async def test_text_detect_encoding(loop, session) -> None:
@pytest.mark.parametrize("content_type", ("text/plain", "text/plain;charset=invalid"))
async def test_text_charset_resolver(
content_type: str, loop: Any, session: Any
) -> None:
session._resolve_charset = lambda r, b: "cp1251"
response = ClientResponse(
"get",
URL("http://def-cl-resp.org"),
Expand All @@ -458,43 +463,15 @@ def side_effect(*args, **kwargs):
fut.set_result('{"тест": "пройден"}'.encode("cp1251"))
return fut

response._headers = {"Content-Type": "text/plain"}
response._headers = {"Content-Type": content_type}
content = response.content = mock.Mock()
content.read.side_effect = side_effect

await response.read()
res = await response.text()
assert res == '{"тест": "пройден"}'
assert response._connection is None


async def test_text_detect_encoding_if_invalid_charset(loop, session) -> None:
response = ClientResponse(
"get",
URL("http://def-cl-resp.org"),
request_info=mock.Mock(),
writer=mock.Mock(),
continue100=None,
timer=TimerNoop(),
traces=[],
loop=loop,
session=session,
)

def side_effect(*args, **kwargs):
fut = loop.create_future()
fut.set_result('{"тест": "пройден"}'.encode("cp1251"))
return fut

response._headers = {"Content-Type": "text/plain;charset=invalid"}
content = response.content = mock.Mock()
content.read.side_effect = side_effect

await response.read()
res = await response.text()
assert res == '{"тест": "пройден"}'
assert response._connection is None
assert response.get_encoding().lower() in ("windows-1251", "maccyrillic")
assert response.get_encoding() == "cp1251"


async def test_get_encoding_body_none(loop, session) -> None:
Expand All @@ -521,7 +498,7 @@ def side_effect(*args, **kwargs):

with pytest.raises(
RuntimeError,
match="^Cannot guess the encoding of a not yet read body$",
match="^Cannot compute fallback encoding of a not yet read body$",
):
response.get_encoding()
assert response.closed
Expand Down Expand Up @@ -742,9 +719,7 @@ def test_get_encoding_unknown(loop, session) -> None:
)

response._headers = {"Content-Type": "application/json"}
with mock.patch("aiohttp.client_reqrep.chardet") as m_chardet:
m_chardet.detect.return_value = {"encoding": None}
assert response.get_encoding() == "utf-8"
assert response.get_encoding() == "utf-8"


def test_raise_for_status_2xx() -> None:
Expand Down
Loading