From 27a5fd8cb8c88537216d7a498eba9d9177951d76 Mon Sep 17 00:00:00 2001 From: Sidney Markowitz Date: Tue, 12 Dec 2023 05:21:18 +1300 Subject: [PATCH] gh-94606: Fix error when message with Unicode surrogate not surrogateescaped string (GH-94641) Co-authored-by: Serhiy Storchaka --- Lib/email/message.py | 29 ++++++++++--------- Lib/email/utils.py | 4 +-- Lib/test/test_email/test_message.py | 29 +++++++++++++++++++ ...2-07-07-05-37-53.gh-issue-94606.hojJ54.rst | 3 ++ 4 files changed, 49 insertions(+), 16 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2022-07-07-05-37-53.gh-issue-94606.hojJ54.rst diff --git a/Lib/email/message.py b/Lib/email/message.py index 411118c74dabb4..fe769580fed5d0 100644 --- a/Lib/email/message.py +++ b/Lib/email/message.py @@ -289,25 +289,26 @@ def get_payload(self, i=None, decode=False): # cte might be a Header, so for now stringify it. cte = str(self.get('content-transfer-encoding', '')).lower() # payload may be bytes here. - if isinstance(payload, str): - if utils._has_surrogates(payload): - bpayload = payload.encode('ascii', 'surrogateescape') - if not decode: + if not decode: + if isinstance(payload, str) and utils._has_surrogates(payload): + try: + bpayload = payload.encode('ascii', 'surrogateescape') try: payload = bpayload.decode(self.get_param('charset', 'ascii'), 'replace') except LookupError: payload = bpayload.decode('ascii', 'replace') - elif decode: - try: - bpayload = payload.encode('ascii') - except UnicodeError: - # This won't happen for RFC compliant messages (messages - # containing only ASCII code points in the unicode input). - # If it does happen, turn the string into bytes in a way - # guaranteed not to fail. - bpayload = payload.encode('raw-unicode-escape') - if not decode: + except UnicodeEncodeError: + pass return payload + if isinstance(payload, str): + try: + bpayload = payload.encode('ascii', 'surrogateescape') + except UnicodeEncodeError: + # This won't happen for RFC compliant messages (messages + # containing only ASCII code points in the unicode input). + # If it does happen, turn the string into bytes in a way + # guaranteed not to fail. + bpayload = payload.encode('raw-unicode-escape') if cte == 'quoted-printable': return quopri.decodestring(bpayload) elif cte == 'base64': diff --git a/Lib/email/utils.py b/Lib/email/utils.py index a49a8fa986ce0c..9175f2fdb6e69e 100644 --- a/Lib/email/utils.py +++ b/Lib/email/utils.py @@ -44,10 +44,10 @@ escapesre = re.compile(r'[\\"]') def _has_surrogates(s): - """Return True if s contains surrogate-escaped binary data.""" + """Return True if s may contain surrogate-escaped binary data.""" # This check is based on the fact that unless there are surrogates, utf8 # (Python's default encoding) can encode any string. This is the fastest - # way to check for surrogates, see issue 11454 for timings. + # way to check for surrogates, see bpo-11454 (moved to gh-55663) for timings. try: s.encode() return False diff --git a/Lib/test/test_email/test_message.py b/Lib/test/test_email/test_message.py index d3f396f02e7a72..034f7626c1fc7c 100644 --- a/Lib/test/test_email/test_message.py +++ b/Lib/test/test_email/test_message.py @@ -748,6 +748,35 @@ def test_iter_attachments_mutation(self): self.assertEqual(len(list(m.iter_attachments())), 2) self.assertEqual(m.get_payload(), orig) + get_payload_surrogate_params = { + + 'good_surrogateescape': ( + "String that can be encod\udcc3\udcabd with surrogateescape", + b'String that can be encod\xc3\xabd with surrogateescape' + ), + + 'string_with_utf8': ( + "String with utf-8 charactër", + b'String with utf-8 charact\xebr' + ), + + 'surrogate_and_utf8': ( + "String that cannot be ëncod\udcc3\udcabd with surrogateescape", + b'String that cannot be \xebncod\\udcc3\\udcabd with surrogateescape' + ), + + 'out_of_range_surrogate': ( + "String with \udfff cannot be encoded with surrogateescape", + b'String with \\udfff cannot be encoded with surrogateescape' + ), + } + + def get_payload_surrogate_as_gh_94606(self, msg, expected): + """test for GH issue 94606""" + m = self._str_msg(msg) + payload = m.get_payload(decode=True) + self.assertEqual(expected, payload) + class TestEmailMessage(TestEmailMessageBase, TestEmailBase): message = EmailMessage diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-07-07-05-37-53.gh-issue-94606.hojJ54.rst b/Misc/NEWS.d/next/Core and Builtins/2022-07-07-05-37-53.gh-issue-94606.hojJ54.rst new file mode 100644 index 00000000000000..5201ab7d842088 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2022-07-07-05-37-53.gh-issue-94606.hojJ54.rst @@ -0,0 +1,3 @@ +Fix UnicodeEncodeError when :func:`email.message.get_payload` reads a message +with a Unicode surrogate character and the message content is not well-formed for +surrogateescape encoding. Patch by Sidney Markowitz.