Skip to content

Commit

Permalink
Merge pull request #533 from jawah/preemptive-patch
Browse files Browse the repository at this point in the history
fix html file is not reported as UTF8 after conversion
  • Loading branch information
Ousret authored Sep 28, 2024
2 parents 39b6f5c + bf920e1 commit 957bd6a
Show file tree
Hide file tree
Showing 7 changed files with 136 additions and 13 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,13 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [3.3.3](https://github.com/Ousret/charset_normalizer/compare/3.3.2...master) (2024-09-??)

### Added
- Argument `--no-preemptive` in the CLI to prevent the detector from searching for hints.

### Fixed
- Relax the TypeError exception thrown when trying to compare a CharsetMatch with anything else than a CharsetMatch.
- Improved the general reliability of the detector based on user feedback. (#520) (#509) (#498) (#407)
- Declared charset in content (preemptive detection) not changed when converting to utf-8 bytes. (#381)

## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)

Expand Down
9 changes: 8 additions & 1 deletion charset_normalizer/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,13 @@ def from_bytes(
and not lazy_str_hard_failure
):
fallback_entry = CharsetMatch(
sequences, encoding_iana, threshold, False, [], decoded_payload
sequences,
encoding_iana,
threshold,
False,
[],
decoded_payload,
preemptive_declaration=specified_encoding,
)
if encoding_iana == specified_encoding:
fallback_specified = fallback_entry
Expand Down Expand Up @@ -433,6 +439,7 @@ def from_bytes(
bom_or_sig_available,
cd_ratios_merged,
decoded_payload,
preemptive_declaration=specified_encoding,
)
)

Expand Down
36 changes: 30 additions & 6 deletions charset_normalizer/cli/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,14 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
dest="force",
help="Replace file without asking if you are sure, use this flag with caution.",
)
parser.add_argument(
"-i",
"--no-preemptive",
action="store_true",
default=False,
dest="no_preemptive",
help="Disable looking at a charset declaration to hint the detector.",
)
parser.add_argument(
"-t",
"--threshold",
Expand All @@ -133,31 +141,47 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
args = parser.parse_args(argv)

if args.replace is True and args.normalize is False:
if args.files:
for my_file in args.files:
my_file.close()
print("Use --replace in addition of --normalize only.", file=sys.stderr)
return 1

if args.force is True and args.replace is False:
if args.files:
for my_file in args.files:
my_file.close()
print("Use --force in addition of --replace only.", file=sys.stderr)
return 1

if args.threshold < 0.0 or args.threshold > 1.0:
if args.files:
for my_file in args.files:
my_file.close()
print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
return 1

x_ = []

for my_file in args.files:
matches = from_fp(my_file, threshold=args.threshold, explain=args.verbose)
matches = from_fp(
my_file,
threshold=args.threshold,
explain=args.verbose,
preemptive_behaviour=args.no_preemptive is False,
)

best_guess = matches.best()

if best_guess is None:
print(
'Unable to identify originating encoding for "{}". {}'.format(
my_file.name,
"Maybe try increasing maximum amount of chaos."
if args.threshold < 1.0
else "",
(
"Maybe try increasing maximum amount of chaos."
if args.threshold < 1.0
else ""
),
),
file=sys.stderr,
)
Expand Down Expand Up @@ -258,8 +282,8 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
try:
x_[0].unicode_path = join(dir_path, ".".join(o_))

with open(x_[0].unicode_path, "w", encoding="utf-8") as fp:
fp.write(str(best_guess))
with open(x_[0].unicode_path, "wb") as fp:
fp.write(best_guess.output())
except IOError as e:
print(str(e), file=sys.stderr)
if my_file.closed is False:
Expand Down
25 changes: 23 additions & 2 deletions charset_normalizer/models.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from encodings.aliases import aliases
from hashlib import sha256
from json import dumps
from re import sub
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union

from .constant import TOO_BIG_SEQUENCE
from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
from .utils import iana_name, is_multi_byte_encoding, unicode_range


Expand All @@ -16,6 +17,7 @@ def __init__(
has_sig_or_bom: bool,
languages: "CoherenceMatches",
decoded_payload: Optional[str] = None,
preemptive_declaration: Optional[str] = None,
):
self._payload: bytes = payload

Expand All @@ -33,6 +35,8 @@ def __init__(

self._string: Optional[str] = decoded_payload

self._preemptive_declaration: Optional[str] = preemptive_declaration

def __eq__(self, other: object) -> bool:
if not isinstance(other, CharsetMatch):
if isinstance(other, str):
Expand Down Expand Up @@ -208,7 +212,24 @@ def output(self, encoding: str = "utf_8") -> bytes:
"""
if self._output_encoding is None or self._output_encoding != encoding:
self._output_encoding = encoding
self._output_payload = str(self).encode(encoding, "replace")
decoded_string = str(self)
if (
self._preemptive_declaration is not None
and self._preemptive_declaration.lower()
not in ["utf-8", "utf8", "utf_8"]
):
patched_header = sub(
RE_POSSIBLE_ENCODING_INDICATION,
lambda m: m.string[m.span()[0] : m.span()[1]].replace(
m.groups()[0], iana_name(self._output_encoding) # type: ignore[arg-type]
),
decoded_string[:8192],
1,
)

decoded_string = patched_header + decoded_string[8192:]

self._output_payload = decoded_string.encode(encoding, "replace")

return self._output_payload # type: ignore

Expand Down
5 changes: 1 addition & 4 deletions docs/community/featured.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,7 @@ your level or opinions.
Niquests
--------

Started as a simple thought..

.. image:: https://i.imgflip.com/7xet0f.jpg
:width: 200
Started as a simple thought.. IE 11 has built-in HTTP/2 support while Requests 2.32 does not!

Most of our programs that interact with HTTP server are built with ``requests`` and
we aren't likely to switch without a substantial effort.
Expand Down
30 changes: 30 additions & 0 deletions tests/test_edge_case.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,33 @@ def test_unicode_edge_case():

assert best_guess is not None, "Payload should have given something, detection failure"
assert best_guess.encoding == "utf_8", "UTF-8 payload wrongly detected"


def test_issue_gh520():
    """The minority-character filter must not strip Basic Latin characters (GH #520)."""
    raw = b"/includes/webform.compon\xd2\xaants.inc/"

    guess = from_bytes(raw).best()

    assert guess is not None, "Payload should have given something, detection failure"
    assert "Basic Latin" in guess.alphabets


def test_issue_gh509():
    """A payload of two common ASCII punctuation marks must detect as plain ASCII (GH #509)."""
    raw = b");"

    guess = from_bytes(raw).best()

    assert guess is not None, "Payload should have given something, detection failure"
    assert guess.encoding == "ascii"


def test_issue_gh498():
    """A Cyrillic payload must not be mistaken for utf-16-le again (GH #498)."""
    raw = b'\x84\xae\xaa\xe3\xac\xa5\xad\xe2 Microsoft Word.docx'

    guess = from_bytes(raw).best()

    assert guess is not None, "Payload should have given something, detection failure"
    assert "Cyrillic" in guess.alphabets
40 changes: 40 additions & 0 deletions tests/test_preemptive_detection.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pytest

from charset_normalizer.utils import any_specified_encoding
from charset_normalizer import CharsetMatch


@pytest.mark.parametrize(
Expand All @@ -24,3 +25,42 @@ def test_detect_most_common_body_encoding(payload, expected_encoding):
)

assert specified_encoding == expected_encoding, "Unable to determine properly encoding from given body"


@pytest.mark.parametrize(
    "payload, expected_outcome",
    [
        (b'<?xml version="1.0" encoding="EUC-JP"?>', b'<?xml version="1.0" encoding="utf_8"?>'),
        (b'<html><head><meta charset="utf-8"></head></html>', b'<html><head><meta charset="utf-8"></head></html>'),
        (b'<html><head><meta charset="utf-57"></head></html>', b'<html><head><meta charset="utf-57"></head></html>'),
        (b'# coding: utf-8', b'# coding: utf-8'),
        (b'<?xml version="1.0" encoding="UTF-8"?>', b'<?xml version="1.0" encoding="UTF-8"?>'),
        (b'<?xml version="1.0" encoding="US-ASCII"?>', b'<?xml version="1.0" encoding="utf_8"?>'),
        (b'<?xml version="1.0" encoding="JohaB"?>', b'<?xml version="1.0" encoding="utf_8"?>'),
        (b'<html><head><meta charset=WINDOWS-1252></head></html>', b'<html><head><meta charset=utf_8></head></html>'),
        (b'<html><head><meta charset="WINDOWS-1256"></head></html>', b'<html><head><meta charset="utf_8"></head></html>'),
    ]
)
def test_preemptive_mark_replacement(payload, expected_outcome):
    """
    When emitting the Unicode-converted bytes, any non-UTF-8 declarative
    charset marker in the document head should be rewritten to utf-8.
    This tests that rewrite.
    """
    # Declared charset found in the payload, if any (None when absent/unknown).
    declared = any_specified_encoding(payload)

    # Fall back to utf-8 when no declaration was detected.
    chosen = "utf-8" if declared is None else declared

    match = CharsetMatch(
        payload,
        chosen,
        0.,
        False,
        [],
        preemptive_declaration=declared,
    )

    assert match.output() == expected_outcome

0 comments on commit 957bd6a

Please sign in to comment.