diff --git a/CHANGELOG.md b/CHANGELOG.md index a8a53576..eba4d0dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Fixed - Relax the TypeError exception thrown when trying to compare a CharsetMatch with anything else than a CharsetMatch. -- Improved the general reliability of the detector based on user feedbacks. (#520) (#509) (#498) (#407) +- Improved the general reliability of the detector based on user feedbacks. (#520) (#509) (#498) (#407) (#537) - Declared charset in content (preemptive detection) not changed when converting to utf-8 bytes. (#381) ## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31) diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py index a51ee35e..70a90182 100644 --- a/charset_normalizer/api.py +++ b/charset_normalizer/api.py @@ -159,6 +159,8 @@ def from_bytes( results: CharsetMatches = CharsetMatches() + early_stop_results: CharsetMatches = CharsetMatches() + sig_encoding, sig_payload = identify_sig_or_bom(sequences) if sig_encoding is not None: @@ -431,29 +433,47 @@ def from_bytes( ), ) - results.append( - CharsetMatch( - sequences, - encoding_iana, - mean_mess_ratio, - bom_or_sig_available, - cd_ratios_merged, - decoded_payload, - preemptive_declaration=specified_encoding, - ) + current_match = CharsetMatch( + sequences, + encoding_iana, + mean_mess_ratio, + bom_or_sig_available, + cd_ratios_merged, + ( + decoded_payload + if ( + is_too_large_sequence is False + or encoding_iana in [specified_encoding, "ascii", "utf_8"] + ) + else None + ), + preemptive_declaration=specified_encoding, ) + results.append(current_match) + if ( encoding_iana in [specified_encoding, "ascii", "utf_8"] and mean_mess_ratio < 0.1 ): + early_stop_results.append(current_match) + + if ( + len(early_stop_results) + and (specified_encoding is None or specified_encoding in tested) + and "ascii" in tested + and "utf_8" in tested + ): + probable_result: CharsetMatch = early_stop_results.best() # type: ignore[assignment] logger.debug( - "Encoding detection: %s is most likely the one.", encoding_iana + "Encoding detection: %s is most likely the one.", + probable_result.encoding, ) if explain: logger.removeHandler(explain_handler) logger.setLevel(previous_logger_level) - return CharsetMatches([results[encoding_iana]]) + + return CharsetMatches([probable_result]) if encoding_iana == sig_encoding: logger.debug( diff --git a/charset_normalizer/models.py b/charset_normalizer/models.py index ee5681ca..6f6b86b3 100644 --- a/charset_normalizer/models.py +++ b/charset_normalizer/models.py @@ -285,7 +285,7 @@ def append(self, item: CharsetMatch) -> None: ) ) # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage) - if len(item.raw) <= TOO_BIG_SEQUENCE: + if len(item.raw) < TOO_BIG_SEQUENCE: for match in self._results: if match.fingerprint == item.fingerprint and match.chaos == item.chaos: match.add_submatch(item)