Release 1.3.0 (#17)
* Backport unicodedata for v12 impl into python if available

* Add aliases to CharsetNormalizerMatches class

* Add feature preemptive behaviour, looking for encoding declaration

* import aliases in __init__

* Change text in Why. More concise.

* bump 1.3.0

* initial docs

work in progress
Ousret authored Sep 30, 2019
1 parent f44ecb6 commit a2a4682
@@ -106,9 +106,8 @@ See wiki for advanced usages. *Todo, not yet available.*

## 😇 Why

When I started using Chardet, I noticed that this library was wrong most of the time
when the content was not Unicode, Gb or Big5. That is because some charsets are easily identifiable
thanks to their standards, and Chardet does a really good job at identifying those.
When I started using Chardet, I noticed that this library is unreliable nowadays. It is also
unmaintained, and most likely will never be maintained again.

I **don't care** about the **originating charset** encoding, because **two different tables** can
produce **two identical files.**
3 changes: 2 additions & 1 deletion charset_normalizer/
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# coding: utf-8
from charset_normalizer.normalizer import CharsetNormalizerMatches, CharsetNormalizerMatch
from charset_normalizer.normalizer import CharsetNormalizerMatches, CharsetNormalizerMatch, \
CharsetDetector, CharsetDoctor, EncodingDetector # Aliases
from charset_normalizer.unicode import UnicodeRangeIdentify
from charset_normalizer.probe_chaos import ProbeChaos
from charset_normalizer.probe_coherence import ProbeCoherence
6 changes: 6 additions & 0 deletions charset_normalizer/
Original file line number Diff line number Diff line change
@@ -12,3 +12,9 @@ def charset_normalizer_hook(exctype, value, traceback):

sys.excepthook = charset_normalizer_hook

import unicodedata2
sys.modules['unicodedata'] = unicodedata2
except ImportError:
39 changes: 33 additions & 6 deletions charset_normalizer/
Original file line number Diff line number Diff line change
@@ -15,6 +15,8 @@

from charset_normalizer.encoding import is_multi_byte_encoding

from charset_normalizer.probe_inherent_sign import any_specified_encoding

from loguru import logger

from hashlib import sha256
@@ -319,15 +321,16 @@ def normalize(path, steps=10, chunk_size=512, threshold=0.20):
return b_

def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_exclusion=None, explain=False):
def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_exclusion=None, preemptive_behaviour=True, explain=False):
Take a sequence of bytes that could potentially be decoded to str and discard all obvious non supported
charset encoding.
Will test input like this (with steps=4 & chunk_size=4) --> [#### #### #### ####]
:param bytes sequences: Actual sequence of bytes to analyse
:param float threshold: Maximum amount of chaos allowed on first pass
:param int chunk_size: Size to extract and analyse in each step
:param int steps: Number of steps
:param int steps: Number of steps/block to extract from sequence
:param bool preemptive_behaviour: Determine if we should look into sequence (ASCII-Mode) for pre-defined encoding
:param bool explain: Print on screen what is happening when searching for a match
:param list[str] cp_isolation: Finite list of encoding to use when searching for a match
:param list[str] cp_exclusion: Finite list of encoding to avoid when searching for a match
@@ -381,6 +384,13 @@ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20, cp_isolation
tested = set()
matches = list()

specified_encoding = any_specified_encoding(sequences) if preemptive_behaviour is True else None

if specified_encoding is not None:
'Trying to detect encoding on a sequence that seems to declare a encoding ({}).'.format(specified_encoding)

for support in supported:

k, p = support
@@ -493,8 +503,16 @@ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20, cp_isolation

if specified_encoding is not None and p == specified_encoding:'{encoding} is most likely the one. '
'Because it is specified in analysed byte sequence and '
'initial test passed successfully. '
'Disable this behaviour by setting preemptive_behaviour '
'to False', encoding=specified_encoding)
return CharsetNormalizerMatches([cnm]) if any(fingerprint_tests) is False else CharsetNormalizerMatches([matches[fingerprint_tests.index(True)]])

if (p == 'ascii' and chaos_median == 0.) or bom_available is True:'{encoding} is the most likely the one. {bom_available}','{encoding} is most likely the one. {bom_available}',
bom_available='BOM/SIG available' if bom_available else '')

@@ -503,13 +521,14 @@ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20, cp_isolation
return CharsetNormalizerMatches(matches)

def from_fp(fp, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_exclusion=None, explain=False):
def from_fp(fp, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_exclusion=None, preemptive_behaviour=True, explain=False):
:param io.BinaryIO fp:
:param int steps:
:param int chunk_size:
:param float threshold:
:param bool explain: Print on screen what is happening when searching for a match
:param bool preemptive_behaviour: Determine if we should look into sequence (ASCII-Mode) for pre-defined encoding
:param list[str] cp_isolation: Finite list of encoding to use when searching for a match
:param list[str] cp_exclusion: Finite list of encoding to avoid when searching for a match
:return: List of potential matches
@@ -522,24 +541,26 @@ def from_fp(fp, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_

def from_path(path, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_exclusion=None, explain=False):
def from_path(path, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_exclusion=None, preemptive_behaviour=True, explain=False):
:param str path:
:param int steps:
:param int chunk_size:
:param float threshold:
:param bool preemptive_behaviour: Determine if we should look into sequence (ASCII-Mode) for pre-defined encoding
:param bool explain: Print on screen what is happening when searching for a match
:param list[str] cp_isolation: Finite list of encoding to use when searching for a match
:param list[str] cp_exclusion: Finite list of encoding to avoid when searching for a match
:return: List of potential matches
:rtype: CharsetNormalizerMatches
with open(path, 'rb') as fp:
return CharsetNormalizerMatches.from_fp(fp, steps, chunk_size, threshold, cp_isolation, cp_exclusion, explain)
return CharsetNormalizerMatches.from_fp(fp, steps, chunk_size, threshold, cp_isolation, cp_exclusion, preemptive_behaviour, explain)

def could_be_from_charset(self):
@@ -596,3 +617,9 @@ def best(self):
return CharsetNormalizerMatches(

# Some aliases to CharsetNormalizerMatches, because it is too long for a class name.
# Each alias is bound to the very same class object (plain assignment, not a subclass),
# so all three names are interchangeable with CharsetNormalizerMatches.
CharsetDetector = CharsetNormalizerMatches
EncodingDetector = CharsetNormalizerMatches
CharsetDoctor = CharsetNormalizerMatches
39 changes: 39 additions & 0 deletions charset_normalizer/
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from re import findall, compile, IGNORECASE
from encodings.aliases import aliases

# Matches an in-band declaration such as `encoding="utf-8"`, `charset=latin1`
# or `coding: cp1252` (case-insensitive) and captures the declared name.
RE_POSSIBLE_ENCODING_INDICATION = compile(
    r'(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)',
    IGNORECASE
)


def any_specified_encoding(sequence):
    """
    Search in sequence (ASCII-mode) if there is any sign of declared encoding.

    The head of the sequence is decoded as ASCII (bytes that are not ASCII are
    dropped) and scanned for ``encoding``/``charset``/``coding`` declarations.
    Each declared name is lower-cased, has ``-`` replaced by ``_`` and is then
    compared against both the keys and the values of
    ``encodings.aliases.aliases``. The first declaration that matches wins.

    :param bytes sequence: Raw payload to inspect (bytes or bytearray)
    :return: The ``encodings.aliases.aliases`` value matching the declared encoding if any else None
    :rtype: str
    :raises TypeError: If sequence is neither bytes nor bytearray
    """
    if not isinstance(sequence, (bytes, bytearray)):
        raise TypeError(
            'Expected bytes or bytearray, got {}'.format(type(sequence).__name__)
        )

    seq_len = len(sequence)

    # Sequences up to 2048 bytes are scanned entirely; for longer ones only
    # the first 30% is scanned.
    search_zone = seq_len if seq_len <= 2048 else int(seq_len * 0.3)

    results = findall(
        RE_POSSIBLE_ENCODING_INDICATION,
        sequence[:search_zone].decode('ascii', errors='ignore')
    )  # type: list[str]

    for specified_encoding in results:
        # Normalise to the spelling used by encodings.aliases ("utf-8" -> "utf_8").
        specified_encoding = specified_encoding.lower().replace('-', '_')

        # The declaration may be written either as an alias (dict key) or as
        # the target name itself (dict value); both map to the dict value.
        for alias, codec in aliases.items():
            if specified_encoding in (alias, codec):
                return codec

    return None
