Skip to content

Commit

Permalink
Merge pull request #59 from lingsamuel/ignore-ascii
Browse files Browse the repository at this point in the history
Fix UTF-8 prober fullLen calculation; ignore basic ASCII characters
  • Loading branch information
aadsm authored Jun 30, 2020
2 parents 166cb1a + f619181 commit f88ae28
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 9 deletions.
12 changes: 10 additions & 2 deletions dist/jschardet.js
Original file line number Diff line number Diff line change
Expand Up @@ -7700,14 +7700,15 @@ function UTF8Prober() {
this._mNumOfMBChar = 0;
this._mMBCharLen = 0;
this._mFullLen = 0;
this._mBasicAsciiLen = 0;
}

// Returns the charset name as a constant string: always "UTF-8".
this.getCharsetName = function() {
return "UTF-8";
}

this.feed = function(aBuf) {
this._mFullLen = aBuf.length;
this._mFullLen += aBuf.length;
for( var i = 0, c; i < aBuf.length; i++ ) {
c = aBuf[i];
var codingState = this._mCodingSM.nextState(c);
Expand All @@ -7721,6 +7722,8 @@ function UTF8Prober() {
if( this._mCodingSM.getCurrentCharLen() >= 2 ) {
this._mNumOfMBChar++;
this._mMBCharLen += this._mCodingSM.getCurrentCharLen();
} else if( c.charCodeAt(0) < 128 ) { // codes higher than 127 are extended ASCII
this._mBasicAsciiLen++;
}
}
}
Expand All @@ -7736,7 +7739,12 @@ function UTF8Prober() {

this.getConfidence = function() {
var unlike = 0.99;
if( this._mNumOfMBChar < 6 && (this._mMBCharLen / this._mFullLen) <= 0.6 ) {
var mbCharRatio = 0;
var nonBasciAsciiLen = (this._mFullLen - this._mBasicAsciiLen);
if( nonBasciAsciiLen > 0 ) {
mbCharRatio = this._mMBCharLen / nonBasciAsciiLen;
}
if( this._mNumOfMBChar < 6 && mbCharRatio <= 0.6 ) {
for( var i = 0; i < this._mNumOfMBChar; i++ ) {
unlike *= Math.pow(ONE_CHAR_PROB, this._mNumOfMBChar);
}
Expand Down
6 changes: 3 additions & 3 deletions dist/jschardet.min.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 10 additions & 2 deletions src/utf8prober.js
Original file line number Diff line number Diff line change
Expand Up @@ -49,14 +49,15 @@ function UTF8Prober() {
this._mNumOfMBChar = 0;
this._mMBCharLen = 0;
this._mFullLen = 0;
this._mBasicAsciiLen = 0;
}

// Returns the charset name as a constant string: always "UTF-8".
this.getCharsetName = function() {
return "UTF-8";
}

this.feed = function(aBuf) {
this._mFullLen = aBuf.length;
this._mFullLen += aBuf.length;
for( var i = 0, c; i < aBuf.length; i++ ) {
c = aBuf[i];
var codingState = this._mCodingSM.nextState(c);
Expand All @@ -70,6 +71,8 @@ function UTF8Prober() {
if( this._mCodingSM.getCurrentCharLen() >= 2 ) {
this._mNumOfMBChar++;
this._mMBCharLen += this._mCodingSM.getCurrentCharLen();
} else if( c.charCodeAt(0) < 128 ) { // codes higher than 127 are extended ASCII
this._mBasicAsciiLen++;
}
}
}
Expand All @@ -85,7 +88,12 @@ function UTF8Prober() {

this.getConfidence = function() {
var unlike = 0.99;
if( this._mNumOfMBChar < 6 && (this._mMBCharLen / this._mFullLen) <= 0.6 ) {
var mbCharRatio = 0;
var nonBasciAsciiLen = (this._mFullLen - this._mBasicAsciiLen);
if( nonBasciAsciiLen > 0 ) {
mbCharRatio = this._mMBCharLen / nonBasciAsciiLen;
}
if( this._mNumOfMBChar < 6 && mbCharRatio <= 0.6 ) {
for( var i = 0; i < this._mNumOfMBChar; i++ ) {
unlike *= Math.pow(ONE_CHAR_PROB, this._mNumOfMBChar);
}
Expand Down
6 changes: 4 additions & 2 deletions tests/jschardet.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit f88ae28

Please sign in to comment.