Skip to content

Commit

Permalink
Convert dbcs codec and tests (#256)
Browse files Browse the repository at this point in the history
  • Loading branch information
gyzerok authored Jul 26, 2020
1 parent a1bd8f7 commit 5d99a92
Show file tree
Hide file tree
Showing 8 changed files with 673 additions and 628 deletions.
1,014 changes: 513 additions & 501 deletions encodings/dbcs-codec.js

Large diffs are not rendered by default.

File renamed without changes.
41 changes: 41 additions & 0 deletions generation/gen-gbk-big5-fixtures.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"use strict";

const Iconv = require("iconv").Iconv,
fs = require("fs"),
path = require("path"),
utils = require("../test/utils");

// Regenerate test/fixtures/gbk-big5.json, the fixture file the GBK and Big5
// tests load instead of calling node-iconv at test time.
const outputFile = path.resolve(__dirname, "..", "test", "fixtures", "gbk-big5.json");

// One entry per encoding; each holds the encoded bytes (as produced by
// utils.hex) and the text node-iconv decoded them to.
const fixtures = { big5: big5(), gbk: gbk() };

const serialized = JSON.stringify(fixtures);
fs.writeFileSync(outputFile, serialized);

/**
 * Builds the GBK fixture from the sample file fixtures/gbkFile.txt located
 * next to this script.
 *
 * @returns {{bytes: *, string: string}} `bytes` is the raw file content passed
 *   through utils.hex; `string` is the text node-iconv decodes it to.
 */
function gbk() {
    const samplePath = path.resolve(__dirname, "fixtures", "gbkFile.txt");
    const sampleBytes = fs.readFileSync(samplePath);

    // node-iconv is the reference decoder: GBK input, UTF-8 output.
    const expectedText = Iconv("GBK", "utf8").convert(sampleBytes).toString();

    // NOTE(review): the meaning of the second utils.hex argument is not visible here.
    const encodedBytes = utils.hex(sampleBytes, true);
    return { bytes: encodedBytes, string: expectedText };
}

/**
 * Builds the Big5 fixture from an embedded, base64-encoded sample document.
 *
 * @returns {{bytes: *, string: string}} `bytes` is the decoded sample passed
 *   through utils.hex; `string` is the text node-iconv decodes it to.
 */
function big5() {
    // Base64 of a small Big5-encoded HTML page (the same payload the old test inlined).
    const sampleBase64 =
        "PEhUTUw+DQo8SEVBRD4gICAgDQoJPFRJVExFPiBtZXRhILzQxdKquqjPpc6hR6SkpOW69K22IDwvVElUTEU+DQoJPG1ldGEgSFRUUC1FUVVJVj0iQ29udGVudC1UeXBlIiBDT05URU5UPSJ0ZXh0L2h0bWw7IGNoYXJzZXQ9YmlnNSI+DQo8L0hFQUQ+DQo8Qk9EWT4NCg0Ks2+sT6RArdPBY8XppKSk5br0rbahSTxicj4NCihUaGlzIHBhZ2UgdXNlcyBiaWc1IGNoYXJhY3RlciBzZXQuKTxicj4NCmNoYXJzZXQ9YmlnNQ0KDQo8L0JPRFk+DQo8L0hUTUw+";
    const sampleBytes = Buffer.from(sampleBase64, "base64");

    // node-iconv is the reference decoder: Big5 input, UTF-8 output.
    const expectedText = Iconv("big5", "utf8").convert(sampleBytes).toString();

    // NOTE(review): the meaning of the second utils.hex argument is not visible here.
    const encodedBytes = utils.hex(sampleBytes, true);
    return { bytes: encodedBytes, string: expectedText };
}
65 changes: 31 additions & 34 deletions test/big5-test.js
Original file line number Diff line number Diff line change
@@ -1,71 +1,68 @@
"use strict";

var assert = require("assert"),
Buffer = require("safer-buffer").Buffer,
iconv = require("../");
const assert = require("assert"),
utils = require("./utils"),
fixtures = require("./fixtures/gbk-big5.json"),
iconv = utils.requireIconv();

var testString = "中文abc", //unicode contains Big5-code and ascii
testStringBig5Buffer = Buffer.from([0xa4, 0xa4, 0xa4, 0xe5, 0x61, 0x62, 0x63]),
const testString = "中文abc", //unicode contains Big5-code and ascii
testStringBig5Buffer = utils.bytes("a4 a4 a4 e5 61 62 63"),
testString2 = "測試",
testStringBig5Buffer2 = Buffer.from([0xb4, 0xfa, 0xb8, 0xd5]);
testStringBig5Buffer2 = utils.bytes("b4 fa b8 d5");

describe("Big5 tests", function () {
describe("Big5 tests #node-web", function () {
it("Big5 correctly encoded/decoded", function () {
assert.strictEqual(
iconv.encode(testString, "big5").toString("hex"),
testStringBig5Buffer.toString("hex")
utils.hex(iconv.encode(testString, "big5")),
utils.hex(testStringBig5Buffer)
);
assert.strictEqual(iconv.decode(testStringBig5Buffer, "big5"), testString);
assert.strictEqual(
iconv.encode(testString2, "big5").toString("hex"),
testStringBig5Buffer2.toString("hex")
utils.hex(iconv.encode(testString2, "big5")),
utils.hex(testStringBig5Buffer2)
);
assert.strictEqual(iconv.decode(testStringBig5Buffer2, "big5"), testString2);
});

it("cp950 correctly encoded/decoded", function () {
assert.strictEqual(
iconv.encode(testString, "cp950").toString("hex"),
testStringBig5Buffer.toString("hex")
utils.hex(iconv.encode(testString, "cp950")),
utils.hex(testStringBig5Buffer)
);
assert.strictEqual(iconv.decode(testStringBig5Buffer, "cp950"), testString);
});

it("Big5 file read decoded,compare with iconv result", function () {
var contentBuffer = Buffer.from(
"PEhUTUw+DQo8SEVBRD4gICAgDQoJPFRJVExFPiBtZXRhILzQxdKquqjPpc6hR6SkpOW69K22IDwvVElUTEU+DQoJPG1ldGEgSFRUUC1FUVVJVj0iQ29udGVudC1UeXBlIiBDT05URU5UPSJ0ZXh0L2h0bWw7IGNoYXJzZXQ9YmlnNSI+DQo8L0hFQUQ+DQo8Qk9EWT4NCg0Ks2+sT6RArdPBY8XppKSk5br0rbahSTxicj4NCihUaGlzIHBhZ2UgdXNlcyBiaWc1IGNoYXJhY3RlciBzZXQuKTxicj4NCmNoYXJzZXQ9YmlnNQ0KDQo8L0JPRFk+DQo8L0hUTUw+",
"base64"
);
var str = iconv.decode(contentBuffer, "big5");
var iconvc = new (require("iconv").Iconv)("big5", "utf8");
assert.strictEqual(iconvc.convert(contentBuffer).toString(), str);
const contentBuffer = utils.bytes(fixtures.big5.bytes);
const str = iconv.decode(contentBuffer, "big5");
assert.strictEqual(fixtures.big5.string, str);
});

it("Big5 correctly decodes and encodes characters · and ×", function () {
// https://github.com/ashtuchkin/iconv-lite/issues/13
// Reference: http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP950.TXT
var chars = "·×";
var big5Chars = Buffer.from([0xa1, 0x50, 0xa1, 0xd1]);
assert.strictEqual(iconv.encode(chars, "big5").toString("hex"), big5Chars.toString("hex"));
const chars = "·×";
const big5Chars = utils.bytes("a1 50 a1 d1");
assert.strictEqual(utils.hex(iconv.encode(chars, "big5")), utils.hex(big5Chars));
assert.strictEqual(iconv.decode(big5Chars, "big5"), chars);
});

it("Big5 correctly encodes & decodes sequences", function () {
assert.strictEqual(iconv.encode("\u00CA\u0304", "big5").toString("hex"), "8862");
assert.strictEqual(iconv.encode("\u00EA\u030C", "big5").toString("hex"), "88a5");
assert.strictEqual(iconv.encode("\u00CA", "big5").toString("hex"), "8866");
assert.strictEqual(iconv.encode("\u00CA\u00CA", "big5").toString("hex"), "88668866");
assert.strictEqual(utils.hex(iconv.encode("\u00CA\u0304", "big5")), "88 62");
assert.strictEqual(utils.hex(iconv.encode("\u00EA\u030C", "big5")), "88 a5");
assert.strictEqual(utils.hex(iconv.encode("\u00CA", "big5")), "88 66");
assert.strictEqual(utils.hex(iconv.encode("\u00CA\u00CA", "big5")), "88 66 88 66");

assert.strictEqual(iconv.encode("\u00CA\uD800", "big5").toString("hex"), "88663f"); // Unfinished surrogate.
assert.strictEqual(iconv.encode("\u00CA\uD841\uDD47", "big5").toString("hex"), "8866fa40"); // Finished surrogate ('𠕇').
assert.strictEqual(iconv.encode("\u00CA𠕇", "big5").toString("hex"), "8866fa40"); // Finished surrogate ('𠕇').
assert.strictEqual(utils.hex(iconv.encode("\u00CA\uD800", "big5")), "88 66 3f"); // Unfinished surrogate.
assert.strictEqual(utils.hex(iconv.encode("\u00CA\uD841\uDD47", "big5")), "88 66 fa 40"); // Finished surrogate ('𠕇').
assert.strictEqual(utils.hex(iconv.encode("\u00CA𠕇", "big5")), "88 66 fa 40"); // Finished surrogate ('𠕇').

assert.strictEqual(iconv.decode(Buffer.from("8862", "hex"), "big5"), "\u00CA\u0304");
assert.strictEqual(iconv.decode(Buffer.from("8866", "hex"), "big5"), "\u00CA");
assert.strictEqual(iconv.decode(Buffer.from("8866fa40", "hex"), "big5"), "\u00CA𠕇");
assert.strictEqual(iconv.decode(utils.bytes("88 62"), "big5"), "\u00CA\u0304");
assert.strictEqual(iconv.decode(utils.bytes("88 66"), "big5"), "\u00CA");
assert.strictEqual(iconv.decode(utils.bytes("88 66 fa 40"), "big5"), "\u00CA𠕇");
});

it("Big5 correctly encodes 十", function () {
assert.strictEqual(iconv.encode("十", "big5").toString("hex"), "a451");
assert.strictEqual(utils.hex(iconv.encode("十", "big5")), "a4 51");
});
});
10 changes: 10 additions & 0 deletions test/fixtures/gbk-big5.json

Large diffs are not rendered by default.

130 changes: 56 additions & 74 deletions test/gbk-test.js
Original file line number Diff line number Diff line change
@@ -1,55 +1,51 @@
"use strict";

var fs = require("fs"),
assert = require("assert"),
Buffer = require("safer-buffer").Buffer,
iconv = require("../");
const assert = require("assert"),
utils = require("./utils"),
fixtures = require("./fixtures/gbk-big5.json"),
iconv = utils.requireIconv();

var testString = "中国abc", //unicode contains GBK-code and ascii
testStringGBKBuffer = Buffer.from([0xd6, 0xd0, 0xb9, 0xfa, 0x61, 0x62, 0x63]);
const testString = "中国abc", //unicode contains GBK-code and ascii
testStringGBKBuffer = utils.bytes("d6 d0 b9 fa 61 62 63");

describe("GBK tests", function () {
describe("GBK tests #node-web", function () {
it("GBK correctly encoded/decoded", function () {
assert.strictEqual(
iconv.encode(testString, "GBK").toString("binary"),
testStringGBKBuffer.toString("binary")
utils.hex(iconv.encode(testString, "GBK")),
utils.hex(testStringGBKBuffer)
);
assert.strictEqual(iconv.decode(testStringGBKBuffer, "GBK"), testString);
});

it("GB2312 correctly encoded/decoded", function () {
assert.strictEqual(
iconv.encode(testString, "GB2312").toString("binary"),
testStringGBKBuffer.toString("binary")
utils.hex(iconv.encode(testString, "GB2312")),
utils.hex(testStringGBKBuffer)
);
assert.strictEqual(iconv.decode(testStringGBKBuffer, "GB2312"), testString);
});

it("GBK file read decoded,compare with iconv result", function () {
var contentBuffer = fs.readFileSync(__dirname + "/gbkFile.txt");
var str = iconv.decode(contentBuffer, "GBK");
var iconvc = new (require("iconv").Iconv)("GBK", "utf8");
assert.strictEqual(iconvc.convert(contentBuffer).toString(), str);
const contentBuffer = utils.bytes(fixtures.gbk.bytes);
const str = iconv.decode(contentBuffer, "GBK");
assert.strictEqual(fixtures.gbk.string, str);
});

it("GBK correctly decodes and encodes characters · and ×", function () {
// https://github.com/ashtuchkin/iconv-lite/issues/13
// Reference: http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT
var chars = "·×";
var gbkChars = Buffer.from([0xa1, 0xa4, 0xa1, 0xc1]);
assert.strictEqual(
iconv.encode(chars, "GBK").toString("binary"),
gbkChars.toString("binary")
);
const chars = "·×";
const gbkChars = utils.bytes("a1 a4 a1 c1");
assert.strictEqual(utils.hex(iconv.encode(chars, "GBK")), utils.hex(gbkChars));
assert.strictEqual(iconv.decode(gbkChars, "GBK"), chars);
});

it("GBK and GB18030 correctly decodes and encodes Euro character", function () {
// Euro character (U+20AC) has two encodings in GBK family: 0x80 and 0xA2 0xE3
// According to W3C's technical recommendation (https://www.w3.org/TR/encoding/#gbk-encoder),
// Both GBK and GB18030 decoders should accept both encodings.
var gbkEuroEncoding1 = Buffer.from([0x80]),
gbkEuroEncoding2 = Buffer.from([0xa2, 0xe3]),
const gbkEuroEncoding1 = utils.bytes("80"),
gbkEuroEncoding2 = utils.bytes("a2 e3"),
strEuro = "€";

assert.strictEqual(iconv.decode(gbkEuroEncoding1, "GBK"), strEuro);
Expand All @@ -58,13 +54,10 @@ describe("GBK tests", function () {
assert.strictEqual(iconv.decode(gbkEuroEncoding2, "GB18030"), strEuro);

// But when encoding, GBK should produce 0x80, while GB18030 should produce 0xA2 0xE3.
assert.strictEqual(utils.hex(iconv.encode(strEuro, "GBK")), utils.hex(gbkEuroEncoding1));
assert.strictEqual(
iconv.encode(strEuro, "GBK").toString("hex"),
gbkEuroEncoding1.toString("hex")
);
assert.strictEqual(
iconv.encode(strEuro, "GB18030").toString("hex"),
gbkEuroEncoding2.toString("hex")
utils.hex(iconv.encode(strEuro, "GB18030")),
utils.hex(gbkEuroEncoding2)
);
});

Expand Down Expand Up @@ -92,65 +85,54 @@ describe("GBK tests", function () {
);
});

/**
 * Swaps the two bytes of every 16-bit unit in `buf`, in place.
 *
 * @param {Buffer} buf Buffer to byte-swap; it is modified directly.
 * @returns {Buffer} The same `buf` instance, for chaining.
 */
function swapBytes(buf) {
    let offset = 0;
    while (offset < buf.length) {
        // Read the pair big-endian and write it back little-endian at the same offset.
        const unit = buf.readUInt16BE(offset);
        buf.writeUInt16LE(unit, offset);
        offset += 2;
    }
    return buf;
}
/**
 * Inserts a space after every run of four characters, then trims the ends.
 *
 * @param {string} str Text to group, e.g. a hex dump of 16-bit units.
 * @returns {string} The grouped text without leading or trailing whitespace.
 */
function spacify4(str) {
    // "$&" is the whole match, i.e. the four characters just consumed.
    const grouped = str.replace(/.{4}/g, "$& ");
    return grouped.trim();
}
/**
 * Renders a string as space-separated groups of four hex digits, one group
 * per UTF-16 code unit, so failed assertions show readable code units.
 *
 * @param {string} str Text to render.
 * @returns {string} Hex groups such as "00ca 0304".
 */
function strToHex(str) {
    // "ucs2" yields little-endian units; swap them so the hex reads most-significant byte first.
    const codeUnits = Buffer.from(str, "ucs2");
    const hexDigits = swapBytes(codeUnits).toString("hex");
    return spacify4(hexDigits);
}

it("GB18030 encodes/decodes 4 byte sequences", function () {
var chars = {
"\u0080": Buffer.from([0x81, 0x30, 0x81, 0x30]),
"\u0081": Buffer.from([0x81, 0x30, 0x81, 0x31]),
"\u008b": Buffer.from([0x81, 0x30, 0x82, 0x31]),
"\u0615": Buffer.from([0x81, 0x31, 0x82, 0x31]),
㦟: Buffer.from([0x82, 0x31, 0x82, 0x31]),
"\udbd9\ude77": Buffer.from([0xe0, 0x31, 0x82, 0x31]),
const chars = {
"\u0080": utils.bytes("81 30 81 30"),
"\u0081": utils.bytes("81 30 81 31"),
"\u008b": utils.bytes("81 30 82 31"),
"\u0615": utils.bytes("81 31 82 31"),
㦟: utils.bytes("82 31 82 31"),
"\udbd9\ude77": utils.bytes("e0 31 82 31"),
};
for (var uChar in chars) {
var gbkBuf = chars[uChar];
for (const uChar in chars) {
const gbkBuf = chars[uChar];
assert.strictEqual(utils.hex(iconv.encode(uChar, "GB18030")), utils.hex(gbkBuf));
assert.strictEqual(
iconv.encode(uChar, "GB18030").toString("hex"),
gbkBuf.toString("hex")
utils.strToHex(iconv.decode(gbkBuf, "GB18030")),
utils.strToHex(uChar)
);
assert.strictEqual(strToHex(iconv.decode(gbkBuf, "GB18030")), strToHex(uChar));
}
});

it("GB18030 correctly decodes incomplete 4 byte sequences", function () {
var chars = {
"�": Buffer.from([0x82]),
"�1": Buffer.from([0x82, 0x31]),
"�1�": Buffer.from([0x82, 0x31, 0x82]),
㦟: Buffer.from([0x82, 0x31, 0x82, 0x31]),
"� ": Buffer.from([0x82, 0x20]),
"�1 ": Buffer.from([0x82, 0x31, 0x20]),
"�1� ": Buffer.from([0x82, 0x31, 0x82, 0x20]),
"\u399f ": Buffer.from([0x82, 0x31, 0x82, 0x31, 0x20]),
"�1\u4fdb": Buffer.from([0x82, 0x31, 0x82, 0x61]),
"�1\u5010\u0061": Buffer.from([0x82, 0x31, 0x82, 0x82, 0x61]),
㦟俛: Buffer.from([0x82, 0x31, 0x82, 0x31, 0x82, 0x61]),
"�1\u50101�1": Buffer.from([0x82, 0x31, 0x82, 0x82, 0x31, 0x82, 0x31]),
const chars = {
"�": utils.bytes("82"),
"�1": utils.bytes("82 31"),
"�1�": utils.bytes("82 31 82"),
㦟: utils.bytes("82 31 82 31"),
"� ": utils.bytes("82 20"),
"�1 ": utils.bytes("82 31 20"),
"�1� ": utils.bytes("82 31 82 20"),
"\u399f ": utils.bytes("82 31 82 31 20"),
"�1\u4fdb": utils.bytes("82 31 82 61"),
"�1\u5010\u0061": utils.bytes("82 31 82 82 61"),
㦟俛: utils.bytes("82 31 82 31 82 61"),
"�1\u50101�1": utils.bytes("82 31 82 82 31 82 31"),
};
for (var uChar in chars) {
var gbkBuf = chars[uChar];
assert.strictEqual(strToHex(iconv.decode(gbkBuf, "GB18030")), strToHex(uChar));
for (const uChar in chars) {
const gbkBuf = chars[uChar];
assert.strictEqual(
utils.strToHex(iconv.decode(gbkBuf, "GB18030")),
utils.strToHex(uChar)
);
}
});

it("GB18030:2005 changes are applied", function () {
// See https://github.com/whatwg/encoding/issues/22
var chars = "\u1E3F\u0000\uE7C7"; // Use \u0000 as separator
var gbkChars = Buffer.from([0xa8, 0xbc, 0x00, 0x81, 0x35, 0xf4, 0x37]);
const chars = "\u1E3F\u0000\uE7C7"; // Use \u0000 as separator
const gbkChars = utils.bytes("a8 bc 00 81 35 f4 37");
assert.strictEqual(iconv.decode(gbkChars, "GB18030"), chars);
assert.strictEqual(
iconv.encode(chars, "GB18030").toString("hex"),
gbkChars.toString("hex")
);
assert.strictEqual(utils.hex(iconv.encode(chars, "GB18030")), utils.hex(gbkChars));
});
});
Loading

0 comments on commit 5d99a92

Please sign in to comment.