Handle split surrogate pairs when encoding utf8 (fixes #250)

Handle the case where the streaming UTF-8 encoder (which converts JS strings to buffers) encounters a surrogate pair split between chunks (the last character of one chunk is a high surrogate and the first character of the next chunk is a low surrogate). Strictly speaking, this is invalid input, but it's nice to handle it anyway if we can.
ashtuchkin · Sep 18, 2021 · 1d8d89e · 1d8d89e
1 parent cab28eb
commit 1d8d89e
Show file tree

Hide file tree

Showing 2 changed files with 63 additions and 5 deletions.
diff --git a/encodings/internal.js b/encodings/internal.js
@@ -28,6 +28,8 @@ function InternalCodec(codecOptions, iconv) {
 
     if (this.enc === "base64")
         this.encoder = InternalEncoderBase64;
+    else if (this.enc === "utf8")
+        this.encoder = InternalEncoderUtf8;
     else if (this.enc === "cesu8") {
         this.enc = "utf8"; // Use utf8 for decoding.
         this.encoder = InternalEncoderCesu8;
@@ -196,3 +198,35 @@ InternalDecoderCesu8.prototype.end = function() {
         res += this.defaultCharUnicode;
     return res;
 }
+
+//------------------------------------------------------------------------------
+// UTF-8 encoder that checks chunk boundaries for split surrogate pairs.
+
+// Streaming UTF-8 encoder: holds back a chunk-final high surrogate until the next chunk arrives.
+function InternalEncoderUtf8(options, codec) {
+    this.highSurrogate = ''; // Unpaired high surrogate carried over from the previous write(), or ''.
+}
+
+// Returns a Buffer with the UTF-8 bytes of `str`, preceded by any held surrogate (may be empty).
+InternalEncoderUtf8.prototype.write = function (str) {
+    if (this.highSurrogate) {
+        str = this.highSurrogate + str;
+        this.highSurrogate = '';
+    }
+    if (str.length > 0) {
+        var lastCode = str.charCodeAt(str.length - 1);
+        if (0xd800 <= lastCode && lastCode < 0xdc00) { // High (leading) surrogate range.
+            this.highSurrogate = str[str.length - 1];
+            str = str.slice(0, -1);
+        }
+    }
+    return Buffer.from(str, 'utf8'); // Not `this.enc`: the constructor never assigns it.
+};
+
+// Flushes a still-unpaired high surrogate as a Buffer; returns undefined when nothing is held.
+InternalEncoderUtf8.prototype.end = function () {
+    if (!this.highSurrogate) return;
+    var held = this.highSurrogate;
+    this.highSurrogate = '';
+    return Buffer.from(held, 'utf8');
+};
diff --git a/test/streams-test.js b/test/streams-test.js
@@ -232,11 +232,6 @@ describe("Streaming mode", function() {
         output: "e4b882",
     }));
 
-    it("Encoding using internal modules: utf8 with surrogates", checkEncodeStream({
-        encoding: "utf8",
-        input: ["\uD83D\uDE3B"],
-        output: "f09f98bb",
-    }));
 
     it("Decoding of incomplete chars in DBCS (gbk)", checkDecodeStream({
         encoding: "gbk",
@@ -331,3 +326,32 @@ describe("Streaming sugar", function() {
     });
 });
 
+// Each case passes its chunks to checkEncodeStream with encoding "utf8" and expects the hex
+// of the bytes Buffer.from() produces for the same chunks joined into a single string.
+describe("Encoding using internal modules with surrogates in separate chunks:", function () {
+    // [test title, input chunks] pairs, registered in this order.
+    var cases = [
+        ["a single string", ["\uD83D\uDE3B"]],
+        ["normal", ["\uD83D", "\uDE3B"]],
+        ["reverse", ["\uDE3B", "\uD83D"]],
+        ["multiple surrogates", ["\uD83D", "\uDE3B\uD83D", "\uDE3B"]],
+        ["more than one character with left", ["abc\uD83D", "\uDE3B"]],
+        ["more than one character with right", ["\uD83D", "\uDE3Befg"]],
+        ["more than one character at both ends", ["abc\uD83D", "\uDE3Befg"]],
+        ["surrogates pair be interrupted", ["abc\uD83D", "efg\uDE3B"]],
+        ["a half of surrogates pair only left", ["abc\uD83D"]],
+        ["a half of surrogates pair only right", ["\uDE3Befg"]],
+    ];
+
+    cases.forEach(function (testCase) {
+        var title = testCase[0];
+        var chunks = testCase[1];
+        var expectedHex = Buffer.from(chunks.join(''), 'utf8').toString('hex');
+
+        it(title, checkEncodeStream({
+            encoding: "utf8",
+            input: chunks,
+            output: expectedHex,
+        }));
+    });
+});