Fix UTF16ToString in TEXTDECODER=2 and MAXIMUM_MEMORY>=2GB (#24335)

RReverser · web-flow · commit 8f1c0e99e7ac · 2025-05-16T22:16:23.000Z
I accidentally noticed that Embind UTF-16 support was producing
incorrect results with MAXIMUM_MEMORY over 2GB.

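Turns out, the `TextDecoder` implementation of `UTF16ToString` itself
was broken in this mode: it converted between byte pointers and `HEAPU16`
indices with bitwise shifts (`>> 1` / `<< 1`), which operate on signed
32-bit integers in JavaScript and so produce wrong values for pointers at
or above 2GB. I've added new tests and fixed this bug, slightly
simplifying the code in the process.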
diff --git a/src/lib/libstrings.js b/src/lib/libstrings.js
@@ -44,6 +44,7 @@ addToLibrary({
     return UTF8Decoder.decode(heapOrArray.buffer ? {{{ getUnsharedTextDecoderView('heapOrArray', 'idx', 'endPtr') }}} : new Uint8Array(heapOrArray.slice(idx, endPtr)));
 #else // TEXTDECODER == 2
 #if TEXTDECODER
+    // When using conditional TextDecoder, skip it for short strings as the overhead of the native call is not worth it.
     if (endPtr - idx > 16 && heapOrArray.buffer && UTF8Decoder) {
       return UTF8Decoder.decode({{{ getUnsharedTextDecoderView('heapOrArray', 'idx', 'endPtr') }}});
     }
@@ -322,23 +323,22 @@ addToLibrary({
 #if ASSERTIONS
     assert(ptr % 2 == 0, 'Pointer passed to UTF16ToString must be aligned to two bytes!');
 #endif
+    var idx = {{{ getHeapOffset('ptr', 'u16') }}};
+    var maxIdx = idx + maxBytesToRead / 2;
 #if TEXTDECODER
-    var endPtr = ptr;
     // TextDecoder needs to know the byte length in advance, it doesn't stop on
     // null terminator by itself.
     // Also, use the length info to avoid running tiny strings through
     // TextDecoder, since .subarray() allocates garbage.
-    var idx = endPtr >> 1;
-    var maxIdx = idx + maxBytesToRead / 2;
+    var endIdx = idx;
     // If maxBytesToRead is not passed explicitly, it will be undefined, and this
     // will always evaluate to true. This saves on code size.
-    while (!(idx >= maxIdx) && HEAPU16[idx]) ++idx;
-    endPtr = idx << 1;
+    while (!(endIdx >= maxIdx) && HEAPU16[endIdx]) ++endIdx;
 
 #if TEXTDECODER != 2
-    if (endPtr - ptr > 32 && UTF16Decoder)
+    if (endIdx - idx > 16 && UTF16Decoder)
 #endif // TEXTDECODER != 2
-      return UTF16Decoder.decode({{{ getUnsharedTextDecoderView('HEAPU8', 'ptr', 'endPtr') }}});
+      return UTF16Decoder.decode({{{ getUnsharedTextDecoderView('HEAPU16', 'idx', 'endIdx') }}});
 #endif // TEXTDECODER
 
 #if TEXTDECODER != 2
@@ -348,8 +348,8 @@ addToLibrary({
     // If maxBytesToRead is not passed explicitly, it will be undefined, and the
     // for-loop's condition will always evaluate to true. The loop is then
     // terminated on the first null char.
-    for (var i = 0; !(i >= maxBytesToRead / 2); ++i) {
-      var codeUnit = {{{ makeGetValue('ptr', 'i*2', 'i16') }}};
+    for (var i = idx; !(i >= maxIdx); ++i) {
+      var codeUnit = HEAPU16[i];
       if (codeUnit == 0) break;
       // fromCharCode constructs a character from a UTF-16 code unit, so we can
       // pass the UTF16 string right through.
diff --git a/test/code_size/embind_hello_wasm.json b/test/code_size/embind_hello_wasm.json
@@ -2,9 +2,9 @@
   "a.html": 552,
   "a.html.gz": 380,
   "a.js": 8831,
-  "a.js.gz": 3897,
+  "a.js.gz": 3900,
   "a.wasm": 7344,
   "a.wasm.gz": 3368,
   "total": 16727,
-  "total_gz": 7645
+  "total_gz": 7648
 }
diff --git a/test/other/test_unoptimized_code_size.js.size b/test/other/test_unoptimized_code_size.js.size
@@ -1 +1 @@
-53704
+53827
diff --git a/test/other/test_unoptimized_code_size_no_asserts.js.size b/test/other/test_unoptimized_code_size_no_asserts.js.size
@@ -1 +1 @@
-26959
+27082
diff --git a/test/other/test_unoptimized_code_size_strict.js.size b/test/other/test_unoptimized_code_size_strict.js.size
@@ -1 +1 @@
-51754
+51877
diff --git a/test/test_core.py b/test/test_core.py
@@ -339,6 +339,19 @@ def decorated(self, *args, **kwargs):
   return outer_decorator
 
 
+def with_both_text_decoder(f):
+  assert callable(f)
+
+  @wraps(f)
+  def decorated(self, textdecoder, *args, **kwargs):
+    self.set_setting('TEXTDECODER', textdecoder)
+    f(self, *args, **kwargs)
+
+  parameterize(decorated, {'': (0,), 'textdecoder': (2,)})
+
+  return decorated
+
+
 no_minimal_runtime = make_no_decorator_for_setting('MINIMAL_RUNTIME')
 no_safe_heap = make_no_decorator_for_setting('SAFE_HEAP')
 
@@ -5655,40 +5668,43 @@ def test_utime(self):
   def test_futimens(self):
     self.do_runf('utime/test_futimens.c', 'success')
 
+  @with_both_text_decoder
   def test_utf(self):
     self.do_core_test('test_utf.c')
 
+  @with_both_text_decoder
   def test_utf32(self):
     self.do_runf('utf32.cpp', 'OK (long).\n')
 
+  @with_both_text_decoder
   @no_sanitize('requires libc to be built with -fshort-char')
   def test_utf32_short_wchar(self):
     if '-flto' in self.emcc_args or '-flto=thin' in self.emcc_args:
       self.skipTest('-fshort-wchar is not compatible with LTO (libraries would need rebuilting)')
     self.do_runf('utf32.cpp', 'OK (short).\n', emcc_args=['-fshort-wchar'])
 
+  @with_both_text_decoder
   @crossplatform
   def test_utf16(self):
     self.do_runf('core/test_utf16.cpp', 'OK.')
 
+  @with_both_text_decoder
   def test_utf8(self):
     self.do_runf('core/test_utf8.c', 'OK.')
 
+  @with_both_text_decoder
   @also_with_wasm_bigint
-  def test_utf8_textdecoder(self):
+  def test_utf8_bench(self):
     self.emcc_args += ['--embed-file', test_file('utf8_corpus.txt') + '@/utf8_corpus.txt']
     self.do_runf('benchmark/benchmark_utf8.c', 'OK.')
 
   # Test that invalid character in UTF8 does not cause decoding to crash.
+  @with_both_text_decoder
   @also_with_minimal_runtime
-  @parameterized({
-    '': ([],),
-    'textdecoder': (['-sTEXTDECODER'],),
-  })
-  def test_utf8_invalid(self, args):
-    self.do_runf('test_utf8_invalid.c', 'OK.', emcc_args=args)
+  def test_utf8_invalid(self):
+    self.do_runf('test_utf8_invalid.c', 'OK.')
 
-  def test_utf16_textdecoder(self):
+  def test_utf16_bench(self):
     self.emcc_args += ['--embed-file', test_file('utf16_corpus.txt') + '@/utf16_corpus.txt']
     self.do_runf('benchmark/benchmark_utf16.cpp', 'OK.')