Skip to content

Commit 8f1c0e9

Browse files
authored
Fix UTF16ToString in TEXTDECODER=2 and MAXIMUM_MEMORY>=2GB (#24335)
I accidentally noticed that Embind UTF-16 support was producing incorrect results with MAXIMUM_MEMORY over 2GB. Turns out, the `TextDecoder` implementation of `UTF16ToString` itself was broken in this mode due to bitwise ops, so I've added new tests and fixed this bug, slightly simplifying code in the process.
1 parent 5fd73d6 commit 8f1c0e9

6 files changed

+38
-22
lines changed

src/lib/libstrings.js

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ addToLibrary({
4444
return UTF8Decoder.decode(heapOrArray.buffer ? {{{ getUnsharedTextDecoderView('heapOrArray', 'idx', 'endPtr') }}} : new Uint8Array(heapOrArray.slice(idx, endPtr)));
4545
#else // TEXTDECODER == 2
4646
#if TEXTDECODER
47+
// When using conditional TextDecoder, skip it for short strings as the overhead of the native call is not worth it.
4748
if (endPtr - idx > 16 && heapOrArray.buffer && UTF8Decoder) {
4849
return UTF8Decoder.decode({{{ getUnsharedTextDecoderView('heapOrArray', 'idx', 'endPtr') }}});
4950
}
@@ -322,23 +323,22 @@ addToLibrary({
322323
#if ASSERTIONS
323324
assert(ptr % 2 == 0, 'Pointer passed to UTF16ToString must be aligned to two bytes!');
324325
#endif
326+
var idx = {{{ getHeapOffset('ptr', 'u16') }}};
327+
var maxIdx = idx + maxBytesToRead / 2;
325328
#if TEXTDECODER
326-
var endPtr = ptr;
327329
// TextDecoder needs to know the byte length in advance, it doesn't stop on
328330
// null terminator by itself.
329331
// Also, use the length info to avoid running tiny strings through
330332
// TextDecoder, since .subarray() allocates garbage.
331-
var idx = endPtr >> 1;
332-
var maxIdx = idx + maxBytesToRead / 2;
333+
var endIdx = idx;
333334
// If maxBytesToRead is not passed explicitly, it will be undefined, and this
334335
// will always evaluate to true. This saves on code size.
335-
while (!(idx >= maxIdx) && HEAPU16[idx]) ++idx;
336-
endPtr = idx << 1;
336+
while (!(endIdx >= maxIdx) && HEAPU16[endIdx]) ++endIdx;
337337

338338
#if TEXTDECODER != 2
339-
if (endPtr - ptr > 32 && UTF16Decoder)
339+
if (endIdx - idx > 16 && UTF16Decoder)
340340
#endif // TEXTDECODER != 2
341-
return UTF16Decoder.decode({{{ getUnsharedTextDecoderView('HEAPU8', 'ptr', 'endPtr') }}});
341+
return UTF16Decoder.decode({{{ getUnsharedTextDecoderView('HEAPU16', 'idx', 'endIdx') }}});
342342
#endif // TEXTDECODER
343343

344344
#if TEXTDECODER != 2
@@ -348,8 +348,8 @@ addToLibrary({
348348
// If maxBytesToRead is not passed explicitly, it will be undefined, and the
349349
// for-loop's condition will always evaluate to true. The loop is then
350350
// terminated on the first null char.
351-
for (var i = 0; !(i >= maxBytesToRead / 2); ++i) {
352-
var codeUnit = {{{ makeGetValue('ptr', 'i*2', 'i16') }}};
351+
for (var i = idx; !(i >= maxIdx); ++i) {
352+
var codeUnit = HEAPU16[i];
353353
if (codeUnit == 0) break;
354354
// fromCharCode constructs a character from a UTF-16 code unit, so we can
355355
// pass the UTF16 string right through.

test/code_size/embind_hello_wasm.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22
"a.html": 552,
33
"a.html.gz": 380,
44
"a.js": 8831,
5-
"a.js.gz": 3897,
5+
"a.js.gz": 3900,
66
"a.wasm": 7344,
77
"a.wasm.gz": 3368,
88
"total": 16727,
9-
"total_gz": 7645
9+
"total_gz": 7648
1010
}
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
53704
1+
53827
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
26959
1+
27082
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
51754
1+
51877

test/test_core.py

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,19 @@ def decorated(self, *args, **kwargs):
339339
return outer_decorator
340340

341341

342+
def with_both_text_decoder(f):
343+
assert callable(f)
344+
345+
@wraps(f)
346+
def decorated(self, textdecoder, *args, **kwargs):
347+
self.set_setting('TEXTDECODER', textdecoder)
348+
f(self, *args, **kwargs)
349+
350+
parameterize(decorated, {'': (0,), 'textdecoder': (2,)})
351+
352+
return decorated
353+
354+
342355
no_minimal_runtime = make_no_decorator_for_setting('MINIMAL_RUNTIME')
343356
no_safe_heap = make_no_decorator_for_setting('SAFE_HEAP')
344357

@@ -5655,40 +5668,43 @@ def test_utime(self):
56555668
def test_futimens(self):
56565669
self.do_runf('utime/test_futimens.c', 'success')
56575670

5671+
@with_both_text_decoder
56585672
def test_utf(self):
56595673
self.do_core_test('test_utf.c')
56605674

5675+
@with_both_text_decoder
56615676
def test_utf32(self):
56625677
self.do_runf('utf32.cpp', 'OK (long).\n')
56635678

5679+
@with_both_text_decoder
56645680
@no_sanitize('requires libc to be built with -fshort-char')
56655681
def test_utf32_short_wchar(self):
56665682
if '-flto' in self.emcc_args or '-flto=thin' in self.emcc_args:
56675683
self.skipTest('-fshort-wchar is not compatible with LTO (libraries would need rebuilting)')
56685684
self.do_runf('utf32.cpp', 'OK (short).\n', emcc_args=['-fshort-wchar'])
56695685

5686+
@with_both_text_decoder
56705687
@crossplatform
56715688
def test_utf16(self):
56725689
self.do_runf('core/test_utf16.cpp', 'OK.')
56735690

5691+
@with_both_text_decoder
56745692
def test_utf8(self):
56755693
self.do_runf('core/test_utf8.c', 'OK.')
56765694

5695+
@with_both_text_decoder
56775696
@also_with_wasm_bigint
5678-
def test_utf8_textdecoder(self):
5697+
def test_utf8_bench(self):
56795698
self.emcc_args += ['--embed-file', test_file('utf8_corpus.txt') + '@/utf8_corpus.txt']
56805699
self.do_runf('benchmark/benchmark_utf8.c', 'OK.')
56815700

56825701
# Test that invalid character in UTF8 does not cause decoding to crash.
5702+
@with_both_text_decoder
56835703
@also_with_minimal_runtime
5684-
@parameterized({
5685-
'': ([],),
5686-
'textdecoder': (['-sTEXTDECODER'],),
5687-
})
5688-
def test_utf8_invalid(self, args):
5689-
self.do_runf('test_utf8_invalid.c', 'OK.', emcc_args=args)
5704+
def test_utf8_invalid(self):
5705+
self.do_runf('test_utf8_invalid.c', 'OK.')
56905706

5691-
def test_utf16_textdecoder(self):
5707+
def test_utf16_bench(self):
56925708
self.emcc_args += ['--embed-file', test_file('utf16_corpus.txt') + '@/utf16_corpus.txt']
56935709
self.do_runf('benchmark/benchmark_utf16.cpp', 'OK.')
56945710

0 commit comments

Comments
 (0)