Avoid creating JS symbols for symbols only used in dynamic linking #21785

sbc100 · 2024-04-18T19:01:25Z

Symbols that are exported using EMSCRIPTEN_KEEPALIVE are supposed
to be exported to the outside world (i.e. on the Module object) and
also be available to call JS within the module.

Symbols exports for the purposed of dynamic linking so not need to be
exported on the Module and are added (at runtime) to wasmImports which
acts as a kind of global symbol table for the program.

In in the case of -sMAIN_MODULE=1 we export all symbols from
all libraries, and prior to this change it was not possible to
distingish between all the exported generated because of
--export-dynamic, and the exports generated due to
EMSCRIPTEN_KEEPALIVE.

This change allows us to differentiate by running wasm-ld twice: once
without --export-dynamic (to get the smaller list of
EMSCRIPTEN_KEEPALIVE) and then once with --export-dynamic to
produce the actual wasm that we output.

This takes the list of exports that we turn in to JS globals from 7993
to 28, massively reducing the overhead of -sMAIN_MODULE=1.

sbc100 · 2024-04-18T19:04:16Z

After this change the section of the generated file that deal with exmaple export looks like this:

var ___wasm_call_ctors = () => (___wasm_call_ctors = wasmExports['__wasm_call_ctors'])();
var ___wasm_apply_data_relocs = () => (___wasm_apply_data_relocs = wasmExports['__wasm_apply_data_relocs'])();
var _main = Module['_main'] = (a0, a1) => (_main = Module['_main'] = wasmExports['__main_argc_argv'])(a0, a1);
var ___errno_location = () => (___errno_location = wasmExports['__errno_location'])();
var ___dl_seterr = (a0, a1) => (___dl_seterr = wasmExports['__dl_seterr'])(a0, a1);
var _memcpy = (a0, a1, a2) => (_memcpy = wasmExports['memcpy'])(a0, a1, a2);        
var _fileno = (a0) => (_fileno = wasmExports['fileno'])(a0);                     
var _htonl = (a0) => (_htonl = wasmExports['htonl'])(a0);                        
var _htons = (a0) => (_htons = wasmExports['htons'])(a0);                        
var _memcmp = (a0, a1, a2) => (_memcmp = wasmExports['memcmp'])(a0, a1, a2);        
var _ntohs = (a0) => (_ntohs = wasmExports['ntohs'])(a0);                        
var __emscripten_timeout = (a0, a1) => (__emscripten_timeout = wasmExports['_emscripten_timeout'])(a0, a1);
var _malloc = (a0) => (_malloc = wasmExports['malloc'])(a0);                     
var _free = (a0) => (_free = wasmExports['free'])(a0);                           
var _emscripten_builtin_memalign = (a0, a1) => (_emscripten_builtin_memalign = wasmExports['emscripten_builtin_memalign'])(a0, a1);
var _setThrew = (a0, a1) => (_setThrew = wasmExports['setThrew'])(a0, a1);          
var __emscripten_tempret_set = (a0) => (__emscripten_tempret_set = wasmExports['_emscripten_tempret_set'])(a0);
var __emscripten_tempret_get = () => (__emscripten_tempret_get = wasmExports['_emscripten_tempret_get'])();
var ___get_temp_ret = () => (___get_temp_ret = wasmExports['__get_temp_ret'])(); 
var ___set_temp_ret = (a0) => (___set_temp_ret = wasmExports['__set_temp_ret'])(a0);
var __emscripten_stack_restore = (a0) => (__emscripten_stack_restore = wasmExports['_emscripten_stack_restore'])(a0);
var __emscripten_stack_alloc = (a0) => (__emscripten_stack_alloc = wasmExports['_emscripten_stack_alloc'])(a0);
var _emscripten_stack_get_current = () => (_emscripten_stack_get_current = wasmExports['emscripten_stack_get_current'])();
var ___cxa_demangle = (a0, a1, a2, a3) => (___cxa_demangle = wasmExports['__cxa_demangle'])(a0, a1, a2, a3);
var ___cxa_increment_exception_refcount = (a0) => (___cxa_increment_exception_refcount = wasmExports['__cxa_increment_exception_refcount'])(a0);
var ___cxa_decrement_exception_refcount = (a0) => (___cxa_decrement_exception_refcount = wasmExports['__cxa_decrement_exception_refcount'])(a0);
var ___cxa_can_catch = (a0, a1, a2) => (___cxa_can_catch = wasmExports['__cxa_can_catch'])(a0, a1, a2);
var ___cxa_is_pointer_type = (a0) => (___cxa_is_pointer_type = wasmExports['__cxa_is_pointer_type'])(a0);

Prior to this change there are 7993 lines like this:

var ___wasm_call_ctors = () => (___wasm_call_ctors = wasmExports['__wasm_call_ctors'])();
var ___wasm_apply_data_relocs = () => (___wasm_apply_data_relocs = wasmExports['__wasm_apply_data_relocs'])();
var _main = Module['_main'] = (a0, a1) => (_main = Module['_main'] = wasmExports['__main_argc_argv'])(a0, a1);
var _puts = Module['_puts'] = (a0) => (_puts = Module['_puts'] = wasmExports['puts'])(a0);
var _emscripten_GetProcAddress = Module['_emscripten_GetProcAddress'] = (a0) => (_emscripten_GetProcAddress = Module['_emscripten_GetProcAddress'] = wasmExports['emscripten_GetProcAddress'])(a0);
var _strlen = Module['_strlen'] = (a0) => (_strlen = Module['_strlen'] = wasmExports['strlen'])(a0);
var _malloc = (a0) => (_malloc = wasmExports['malloc'])(a0);   
...
ar _orig$_ZNKSt3__24__fs10filesystem6detail12ErrorHandlerIyE6reportERKNS_10error_codeE = Module['_orig$_ZNKSt3__24__fs10filesystem6detail12ErrorHandlerIyE6reportERKNS_10error_codeE'] = (a0, a1) => (_orig$_ZNKSt3__24__fs10filesystem6detail12ErrorHandlerIyE6reportERKNS_10error_codeE = Module['_orig$_ZNKSt3__24__fs10filesystem6detail12ErrorHandlerIyE6reportERKNS_10error_codeE'] = wasmExports['orig$_ZNKSt3__24__fs10filesystem6detail12ErrorHandlerIyE6reportERKNS_10error_codeE'])(a0, a1);
var _orig$_ZNSt3__24__fs10filesystem6detail11error_valueIyEET_v = Module['_orig$_ZNSt3__24__fs10filesystem6detail11error_valueIyEET_v'] = () => (_orig$_ZNSt3__24__fs10filesystem6detail11error_valueIyEET_v = Module['_orig$_ZNSt3__24__fs10filesystem6detail11error_valueIyEET_v'] = wasmExports['orig$_ZNSt3__24__fs10filesystem6detail11error_valueIyEET_v'])();
var _orig$_ZNSt3__24__fs10filesystem17__hard_link_countERKNS1_4pathEPNS_10error_codeE = Module['_orig$_ZNSt3__24__fs10filesystem17__hard_link_countERKNS1_4pathEPNS_10error_codeE'] = (a0, a1) => (_orig$_ZNSt3__24__fs10filesystem17__hard_link_countERKNS1_4pathEPNS_10error_codeE = Module['_orig$_ZNSt3__24__fs10filesystem17__hard_link_countERKNS1_4pathEPNS_10error_codeE'] = wasmExports['orig$_ZNSt3__24__fs10filesystem17__hard_link_countERKNS1_4pathEPNS_10error_codeE'])(a0, a1);
var _orig$_ZNSt3__24__fs10filesystem17__last_write_timeERKNS1_4pathENS_6chrono10time_pointINS1_16_FilesystemClockENS5_8durationInNS_5ratioILx1ELx1000000000EEEEEEEPNS_10error_codeE = Module['_orig$_ZNSt3__24__fs10filesystem17__last_write_timeERKNS1_4pathENS_6chrono10time_pointINS1_16_FilesystemClockENS5_8durationInNS_5ratioILx1ELx1000000000EEEEEEEPNS_10error_codeE'] = (a0, a1, a2, a3) => (_orig$_ZNSt3__24__fs10filesystem17__last_write_timeERKNS1_4pathENS_6chrono10time_pointINS1_16_FilesystemClockENS5_8durationInNS_5ratioILx1ELx1000000000EEEEEEEPNS_10error_codeE = Module['_orig$_ZNSt3__24__fs10filesystem17__last_write_timeERKNS1_4pathENS_6chrono10time_pointINS1_16_FilesystemClockENS5_8durationInNS_5ratioILx1ELx1000000000EEEEEEEPNS_10error_codeE'] = wasmExports['orig$_ZNSt3__24__fs10filesystem17__last_write_timeERKNS1_4pathENS_6chrono10time_pointINS1_16_FilesystemClockENS5_8durationInNS_5ratioILx1ELx1000000000EEEEEEEPNS_10error_codeE'])(a0, a1, a2, a3);
var _orig$_ZNSt3__24__fs10filesystem6detail9time_utilINS_6chrono10time_pointINS1_16_FilesystemClockENS4_8durationInNS_5ratioILx1ELx1000000000EEEEEEEx8timespecE19convert_to_timespecERSC_SB_ = Module['_orig$_ZNSt3__24__fs10filesystem6detail9time_utilINS_6chrono10time_pointINS1_16_FilesystemClockENS4_8durationInNS_5ratioILx1ELx1000000000EEEEEEEx8timespecE19convert_to_timespecERSC_SB_'] = (a0, a1, a2) => (_orig$_ZNSt3__24__fs10filesystem6detail9time_utilINS_6chrono10time_pointINS1_16_FilesystemClockENS4_8durationInNS_5ratioILx1ELx1000000000EEEEEEEx8timespecE19convert_to_timespecERSC_SB_ = Module['_orig$_ZNSt3__24__fs10filesystem6detail9time_utilINS_6chrono10time_pointINS1_16_FilesystemClockENS4_8durationInNS_5ratioILx1ELx1000000000EEEEEEEx8timespecE19convert_to_timespecERSC_SB_'] = wasmExports['orig$_ZNSt3__24__fs10filesystem6detail9time_utilINS_6chrono10time_pointINS1_16_FilesystemClockENS4_8durationInNS_5ratioILx1ELx1000000000EEEEEEEx8timespecE19convert_to_timespecERSC_SB_'])(a0, a1, a2);

Note that not only does it include all these extra symbols, including the very long C++ ones, but that these lines cannot be DCE's because they are exported on the Module object.

sbc100 · 2024-04-18T19:05:32Z

My alternative reason for wanting to do this is to avoid -sMAIN_MODULE=1 polluting the namespace with all the symbols in libc and libc++.. I'm working on change to remove the leading underscore mangling and if we export all symbols we get err from musl exported which conflicts with the err we define in JS.

sbc100 · 2024-04-18T19:12:39Z

Depends on #21784

kripken · 2024-04-18T19:43:42Z

What is the cost of running wasm-ld twice? It seems like in LTO mode that could be very expensive. If so perhaps we can add a flag to wasm-ld to emit the information we need in a single run?

sbc100 · 2024-04-18T20:18:59Z

What is the cost of running wasm-ld twice? It seems like in LTO mode that could be very expensive. If so perhaps we can add a flag to wasm-ld to emit the information we need in a single run?

I added a TODO regarding that. Brenden and I have been looking into ways to allow the linker to preserve this information, but its non-trivial.

The cost of wasm-ld is generally low (compared to say wasm-opt), but you are right that LTO is the exception. However, I think this change is still worth to since it reduces code size my so much, and folks doing LTO are already opting into slowing link times in the hope of getting smaller binaries.

Also, remember that this only effects -sMAIN_MODULE=1 users, and that is really not a recommended configuration.

sbc100 · 2024-04-18T20:19:41Z

Ooops, I noticed that I forgot to upload the patch with the TODO in it.

sbc100 · 2024-04-18T20:20:53Z

The code size saving here is pretty huge for the JS file: 3.5mb -> 0.5mb:

3380583 old.js
1683514 old.wasm
 555139 new.js
1683514 new.wasm

sbc100 · 2024-04-18T20:29:42Z

Actually the impact of link time, even with LTO, is not so great, since the first link is done without --export-dynamic which is much faster:

profiler:INFO:   block "JS symbol generation" took 0.012 seconds
profiler:INFO:   start block "link"
profiler:INFO:     start block "link_lld"
profiler:INFO:     block "link_lld" took 1.542 seconds
profiler:INFO:     start block "link_lld"
profiler:INFO:     block "link_lld" took 12.074 seconds
profiler:INFO:   block "link" took 13.619 seconds

Here you can see the actual link take 12 seconds and the "pre-link" that I do only takes 1.5 (i'm assuming since its only compiling what is needed by the core set of exports and the EMSCRIPTEN_KEEPALIVE functions).

sbc100 · 2024-04-18T20:36:24Z

This change should be much easier to review now that #21784 has landed.

I think it make -sMAIN_MODULE=1 output slightly more practical.. even though this mode still produces huge binaries.

tools/link.py

sbc100 · 2024-04-18T21:03:53Z

Updated. I even remembered to use a separate commit for the feedback !

kripken · 2024-04-18T21:05:06Z

Here you can see the actual link take 12 seconds and the "pre-link" that I do only takes 1.5 (i'm assuming since its only compiling what is needed by the core set of exports and the EMSCRIPTEN_KEEPALIVE functions).

In your example I'm guessing most of the code could be optimized out? But imagine a large game engine where most of it ends up in the final binary either way (tons of indirect calls, little can be DCE'd). Or, imagine one of the various corner cases that cause very slow LTO, like optimizing SQLite for example (unless LLVM fixed that) - it was almost a minute when I last measured it iirc, for just one function.

This change would double such potentially long LTO times, which worries me.

I guess an argument could be that LTO is meant to be slow anyhow, and we can tolerate additional slowdowns for better code, which this provides. But a potentially 2x slowdown to the llvm LTO part still seems quite steep compared to other slowdowns we usually consider. I wonder if we can mitigate or document it somehow.

sbc100 · 2024-04-18T21:07:23Z

Here you can see the actual link take 12 seconds and the "pre-link" that I do only takes 1.5 (i'm assuming since its only compiling what is needed by the core set of exports and the EMSCRIPTEN_KEEPALIVE functions).

In your example I'm guessing most of the code could be optimized out? But imagine a large game engine where most of it ends up in the final binary either way (tons of indirect calls, little can be DCE'd). Or, imagine one of the various corner cases that cause very slow LTO, like optimizing SQLite for example (unless LLVM fixed that) - it was almost a minute when I last measured it iirc, for just one function.

This change would double such potentially long LTO times, which worries me.

I guess an argument could be that LTO is meant to be slow anyhow, and we can tolerate additional slowdowns for better code, which this provides. But a potentially 2x slowdown to the llvm LTO part still seems quite steep compared to other slowdowns we usually consider. I wonder if we can mitigate or document it somehow.

But I don't think there are many users of -sMAIN_MODULE=1 .. and if there are I think they would take this tradeoff, don't you? 6 x smaller JS file in exchange for slower link time?

sbc100 · 2024-04-18T21:08:54Z

I'll write the ChangeLog entry regarding this.. we can revisit if we get reports of folks being negatively effected.

(And hopefully we can fix the TODO when we get time regardless)

kripken · 2024-04-18T23:13:55Z

Yeah, makes sense that such a big code size win might be worth this risk. But I agree that documenting this is safer, as then it's easier for people (and us) to triage future reports of "LTO is slow in this release".

Symbols that are exported using EMSCRIPTEN_KEEPALIVE are supposed to be exported to the outside world (i.e. on the Module object) and also be available to call JS within the module. Symbols exports for the purposed of dynamic linking so not need to be exported on the Module and are added (at runtime) to `wasmImports` which acts as a kind of global symbol table for the program. In in the case of `-sMAIN_MODULE=1` we export *all* symbols from all libraries, and prior to this change it was not possible to distingish between all the exported generated because of `--export-dynamic`, and the exports generated due to `EMSCRIPTEN_KEEPALIVE`. This change allows us to differentiate by running `wasm-ld` twice: once without `--export-dynamic` (to get the smaller list of `EMSCRIPTEN_KEEPALIVE`) and then once with `--export-dynamic` to produce the actual wasm that we output. This takes the list of exports that we turn in to JS globals from 7993 to 28, massively reducing the overhead of `-sMAIN_MODULE=1`.

msqr1 · 2024-04-21T04:29:06Z

Just curious, what is the difference between this and MAIN_MODULE=2?

sbc100 · 2024-04-21T15:21:46Z

-sMAIN_MODULE=2 does DCE (dead code elimination). There are some docs here: https://emscripten.org/docs/compiling/Dynamic-Linking.html#code-size

sbc100 requested a review from brendandahl April 18, 2024 19:05

sbc100 force-pushed the avoid_main_module_exports branch from 0bfea47 to 216552f Compare April 18, 2024 19:12

sbc100 requested a review from kripken April 18, 2024 19:12

sbc100 force-pushed the avoid_main_module_exports branch from 216552f to 5dd1967 Compare April 18, 2024 20:19

sbc100 force-pushed the avoid_main_module_exports branch from 5dd1967 to fcb1066 Compare April 18, 2024 20:34

kripken reviewed Apr 18, 2024

View reviewed changes

tools/link.py Show resolved Hide resolved

tools/link.py Outdated Show resolved Hide resolved

tools/link.py Outdated Show resolved Hide resolved

tools/link.py Outdated Show resolved Hide resolved

sbc100 force-pushed the avoid_main_module_exports branch from fcb1066 to f2db899 Compare April 18, 2024 21:03

sbc100 closed this Apr 18, 2024

sbc100 reopened this Apr 18, 2024

sbc100 force-pushed the avoid_main_module_exports branch from ceacfdf to 6f1a769 Compare April 18, 2024 22:03

kripken approved these changes Apr 18, 2024

View reviewed changes

sbc100 force-pushed the avoid_main_module_exports branch 2 times, most recently from edeb8a4 to a66d170 Compare April 19, 2024 22:21

sbc100 added 3 commits April 20, 2024 15:03

feedback

114247c

update changelog

77c5810

sbc100 added 2 commits April 20, 2024 15:03

fix test

f9fc346

fix

61f9a9e

sbc100 force-pushed the avoid_main_module_exports branch from a66d170 to 61f9a9e Compare April 20, 2024 22:06

sbc100 enabled auto-merge (squash) April 20, 2024 23:17

sbc100 merged commit 11bd2d9 into emscripten-core:main Apr 20, 2024
29 checks passed

sbc100 deleted the avoid_main_module_exports branch April 20, 2024 23:17

hoodmane mentioned this pull request Apr 26, 2024

Question: -sMAIN_MODULE=1 and -sEXPORTED_FUNCTIONS warning no longer valid? #21838

Open

msorvig mentioned this pull request May 2, 2024

Linking with -s MAXIMUM_MEMORY=4GB and -s MODULARIZE=1 is very slow. #21796

Open

hoodmane mentioned this pull request Jun 8, 2024

FIX Fill in GOT entries when libs are loaded with allowUndefined #22053

Open

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Avoid creating JS symbols for symbols only used in dynamic linking #21785

Avoid creating JS symbols for symbols only used in dynamic linking #21785

sbc100 commented Apr 18, 2024

sbc100 commented Apr 18, 2024

sbc100 commented Apr 18, 2024

sbc100 commented Apr 18, 2024

kripken commented Apr 18, 2024

sbc100 commented Apr 18, 2024

sbc100 commented Apr 18, 2024

sbc100 commented Apr 18, 2024

sbc100 commented Apr 18, 2024

sbc100 commented Apr 18, 2024

sbc100 commented Apr 18, 2024

kripken commented Apr 18, 2024

sbc100 commented Apr 18, 2024

sbc100 commented Apr 18, 2024

kripken commented Apr 18, 2024

msqr1 commented Apr 21, 2024

sbc100 commented Apr 21, 2024

Avoid creating JS symbols for symbols only used in dynamic linking #21785

Avoid creating JS symbols for symbols only used in dynamic linking #21785

Conversation

sbc100 commented Apr 18, 2024

sbc100 commented Apr 18, 2024

sbc100 commented Apr 18, 2024

sbc100 commented Apr 18, 2024

kripken commented Apr 18, 2024

sbc100 commented Apr 18, 2024

sbc100 commented Apr 18, 2024

sbc100 commented Apr 18, 2024

sbc100 commented Apr 18, 2024

sbc100 commented Apr 18, 2024

sbc100 commented Apr 18, 2024

kripken commented Apr 18, 2024

sbc100 commented Apr 18, 2024

sbc100 commented Apr 18, 2024

kripken commented Apr 18, 2024

msqr1 commented Apr 21, 2024

sbc100 commented Apr 21, 2024