From 9faaa66e8b94e783c46898552c3eb873c26858ff Mon Sep 17 00:00:00 2001 From: Eduardo Bart Date: Wed, 15 Nov 2023 17:32:33 -0300 Subject: [PATCH] Optimize coroutine implementation to use less physical memory --- lib/detail/minicoro.nelua | 204 ++++++++++++++++++++++++++------------ 1 file changed, 138 insertions(+), 66 deletions(-) diff --git a/lib/detail/minicoro.nelua b/lib/detail/minicoro.nelua index 23177894..bdb89c19 100644 --- a/lib/detail/minicoro.nelua +++ b/lib/detail/minicoro.nelua @@ -33,8 +33,9 @@ local minicoro.Coro: type = @record{ func: function(*minicoro.Coro): void, prev_co: *minicoro.Coro, user_data: pointer, + coro_size: csize, allocator_data: pointer, - free_cb: function(pointer, pointer): void, + dealloc_cb: function(pointer, csize, pointer): void, stack_base: pointer, stack_size: csize, storage: *cuchar, @@ -48,8 +49,8 @@ local minicoro.Coro: type = @record{ local minicoro.Desc: type = @record{ func: function(*minicoro.Coro): void, user_data: pointer, - malloc_cb: function(csize, pointer): pointer, - free_cb: function(pointer, pointer): void, + alloc_cb: function(csize, pointer): pointer, + dealloc_cb: function(pointer, csize, pointer): void, allocator_data: pointer, storage_size: csize, coro_size: csize, @@ -79,12 +80,15 @@ local minicoro_heading = [==[ #define MCO_LOG(x) #define MCO_API static #define MCO_NO_DEBUG +#if defined(__linux) || defined(_WIN32) +#define MCO_USE_VMEM_ALLOCATOR +#endif #define MCO_DEFAULT_STACK_SIZE 256*1024 ]==] local minicoro_code = minicoro_heading..[==[ /* Minimal asymmetric stackful cross-platform coroutine library in pure C. -minicoro - v0.1.3 - 27/Jan/2022 +minicoro - v0.2.0 - 15/Nov/2023 Eduardo Bart - edub4rt@gmail.com https://github.com/edubart/minicoro @@ -98,6 +102,7 @@ The API is inspired by Lua coroutines but with C use in mind. - Supports custom allocators. - Storage system to allow passing values between yield and resume. - Customizable stack size. 
+- Supports growable stacks and low memory footprint when enabling the virtual memory allocator. - Coroutine API design inspired by Lua with C use in mind. - Yield across any C function. - Made to work in multithread applications. @@ -133,12 +138,13 @@ to create, resume, yield or destroy a coroutine. # Caveats -- Don't use coroutines with C++ exceptions, this is not supported. +- Avoid using coroutines with C++ exceptions; this is not recommended and may not behave as you expect. - When using C++ RAII (i.e. destructors) you must resume the coroutine until it dies to properly execute all destructors. -- To use in multithread applications, you must compile with C compiler that supports `thread_local` qualifier. - Some unsupported sanitizers for C may trigger false warnings when using coroutines. -- The `mco_coro` object is not thread safe, you should lock each coroutine into a thread. -- Stack space is fixed, it cannot grow. By default it has about 56KB of space, this can be changed on coroutine creation. +- The `mco_coro` object is not thread safe; you should use a mutex when manipulating it in multithread applications. +- To use in multithread applications, you must compile with a C compiler that supports the `thread_local` qualifier. +- Avoid using `thread_local` inside coroutine code; the compiler may cache pointers to thread-local variables, which can become invalid when a coroutine switches threads. +- Stack space is limited. By default it has 56KB of space; this can be changed on coroutine creation, or by enabling the virtual memory backed allocator to make it 2040KB. - Take care to not cause stack overflows (run out of stack space), otherwise your program may crash or not, the behavior is undefined. - On WebAssembly you must compile with Emscripten flag `-s ASYNCIFY=1`. 
- The WebAssembly Binaryen asyncify method can be used when explicitly enabled, @@ -192,6 +198,7 @@ The following simple example demonstrates on how to use the library: #define MINICORO_IMPL #include "minicoro.h" #include <stdio.h> +#include <assert.h> // Coroutine entry function. void coro_entry(mco_coro* co) { @@ -252,21 +259,61 @@ an error. The library return error codes in most of its API in case of misuse or system error, the user is encouraged to handle them properly. +## Virtual memory backed allocator + +The new compile time option `MCO_USE_VMEM_ALLOCATOR` enables a virtual memory backed allocator. + +Every stackful coroutine usually has to reserve memory for its full stack, +this typically makes the total memory usage very high when allocating thousands of coroutines, +for example, an application with 100 thousand coroutines with stacks of 56KB would consume as much +as 5GB of memory, however your application may not really need full stack usage for every coroutine. + +Some developers often prefer stackless coroutines over stackful coroutines +because of this problem, stackless memory footprint is low, therefore often considered more lightweight. +However, stackless coroutines have many other limitations, like you cannot run unconstrained code inside them. + +One remedy to this problem is to make stackful coroutines growable, +to only use physical memory on demand when it's really needed, +and there is a nice way to do this relying on virtual memory allocation +when supported by the operating system. + +The virtual memory backed allocator will reserve virtual memory in the OS for each coroutine stack, +but not trigger real physical memory usage yet. +While the application virtual memory usage will be high, +the physical memory usage will be low and actually grow on demand (usually every 4KB chunk in Linux). 
+ +The virtual memory backed allocator also raises the default stack size to about 2MB, +typically the size of extra threads in Linux, +so you have more space in your coroutines and the risk of stack overflow is low. + +As an example, allocating 100 thousand coroutines with nearly 2MB stack reserved space +with the virtual memory allocator uses 783MB of physical memory, that is about 8KB per coroutine, +however the virtual memory usage will be at 98GB. + +It is recommended to enable this option only if you plan to spawn thousands of coroutines +while wanting to have a low memory footprint. +Not all environments have an OS with virtual memory support, therefore this option is disabled by default. + +This option may add an order of magnitude overhead to `mco_create()`/`mco_destroy()`, +because they will request the OS to manage virtual memory page tables, +if this is a problem for you, please provide a custom allocator for your own needs. + ## Library customization The following can be defined to change the library behavior: - `MCO_API` - Public API qualifier. Default is `extern`. -- `MCO_MIN_STACK_SIZE` - Minimum stack size when creating a coroutine. Default is 32768. +- `MCO_MIN_STACK_SIZE` - Minimum stack size when creating a coroutine. Default is 32768 (32KB). - `MCO_DEFAULT_STORAGE_SIZE` - Size of coroutine storage buffer. Default is 1024. -- `MCO_DEFAULT_STACK_SIZE` - Default stack size when creating a coroutine. Default is 57344. -- `MCO_MALLOC` - Default allocation function. Default is `malloc`. -- `MCO_FREE` - Default deallocation function. Default is `free`. +- `MCO_DEFAULT_STACK_SIZE` - Default stack size when creating a coroutine. Default is 57344 (56KB). When `MCO_USE_VMEM_ALLOCATOR` is defined, the default is 2040KB (nearly 2MB). +- `MCO_ALLOC` - Default allocation function. Default is `calloc`. +- `MCO_DEALLOC` - Default deallocation function. Default is `free`. 
+- `MCO_USE_VMEM_ALLOCATOR` - Use virtual memory backed allocator, improving memory footprint per coroutine. +- `MCO_NO_DEFAULT_ALLOCATOR` - Disable the default allocator using `MCO_ALLOC` and `MCO_DEALLOC`. +- `MCO_ZERO_MEMORY` - Zero memory of stack when poping storage, intended for garbage collected environments. - `MCO_DEBUG` - Enable debug mode, logging any runtime error to stdout. Defined automatically unless `NDEBUG` or `MCO_NO_DEBUG` is defined. - `MCO_NO_DEBUG` - Disable debug mode. - `MCO_NO_MULTITHREAD` - Disable multithread usage. Multithread is supported when `thread_local` is supported. -- `MCO_NO_DEFAULT_ALLOCATORS` - Disable the default allocator using `MCO_MALLOC` and `MCO_FREE`. -- `MCO_ZERO_MEMORY` - Zero memory of stack for new coroutines and when poping storage, intended for garbage collected environments. - `MCO_USE_ASM` - Force use of assembly context switch implementation. - `MCO_USE_UCONTEXT` - Force use of ucontext context switch implementation. - `MCO_USE_FIBERS` - Force use of fibers context switch implementation. @@ -322,7 +369,7 @@ typedef enum mco_result { MCO_OUT_OF_MEMORY, MCO_INVALID_ARGUMENTS, MCO_INVALID_OPERATION, - MCO_STACK_OVERFLOW, + MCO_STACK_OVERFLOW } mco_result; /* Coroutine structure. */ @@ -333,8 +380,9 @@ struct mco_coro { void (*func)(mco_coro* co); mco_coro* prev_co; void* user_data; + size_t coro_size; void* allocator_data; - void (*free_cb)(void* ptr, void* allocator_data); + void (*dealloc_cb)(void* ptr, size_t size, void* allocator_data); void* stack_base; /* Stack base address, can be used to scan memory in a garbage collector. */ size_t stack_size; unsigned char* storage; @@ -351,9 +399,9 @@ typedef struct mco_desc { void (*func)(mco_coro* co); /* Entry point function for the coroutine. */ void* user_data; /* Coroutine user data, can be get with `mco_get_user_data`. */ /* Custom allocation interface. */ - void* (*malloc_cb)(size_t size, void* allocator_data); /* Custom allocation function. 
*/ - void (*free_cb)(void* ptr, void* allocator_data); /* Custom deallocation function. */ - void* allocator_data; /* User data pointer passed to `malloc`/`free` allocation functions. */ + void* (*alloc_cb)(size_t size, void* allocator_data); /* Custom allocation function. */ + void (*dealloc_cb)(void* ptr, size_t size, void* allocator_data); /* Custom deallocation function. */ + void* allocator_data; /* User data pointer passed to `alloc`/`dealloc` allocation functions. */ size_t storage_size; /* Coroutine storage size, to be used with the storage APIs. */ /* These must be initialized only through `mco_init_desc`. */ size_t coro_size; /* Coroutine structure size. */ @@ -403,7 +451,12 @@ extern "C" { /* Default stack size when creating a coroutine. */ #ifndef MCO_DEFAULT_STACK_SIZE -#define MCO_DEFAULT_STACK_SIZE 57344 /* Don't use multiples of 64K to avoid D-cache aliasing conflicts. */ +/* Use multiples of 64KB minus 8KB, because 8KB is reserved for coroutine internal structures. */ +#ifdef MCO_USE_VMEM_ALLOCATOR +#define MCO_DEFAULT_STACK_SIZE 2040*1024 /* 2040KB, nearly the same stack size of a thread in x86_64 Linux. */ +#else +#define MCO_DEFAULT_STACK_SIZE 56*1024 /* 56KB */ +#endif #endif /* Number used only to assist checking for stack overflows. 
*/ @@ -508,21 +561,61 @@ extern "C" { #endif #endif -#ifndef MCO_NO_DEFAULT_ALLOCATORS -#ifndef MCO_MALLOC - #include - #define MCO_MALLOC malloc - #define MCO_FREE free +#if defined(_WIN32) && (defined(MCO_USE_FIBERS) || defined(MCO_USE_VMEM_ALLOCATOR)) + #ifndef _WIN32_WINNT + #define _WIN32_WINNT 0x0400 + #endif + #ifndef WIN32_LEAN_AND_MEAN + #define WIN32_LEAN_AND_MEAN + #endif + #include #endif -static void* mco_malloc(size_t size, void* allocator_data) { - _MCO_UNUSED(allocator_data); - return MCO_MALLOC(size); -} -static void mco_free(void* ptr, void* allocator_data) { - _MCO_UNUSED(allocator_data); - MCO_FREE(ptr); -} -#endif /* MCO_NO_DEFAULT_ALLOCATORS */ + +#ifndef MCO_NO_DEFAULT_ALLOCATOR + #if defined(MCO_USE_VMEM_ALLOCATOR) && defined(_WIN32) + static void* mco_alloc(size_t size, void* allocator_data) { + _MCO_UNUSED(allocator_data); + return VirtualAlloc(NULL, size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); + } + static void mco_dealloc(void* ptr, size_t size, void* allocator_data) { + _MCO_UNUSED(allocator_data); + _MCO_UNUSED(size); + int res = VirtualFree(ptr, 0, MEM_RELEASE); + _MCO_UNUSED(res); + MCO_ASSERT(res != 0); + } + #elif defined(MCO_USE_VMEM_ALLOCATOR) /* POSIX virtual memory allocator */ + #include + static void* mco_alloc(size_t size, void* allocator_data) { + _MCO_UNUSED(allocator_data); + void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + return ptr != MAP_FAILED ? ptr : NULL; + } + static void mco_dealloc(void* ptr, size_t size, void* allocator_data) { + _MCO_UNUSED(allocator_data); + int res = munmap(ptr, size); + _MCO_UNUSED(res); + MCO_ASSERT(res == 0); + } + #else /* C allocator */ + #ifndef MCO_ALLOC + #include + /* We use calloc() so we give a chance for the OS to reserve virtual memory without really using physical memory, + calloc() also has the nice property of initializing the stack to zeros. 
*/ + #define MCO_ALLOC(size) calloc(1, size) + #define MCO_DEALLOC(ptr, size) free(ptr) + #endif + static void* mco_alloc(size_t size, void* allocator_data) { + _MCO_UNUSED(allocator_data); + return MCO_ALLOC(size); + } + static void mco_dealloc(void* ptr, size_t size, void* allocator_data) { + _MCO_UNUSED(size); + _MCO_UNUSED(allocator_data); + MCO_DEALLOC(ptr, size); + } + #endif /* MCO_USE_VMEM_ALLOCATOR */ +#endif /* MCO_NO_DEFAULT_ALLOCATOR */ #if defined(__has_feature) #if __has_feature(address_sanitizer) @@ -1331,13 +1424,9 @@ static mco_result _mco_create_context(mco_coro* co, mco_desc* desc) { memset(context, 0, sizeof(_mco_context)); /* Initialize storage. */ unsigned char* storage = (unsigned char*)storage_addr; - memset(storage, 0, desc->storage_size); /* Initialize stack. */ void *stack_base = (void*)stack_addr; size_t stack_size = desc->stack_size; -#ifdef MCO_ZERO_MEMORY - memset(stack_base, 0, stack_size); -#endif /* Make the context. */ mco_result res = _mco_makectx(co, &context->ctx, stack_base, stack_size); if(res != MCO_SUCCESS) { @@ -1382,14 +1471,6 @@ static MCO_FORCE_INLINE void _mco_init_desc_sizes(mco_desc* desc, size_t stack_s #ifdef _WIN32 -#ifndef _WIN32_WINNT -#define _WIN32_WINNT 0x0400 -#endif -#ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#endif -#include - typedef struct _mco_context { void* fib; void* back_fib; @@ -1443,7 +1524,6 @@ static mco_result _mco_create_context(mco_coro* co, mco_desc* desc) { memset(context, 0, sizeof(_mco_context)); /* Initialize storage. */ unsigned char* storage = (unsigned char*)storage_addr; - memset(storage, 0, desc->storage_size); /* Create the fiber. */ _mco_fiber* fib = (_mco_fiber*)CreateFiberEx(desc->stack_size, desc->stack_size, FIBER_FLAG_FLOAT_SWITCH, _mco_wrap_main, co); if(!fib) { @@ -1532,16 +1612,11 @@ static mco_result _mco_create_context(mco_coro* co, mco_desc* desc) { memset(context, 0, sizeof(_mco_context)); /* Initialize storage. 
*/ unsigned char* storage = (unsigned char*)storage_addr; - memset(storage, 0, desc->storage_size); /* Initialize stack. */ void *stack_base = (void*)stack_addr; size_t stack_size = asyncify_stack_addr - stack_addr; void *asyncify_stack_base = (void*)asyncify_stack_addr; size_t asyncify_stack_size = co_addr + desc->coro_size - asyncify_stack_addr; -#ifdef MCO_ZERO_MEMORY - memset(stack_base, 0, stack_size); - memset(asyncify_stack_base, 0, asyncify_stack_size); -#endif /* Create the fiber. */ emscripten_fiber_init(&context->fib, _mco_wrap_main, co, stack_base, stack_size, asyncify_stack_base, asyncify_stack_size); co->context = context; @@ -1639,13 +1714,9 @@ static mco_result _mco_create_context(mco_coro* co, mco_desc* desc) { memset(context, 0, sizeof(_mco_context)); /* Initialize storage. */ unsigned char* storage = (unsigned char*)storage_addr; - memset(storage, 0, desc->storage_size); /* Initialize stack. */ void *stack_base = (void*)stack_addr; size_t stack_size = desc->stack_size; -#ifdef MCO_ZERO_MEMORY - memset(stack_base, 0, stack_size); -#endif context->stack_region.start = stack_base; context->stack_region.limit = (void*)((size_t)stack_base + stack_size); co->context = context; @@ -1686,10 +1757,10 @@ mco_desc mco_desc_init(void (*func)(mco_coro* co), size_t stack_size) { stack_size = _mco_align_forward(stack_size, 16); /* Stack size should be aligned to 16 bytes. */ mco_desc desc; memset(&desc, 0, sizeof(mco_desc)); -#ifndef MCO_NO_DEFAULT_ALLOCATORS +#ifndef MCO_NO_DEFAULT_ALLOCATOR /* Set default allocators. */ - desc.malloc_cb = mco_malloc; - desc.free_cb = mco_free; + desc.alloc_cb = mco_alloc; + desc.dealloc_cb = mco_dealloc; #endif desc.func = func; desc.storage_size = MCO_DEFAULT_STORAGE_SIZE; @@ -1732,7 +1803,8 @@ mco_result mco_init(mco_coro* co, mco_desc* desc) { if(res != MCO_SUCCESS) return res; co->state = MCO_SUSPENDED; /* We initialize in suspended state. 
*/ - co->free_cb = desc->free_cb; + co->dealloc_cb = desc->dealloc_cb; + co->coro_size = desc->coro_size; co->allocator_data = desc->allocator_data; co->func = desc->func; co->user_data = desc->user_data; @@ -1771,13 +1843,13 @@ mco_result mco_create(mco_coro** out_co, mco_desc* desc) { MCO_LOG("coroutine output pointer is NULL"); return MCO_INVALID_POINTER; } - if(!desc || !desc->malloc_cb || !desc->free_cb) { + if(!desc || !desc->alloc_cb || !desc->dealloc_cb) { *out_co = NULL; MCO_LOG("coroutine allocator description is not set"); return MCO_INVALID_ARGUMENTS; } /* Allocate the coroutine. */ - mco_coro* co = (mco_coro*)desc->malloc_cb(desc->coro_size, desc->allocator_data); + mco_coro* co = (mco_coro*)desc->alloc_cb(desc->coro_size, desc->allocator_data); if(!co) { MCO_LOG("coroutine allocation failed"); *out_co = NULL; @@ -1786,7 +1858,7 @@ mco_result mco_create(mco_coro** out_co, mco_desc* desc) { /* Initialize the coroutine. */ mco_result res = mco_init(co, desc); if(res != MCO_SUCCESS) { - desc->free_cb(co, desc->allocator_data); + desc->dealloc_cb(co, desc->coro_size, desc->allocator_data); *out_co = NULL; return res; } @@ -1804,11 +1876,11 @@ mco_result mco_destroy(mco_coro* co) { if(res != MCO_SUCCESS) return res; /* Free the coroutine. 
*/ - if(!co->free_cb) { + if(!co->dealloc_cb) { MCO_LOG("attempt destroy a coroutine that has no free callback"); return MCO_INVALID_POINTER; } - co->free_cb(co, co->allocator_data); + co->dealloc_cb(co, co->coro_size, co->allocator_data); return MCO_SUCCESS; } @@ -2030,7 +2102,7 @@ For more information, please refer to =============================================================================== ALTERNATIVE 2 - MIT No Attribution =============================================================================== -Copyright (c) 2021-2022 Eduardo Bart (https://github.com/edubart/minicoro) +Copyright (c) 2021-2023 Eduardo Bart (https://github.com/edubart/minicoro) Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in