Skip to content

Commit

Permalink
Completed benchmarks. Should now be ready to write the WG14 paper.
Browse files Browse the repository at this point in the history
  • Loading branch information
ned14 committed Feb 21, 2025
1 parent 23d2a1f commit 24bdc4d
Show file tree
Hide file tree
Showing 11 changed files with 449 additions and 20 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ jobs:

- name: CMake tests Linux
shell: bash
run: cd build && ctest --output-on-failure --timeout 300
run: cd build && ctest --output-on-failure --timeout 300 -E benchmark

MacOS:
name: Mac OS
Expand Down Expand Up @@ -68,7 +68,7 @@ jobs:

- name: CMake tests Mac OS
shell: bash
run: cd build && ctest --output-on-failure --timeout 300
run: cd build && ctest --output-on-failure --timeout 300 -E benchmark

Windows:
name: Windows VS2022
Expand Down Expand Up @@ -97,4 +97,4 @@ jobs:

- name: CMake tests WinVS2022
shell: bash
run: cd build && ctest -C ${{ matrix.debug }} --output-on-failure --timeout 300
run: cd build && ctest -C ${{ matrix.debug }} --output-on-failure --timeout 300 -E benchmark
36 changes: 34 additions & 2 deletions Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,38 @@ loaded.
and so the fallback hash table is always used on those platforms, which
is unfortunate.
## Todo
## Performance
- Benchmark everything so can quote perf in WG14 paper.
On my Threadripper 5975WX, which is a 3.6 GHz processor bursting to 4.5 GHz,
on Linux:
- `tss_async_signal_safe_get()` which implements an async signal safe
thread local storage using a hash table costs about 29 nanoseconds, so
maybe 130 clock cycles.
With `WG14_SIGNALS_HAVE_ASYNC_SAFE_THREAD_LOCAL=1` (the default on Windows,
Linux, and other ELF based platforms):
- `thrd_signal_invoke()` which invokes a function which thread locally
handles any signals raised costs about 31 nanoseconds (140 clock cycles)
for the happy case (most of this is the cost of `_setjmp()` on this platform
and glibc).
- A globally installed signal decider takes about 29 nanoseconds (130
clock cycles) to reach (there is a CAS lock-unlock sequence needed).
With `WG14_SIGNALS_HAVE_ASYNC_SAFE_THREAD_LOCAL=0` (the default on Mac OS):
- `thrd_signal_invoke()` which invokes a function which thread locally
handles any signals raised costs about 45 nanoseconds for the happy case.
- A globally installed signal decider takes about 55 nanoseconds to reach.
## Todo
- Global signal deciders are still racy with respect to modification during
invocation. Given that they execute in an async signal unsafe situation,
not sure how much I care.
2 changes: 2 additions & 0 deletions include/wg14_signals/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ See the License for the specific language governing permissions and
limitations under the License.
*/

// #define WG14_SIGNALS_HAVE_ASYNC_SAFE_THREAD_LOCAL 0

#ifndef WG14_SIGNALS_CONFIG_H
#define WG14_SIGNALS_CONFIG_H

Expand Down
2 changes: 1 addition & 1 deletion include/wg14_signals/current_thread_id.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,10 @@ extern "C"
//! \brief The type of a thread id
typedef uintptr_t WG14_SIGNALS_PREFIX(thread_id_t);

#if WG14_SIGNALS_HAVE_ASYNC_SAFE_THREAD_LOCAL
static const WG14_SIGNALS_PREFIX(thread_id_t)
WG14_SIGNALS_PREFIX(thread_id_t_tombstone) = 0;

#if WG14_SIGNALS_HAVE_ASYNC_SAFE_THREAD_LOCAL
#ifdef _WIN32
static
#else
Expand Down
35 changes: 24 additions & 11 deletions include/wg14_signals/thrd_signal_handle.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,6 @@ typedef int WG14_SIGNALS_PREFIX(thrd_raised_signal_error_code_t);
*/
struct WG14_SIGNALS_PREFIX(thrd_raised_signal_info)
{
const jmp_buf
*buf; //!< setjmp() buffer written on entry to guarded section
int signo; //!< The signal raised

//! The system specific error code for this signal, the `si_errno` code
Expand Down Expand Up @@ -139,17 +137,25 @@ typedef int WG14_SIGNALS_PREFIX(thrd_raised_signal_error_code_t);
#pragma warning(push)
#pragma warning(disable : 4190) // C-linkage with UDTs
#endif
/*! \brief Installs a thread-local signal guard for the calling thread, and
calls the guarded function `guarded`.
/*! \brief THREADSAFE USUALLY ASYNC-SIGNAL-SAFE Installs a thread-local signal
guard for the calling thread, and calls the guarded function `guarded`.
\return The value returned by `guarded`, or `recovery`.
\param signals The set of signals to guard against.
\param guarded A function whose execution is to be guarded against signal
raises.
\param recovery A function to be called if a signal is raised.
\param decider A function to be called to decide whether to recover from the
signal and continue the execution of the guarded routine, or to abort and call
the recovery routine.
\param decider A function to be called to decide whether to
recover from the signal and continue the execution of the guarded routine, or
to abort and call the recovery routine.
\param value A value to supply to the guarded routine.
By "usually async signal safe" we mean that if any function from this library
has already been called from the calling thread, this is async signal
safe. If you need to set up this library for a calling thread without doing
anything else, call `thrd_signal_raise(0, nullptr, nullptr)`: this will
ensure the calling thread's thread local state is set up, and will return
immediately doing nothing else.
*/
WG14_SIGNALS_EXTERN union WG14_SIGNALS_PREFIX(thrd_raised_signal_info_value)
WG14_SIGNALS_PREFIX(thrd_signal_invoke)(
Expand All @@ -161,9 +167,9 @@ typedef int WG14_SIGNALS_PREFIX(thrd_raised_signal_error_code_t);
#pragma warning(pop)
#endif

/*! \brief Call OUR currently installed signal decider for a signal (POSIX),
or raise a Win32 structured exception (Windows), returning false if we have no
decider installed for that signal.
/*! \brief THREADSAFE USUALLY ASYNC-SIGNAL-SAFE Call OUR currently installed
signal decider for a signal (POSIX), or raise a Win32 structured exception
(Windows), returning false if we have no decider installed for that signal.
Note that on POSIX, we fetch OUR currently installed signal decider and call
it directly. This allows us to supply custom `raw_info` and `raw_context`.
Expand All @@ -181,6 +187,12 @@ typedef int WG14_SIGNALS_PREFIX(thrd_raised_signal_error_code_t);
On Windows, Win32 structured exceptions are capable of being used directly and
so we do on that platform always call `RaiseException()`.
By "usually async signal safe" we mean that if any function from this library
has already been called from the calling thread, this is async signal
safe. If you need to set up this library for a calling thread without doing
anything else, specify zero for `signo`: this will ensure the calling thread's
thread local state is set up, and will return immediately doing nothing else.
*/
WG14_SIGNALS_EXTERN bool
WG14_SIGNALS_PREFIX(thrd_signal_raise)(int signo, void *raw_info,
Expand Down Expand Up @@ -227,7 +239,8 @@ typedef int WG14_SIGNALS_PREFIX(thrd_raised_signal_error_code_t);
/*! \brief THREADSAFE NOT REENTRANT Create a global signal continuation
decider. Threadsafe with respect to other calls of this function, but not
reentrant i.e. modifying the global signal continuation decider registry
whilst inside a global signal continuation decider is racy. Called after all
whilst inside a global signal continuation decider is racy, and in any case
definitely not async signal handler safe. Called after all
thread local handling is exhausted. Note that what you can safely do in the
decider function is extremely limited, only async signal safe functions may be
called.
Expand Down
5 changes: 5 additions & 0 deletions src/wg14_signals/thrd_signal_handle_posix.c
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,11 @@ bool WG14_SIGNALS_PREFIX(thrd_signal_raise)(int signo, void *raw_info,
{
return false;
}
if(signo == 0)
{
// Caller is doing the non-async safe setup
return false;
}
struct thrd_signal_global_state_tss_state_t *tss =
thrd_signal_global_tss_state();
struct thrd_signal_global_state_tss_state_per_frame_t *frame = tss->front;
Expand Down
21 changes: 18 additions & 3 deletions src/wg14_signals/tss_async_signal_safe.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,20 @@ struct WG14_SIGNALS_PREFIX(tss_async_signal_safe)
thread_id_to_tls_map_t thread_id_to_tls_map;
};

// Keep a local cache of the current thread id, if thread locals aren't async
// signal safe on this platform it doesn't matter as we'll ensure it is
// initialised from outside the signal handler
static WG14_SIGNALS_PREFIX(thread_id_t) my_current_thread_id(void)
{
static _Thread_local WG14_SIGNALS_PREFIX(thread_id_t)
current_thread_id_mycache;
if(current_thread_id_mycache == WG14_SIGNALS_PREFIX(thread_id_t_tombstone))
{
current_thread_id_mycache = WG14_SIGNALS_PREFIX(current_thread_id)();
}
return current_thread_id_mycache;
}

int WG14_SIGNALS_PREFIX(tss_async_signal_safe_create)(
WG14_SIGNALS_PREFIX(tss_async_signal_safe) * val,
const struct WG14_SIGNALS_PREFIX(tss_async_signal_safe_attr) * attr)
Expand Down Expand Up @@ -94,7 +108,7 @@ struct deinit_state *state)
(struct WG14_SIGNALS_PREFIX(tss_async_signal_safe) *) state->val;
if(mem != WG14_SIGNALS_NULLPTR)
{
const uint64_t mytid = WG14_SIGNALS_PREFIX(current_thread_id)();
const uint64_t mytid = my_current_thread_id();
LOCK(mem->lock);
thread_id_to_tls_map_t_itr it =
thread_id_to_tls_map_t_get(&mem->thread_id_to_tls_map, mytid);
Expand Down Expand Up @@ -123,7 +137,8 @@ WG14_SIGNALS_PREFIX(tss_async_signal_safe) val)
{
struct WG14_SIGNALS_PREFIX(tss_async_signal_safe) *mem =
(struct WG14_SIGNALS_PREFIX(tss_async_signal_safe) *) val;
const uint64_t mytid = WG14_SIGNALS_PREFIX(current_thread_id)();
// This will force init the TLS from outside a signal handler
const uint64_t mytid = my_current_thread_id();
LOCK(mem->lock);
thread_id_to_tls_map_t_itr it =
thread_id_to_tls_map_t_get(&mem->thread_id_to_tls_map, mytid);
Expand Down Expand Up @@ -175,7 +190,7 @@ WG14_SIGNALS_PREFIX(tss_async_signal_safe) val)
{
struct WG14_SIGNALS_PREFIX(tss_async_signal_safe) *mem =
(struct WG14_SIGNALS_PREFIX(tss_async_signal_safe) *) val;
const uint64_t mytid = WG14_SIGNALS_PREFIX(current_thread_id)();
const uint64_t mytid = my_current_thread_id();
void *ret = WG14_SIGNALS_NULLPTR;
LOCK(mem->lock);
thread_id_to_tls_map_t_itr it =
Expand Down
3 changes: 3 additions & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
add_code_test(async_signal_safe_tls_test SOURCES "async_signal_safe_tls_test.c" FEATURES c_std_11)
add_code_test(benchmark_async_signal_safe_tls_test SOURCES "benchmark_async_signal_safe_tls_test.c" FEATURES c_std_11)

add_code_test(thrd_signal_handle_test SOURCES "thrd_signal_handle_test.c" FEATURES c_std_11)
add_code_test(benchmark_thrd_signal_handle_test SOURCES "benchmark_thrd_signal_handle_test.c" FEATURES c_std_11)
79 changes: 79 additions & 0 deletions test/benchmark_async_signal_safe_tls_test.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#include "test_common.h"

#include "ticks_clock.h"

#include "wg14_signals/tss_async_signal_safe.h"

#include <stdatomic.h>

#define STRINGIZE2(x) #x
#define STRINGIZE(x) STRINGIZE2(x)

static unsigned storage[2] = {5, 6};
static unsigned *storage_ptr = storage;
static int create(void **dest)
{
*dest = storage_ptr++;
return 0;
}
static int destroy(void *dest)
{
return 0;
}

struct shared_t
{
WG14_SIGNALS_PREFIX(tss_async_signal_safe) tls;
} shared;

int main()
{
int ret = 0;
struct WG14_SIGNALS_PREFIX(tss_async_signal_safe_attr)
attr = {.create = create, .destroy = destroy};
CHECK(-1 !=
WG14_SIGNALS_PREFIX(tss_async_signal_safe_create)(&shared.tls, &attr));
CHECK(-1 !=
WG14_SIGNALS_PREFIX(tss_async_signal_safe_thread_init)(shared.tls));
{
volatile unsigned *val =
(unsigned *) WG14_SIGNALS_PREFIX(tss_async_signal_safe_get)(shared.tls);
if(val == WG14_SIGNALS_NULLPTR)
{
abort();
}
}
puts("Preparing benchmark ...");
{
const ns_count begin = get_ns_count();
ns_count end = begin;
do
{
} while(end = get_ns_count(), end - begin < 1000000000);
}
puts("Running benchmark ...");
const ns_count begin = get_ns_count();
ns_count end = begin;
cpu_ticks_count ticks = 0, ops = 0;
do
{
for(size_t n = 0; n < 65536; n++)
{
cpu_ticks_count s = get_ticks_count(memory_order_relaxed);
volatile unsigned *val =
(unsigned *) WG14_SIGNALS_PREFIX(tss_async_signal_safe_get)(shared.tls);
cpu_ticks_count e = get_ticks_count(memory_order_relaxed);
ticks += e - s;
ops++;
}
} while(end = get_ns_count(), end - begin < 3000000000);
printf(
"\nOn this platform (WG14_SIGNALS_HAVE_ASYNC_SAFE_THREAD_LOCAL "
"= " STRINGIZE(WG14_SIGNALS_HAVE_ASYNC_SAFE_THREAD_LOCAL) "), tss_async_signal_safe_get() "
"takes %f nanoseconds.\n\n",
(double) ticks / (double) ops);

CHECK(-1 != WG14_SIGNALS_PREFIX(tss_async_signal_safe_destroy)(shared.tls));
printf("Exiting main with result %d ...\n", ret);
return ret;
}
Loading

0 comments on commit 24bdc4d

Please sign in to comment.