Skip to content

Commit

Permalink
Build fixes for ARM64 Windows.
Browse files Browse the repository at this point in the history
  • Loading branch information
ned14 committed Feb 21, 2025
1 parent 24bdc4d commit 3dba1b3
Show file tree
Hide file tree
Showing 5 changed files with 43 additions and 8 deletions.
5 changes: 5 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,11 @@ if(PROJECT_IS_TOP_LEVEL AND (NOT DEFINED BUILD_TESTING OR BUILD_TESTING))
function(add_code_example target)
cmake_parse_arguments(ADD_EXAMPLE "" "" "SOURCES;FEATURES;PROPERTIES" ${ARGN})
add_executable(${target} ${ADD_EXAMPLE_SOURCES})
if(WIN32)
target_compile_options(${target} PRIVATE /W4 /experimental:c11atomics)
else()
target_compile_options(${target} PRIVATE -Wall -Wextra -Wpedantic)
endif()
target_compile_features(${target} PRIVATE ${ADD_EXAMPLE_FEATURES})
target_link_libraries(${target} PRIVATE ${PROJECT_NAME})
set_target_properties(${target} PROPERTIES
Expand Down
15 changes: 13 additions & 2 deletions Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,7 @@ is unfortunate.
## Performance
On my Threadripper 5975WX, which is a 3.6 GHz processor bursting to 4.5 GHz,
on Linux:
### On my Threadripper 5975WX, which is a 3.6 GHz processor bursting to 4.5 GHz, on Linux:
- `tss_async_signal_safe_get()` which implements an async signal safe
thread local storage using a hash table costs about 29 nanoseconds, so
Expand All @@ -100,6 +99,18 @@ handles any signals raised costs about 45 nanoseconds for the happy case.
- A globally installed signal decider takes about 55 nanoseconds to reach.
### On a MacBook Pro M3 running ARM64 Windows within a VM
- `tss_async_signal_safe_get()` which implements an async signal safe
thread local storage using a hash table costs about 22 nanoseconds, so
maybe 130 clock cycles.
- `thrd_signal_invoke()` which invokes a function which thread locally
handles any signals raised costs about 17 nanoseconds (this is Windows
Structured Exception Handling, not our library code).
- A globally installed signal decider takes about 7,372 nanoseconds to reach
(this is also Windows code, not our library code; it is a shame it is so slow).
# Todo
Expand Down
6 changes: 5 additions & 1 deletion test/benchmark_async_signal_safe_tls_test.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ static int create(void **dest)
}
static int destroy(void *dest)
{
(void) dest;
return 0;
}

Expand Down Expand Up @@ -51,6 +52,8 @@ int main()
{
} while(end = get_ns_count(), end - begin < 1000000000);
}
const cpu_ticks_count ticks_per_sec = ticks_per_second();
printf("There are %llu ticks per second.\n", ticks_per_sec);
puts("Running benchmark ...");
const ns_count begin = get_ns_count();
ns_count end = begin;
Expand All @@ -62,6 +65,7 @@ int main()
cpu_ticks_count s = get_ticks_count(memory_order_relaxed);
volatile unsigned *val =
(unsigned *) WG14_SIGNALS_PREFIX(tss_async_signal_safe_get)(shared.tls);
(void) val;
cpu_ticks_count e = get_ticks_count(memory_order_relaxed);
ticks += e - s;
ops++;
Expand All @@ -71,7 +75,7 @@ int main()
"\nOn this platform (WG14_SIGNALS_HAVE_ASYNC_SAFE_THREAD_LOCAL "
"= " STRINGIZE(WG14_SIGNALS_HAVE_ASYNC_SAFE_THREAD_LOCAL) "), tss_async_signal_safe_get() "
"takes %f nanoseconds.\n\n",
(double) ticks / (double) ops);
(double) ticks / ((double) ticks_per_sec / 1000000000.0) / (double) ops);

CHECK(-1 != WG14_SIGNALS_PREFIX(tss_async_signal_safe_destroy)(shared.tls));
printf("Exiting main with result %d ...\n", ret);
Expand Down
7 changes: 5 additions & 2 deletions test/benchmark_thrd_signal_handle_test.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ sigill_recovery_func(const struct WG14_SIGNALS_PREFIX(thrd_raised_signal_info) *
/* Decider function for the benchmark: it never inspects the raised signal
   information and unconditionally returns true. */
static bool
sigill_decider_func(struct WG14_SIGNALS_PREFIX(thrd_raised_signal_info) * rsi)
{
  const bool handled = true;
  (void) rsi; /* cast to void marks the parameter as deliberately unused */
  return handled;  // handled
}
static union WG14_SIGNALS_PREFIX(thrd_raised_signal_info_value)
Expand Down Expand Up @@ -44,6 +45,8 @@ int main()
sigaddset(&guarded, SIGILL);
union WG14_SIGNALS_PREFIX(thrd_raised_signal_info_value)
value = {.int_value = 0};
const cpu_ticks_count ticks_per_sec = ticks_per_second();
printf("There are %llu ticks per second.\n", ticks_per_sec);

puts("Benchmarking thread local handling ...");
{
Expand All @@ -67,7 +70,7 @@ int main()
"\nOn this platform (WG14_SIGNALS_HAVE_ASYNC_SAFE_THREAD_LOCAL "
"= " STRINGIZE(WG14_SIGNALS_HAVE_ASYNC_SAFE_THREAD_LOCAL) "), thrd_signal_invoke() "
"takes %f nanoseconds.\n\n",
(double) ticks / (double) ops);
(double) ticks / ((double) ticks_per_sec / 1000000000.0) / (double) ops);
}

puts("Benchmarking global handling ...");
Expand All @@ -94,7 +97,7 @@ int main()
"= " STRINGIZE(WG14_SIGNALS_HAVE_ASYNC_SAFE_THREAD_LOCAL) "), invoking a globally "
"installed decider "
"takes %f nanoseconds.\n\n",
(double) ticks / (double) ops);
(double) ticks / ((double) ticks_per_sec / 1000000000.0) / (double) ops);
WG14_SIGNALS_PREFIX(signal_decider_destroy(sigill_decider));
}

Expand Down
18 changes: 15 additions & 3 deletions test/ticks_clock.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,18 @@ defined(_M_X64)
}
return (uint64_t) lo | ((uint64_t) hi << 32);
#endif
#elif defined(_MSC_VER) && !defined(__clang__)
(void) rel;
LARGE_INTEGER val;
if(!QueryPerformanceCounter(&val))
{
#if _WIN32_WINNT >= 0x600
return (cpu_ticks_count) GetTickCount64() * 1000000;
#else
return (cpu_ticks_count) GetTickCount() * 1000000;
#endif
}
return (cpu_ticks_count) val.QuadPart;
#elif defined(__aarch64__) || defined(_M_ARM64)
uint64_t value = 0;
switch(rel)
Expand All @@ -81,7 +93,7 @@ defined(_M_X64)
break;
case memory_order_acq_rel:
case memory_order_seq_cst:
__asm__ __volatile__("dsb; mrs %0, PMCCNTR_EL0; dsb"
__asm__ __volatile__("dsb\nmrs %0\nPMCCNTR_EL0; dsb"
: "=r"(value)); // NOLINT
break;
default:
Expand All @@ -98,7 +110,7 @@ defined(_M_X64)
{
#ifdef _WIN32
static double scalefactor;
if(!scalefactor)
if(scalefactor == 0.0)
{
LARGE_INTEGER ticksPerSec;
if(QueryPerformanceFrequency(&ticksPerSec))
Expand Down Expand Up @@ -164,7 +176,7 @@ defined(_M_X64)
ns_count ts2 = get_ns_count();
cpu_ticks_count count2b = get_ticks_count(memory_order_acq_rel);
results[n] = (double) (count2a + count2b - count1a - count1b) / 2.0 /
((double) ts2 - ts1) / 1000000000.0;
((double) ts2 - ts1) * 1000000000.0;
}
qsort(results, 10, sizeof(double), ticks_per_second_comp);
v = (cpu_ticks_count) ((results[4] + results[5]) / 2);
Expand Down

0 comments on commit 3dba1b3

Please sign in to comment.