From 08a45e564b33bf75df626479427d5fefbd8c921a Mon Sep 17 00:00:00 2001 From: Matthew Flatt Date: Fri, 12 Apr 2024 18:48:00 -0600 Subject: [PATCH] clean up memory fence and compare-and-swap kernel functions (#826) CAS variants with different fence properties in "atomic.h" ware a leftover from an early experiment in parallel garbage collection, and that experiment did not turn out to be a good idea. The CAS variants were left half-implemented, and they're not needed, so this commit simplifies to reflect the needed functionality. Also, the commit adds comments to clarify that CAS support is used only for pb mode and to explain trade-offs for inline assembly and compiler intrinsics. Small repairs to the build scripts enable pbarch builds on Windows, for what it's worth. Otherwise, this is just an internal clean-up and does not add anything new or fix any bugs. --- c/atomic.h | 91 +++++++++++++++++++++----------------------------- c/build.zuo | 6 ++-- c/pb.c | 4 +-- c/pb.h | 14 ++++---- c/schsig.c | 2 +- c/types.h | 11 ------ mats/build.zuo | 4 +-- 7 files changed, 53 insertions(+), 79 deletions(-) diff --git a/c/atomic.h b/c/atomic.h index 5ed70c6bc..9ceb2c153 100644 --- a/c/atomic.h +++ b/c/atomic.h @@ -1,39 +1,57 @@ +/* `STORE_FENCE` is used by the storage-management system, but + `ACQUIRE_FENCE`, `RELEASE_FENCE`, and `COMPARE_AND_SWAP_PTR` + are used only by the pb interpreter. + + It's always ok to map `ACQUIRE_FENCE` and `RELEASE_FENCE` to + `STORE_FENCE`. For portability, we mainly rely on a + `__sync_synchronize` intrinsic as provided by reasonably modern + versions of GCC and Clang. In some cases, more specialized fence + variants are available via inline assembly or platform-specific + intrinsics. When inline assembly is written only for `STORE_FENCE` + below, then the only advantage over using `__sync_synchronize` is + to support environments with different or very old compilers. + + For `COMPARE_AND_SWAP_PTR`, we similarlly rely on a GCC/Clang + `__sync_bool_compare_and_swap` intrinsic. Some inline-assembly + versions are here --- but, again, the only advantage of those is to + support environments with different or very old compilers. */ + #if !defined(PTHREADS) # define STORE_FENCE() do { } while (0) #elif defined(_MSC_VER) && defined(_M_ARM64) -# define STORE_FENCE() __dmb(_ARM64_BARRIER_ISHST) +# define STORE_FENCE() __dmb(_ARM64_BARRIER_ISHST) # define ACQUIRE_FENCE() __dmb(_ARM64_BARRIER_ISH) # define RELEASE_FENCE() ACQUIRE_FENCE() #elif defined(__arm64__) || defined(__aarch64__) -# define STORE_FENCE() __asm__ __volatile__ ("dmb ishst" : : : "memory") +# define STORE_FENCE() __asm__ __volatile__ ("dmb ishst" : : : "memory") # define ACQUIRE_FENCE() __asm__ __volatile__ ("dmb ish" : : : "memory") # define RELEASE_FENCE() ACQUIRE_FENCE() #elif defined(__arm__) # if (arm_isa_version >= 7) || (__ARM_ARCH >= 7) -# define STORE_FENCE() __asm__ __volatile__ ("dmb ishst" : : : "memory") +# define STORE_FENCE() __asm__ __volatile__ ("dmb ishst" : : : "memory") # define ACQUIRE_FENCE() __asm__ __volatile__ ("dmb ish" : : : "memory") # define RELEASE_FENCE() ACQUIRE_FENCE() # else -# define STORE_FENCE() __asm__ __volatile__ ("mcr p15, 0, %0, c7, c10, 5" : : "r" (0) : "memory") +# define STORE_FENCE() __asm__ __volatile__ ("mcr p15, 0, %0, c7, c10, 5" : : "r" (0) : "memory") # define ACQUIRE_FENCE() STORE_FENCE() # define RELEASE_FENCE() STORE_FENCE() # endif #elif defined(__powerpc64__) -# define STORE_FENCE() __asm__ __volatile__ ("lwsync" : : : "memory") +# define STORE_FENCE() __asm__ __volatile__ ("lwsync" : : : "memory") # define ACQUIRE_FENCE() __asm__ __volatile__ ("sync" : : : "memory") # define RELEASE_FENCE() ACQUIRE_FENCE() #elif defined(__powerpc__) || defined(__POWERPC__) -# define STORE_FENCE() __asm__ __volatile__ ("sync" : : : "memory") +# define STORE_FENCE() __asm__ __volatile__ ("sync" : : : "memory") # define ACQUIRE_FENCE() STORE_FENCE() # define RELEASE_FENCE() STORE_FENCE() #elif defined(__riscv) -# define STORE_FENCE() __asm__ __volatile__ ("fence w,rw" : : : "memory") +# define STORE_FENCE() __asm__ __volatile__ ("fence w,rw" : : : "memory") # define ACQUIRE_FENCE() __asm__ __volatile__ ("fence r,rw" : : : "memory") # define RELEASE_FENCE() __asm__ __volatile__ ("fence rw,r" : : : "memory") #elif defined(__loongarch64) -# define STORE_FENCE() __asm__ __volatile__ ("dbar 0" : : : "memory") -# define ACQUIRE_FENCE() __asm__ __volatile__ ("dbar 0" : : : "memory") -# define RELEASE_FENCE() __asm__ __volatile__ ("dbar 0" : : : "memory") +# define STORE_FENCE() __asm__ __volatile__ ("dbar 0" : : : "memory") +# define ACQUIRE_FENCE() STORE_FENCE() +# define RELEASE_FENCE() STORE_FENCE() #elif (__GNUC__ >= 5) || defined(__clang__) # define STORE_FENCE() __sync_synchronize() # define ACQUIRE_FENCE() STORE_FENCE() @@ -50,40 +68,22 @@ #endif #if !defined(PTHREADS) -# define CAS_ANY_FENCE(a, old, new) ((*(ptr *)(a) == TO_PTR(old)) ? (*(ptr)(a) = TO_PTR(new), 1) : 0) +# define COMPARE_AND_SWAP_PTR(a, old, new) ((*(ptr *)(a) == TO_PTR(old)) ? (*(ptr)(a) = TO_PTR(new), 1) : 0) #elif defined(_MSC_VER) # if ptr_bits == 64 -# define CAS_ANY_FENCE(a, old, new) (_InterlockedCompareExchange64((__int64 *)(a), (__int64)(new), (__int64)(old)) == (__int64)(old)) +# define COMPARE_AND_SWAP_PTR(a, old, new) (_InterlockedCompareExchange64((__int64 *)(a), (__int64)(new), (__int64)(old)) == (__int64)(old)) # else -# define CAS_ANY_FENCE(a, old, new) (_InterlockedCompareExchange64((long *)(a), (long)(new), (long)(old)) == (long)(old)) +# define COMPARE_AND_SWAP_PTR(a, old, new) (_InterlockedCompareExchange((long *)(a), (long)(new), (long)(old)) == (long)(old)) # endif #elif defined(__arm64__) || defined(__aarch64__) -FORCEINLINE int CAS_LOAD_ACQUIRE(volatile void *addr, void *old_val, void *new_val) { +FORCEINLINE int COMPARE_AND_SWAP_PTR(volatile void *addr, void *old_val, void *new_val) { I64 ret; __asm__ __volatile__ ("mov %0, #0\n\t" - "0:\n\t" - "ldaxr x12, [%1, #0]\n\t" - "cmp x12, %2\n\t" - "bne 1f\n\t" - "stxr w7, %3, [%1, #0]\n\t" - "cmp x7, #0\n\t" - "bne 1f\n\t" - "mov %0, #1\n\t" - "1:\n\t" - : "=&r" (ret) - : "r" (addr), "r" (old_val), "r" (new_val) - : "cc", "memory", "x12", "x7"); - return ret; -} -/* same as above, but ldaxr -> ldxr and stxr -> stlxr */ -FORCEINLINE int CAS_STORE_RELEASE(volatile void *addr, void *old_val, void *new_val) { - I64 ret; - __asm__ __volatile__ ("mov %0, #0\n\t" "0:\n\t" "ldxr x12, [%1, #0]\n\t" "cmp x12, %2\n\t" "bne 1f\n\t" - "stlxr w7, %3, [%1, #0]\n\t" + "stxr w7, %3, [%1, #0]\n\t" "cmp x7, #0\n\t" "bne 1f\n\t" "mov %0, #1\n\t" @@ -94,12 +94,8 @@ FORCEINLINE int CAS_STORE_RELEASE(volatile void *addr, void *old_val, void *new_ return ret; } #elif defined(__arm__) && ((arm_isa_version >= 6) || (__ARM_ARCH >= 6)) -FORCEINLINE int S_cas_any_fence(int load_acquire, volatile void *addr, void *old_val, void *new_val) { +FORCEINLINE int COMPARE_AND_SWAP_PTR(volatile void *addr, void *old_val, void *new_val) { int ret; - if (load_acquire) - ACQUIRE_FENCE(); - else - RELEASE_FENCE(); __asm__ __volatile__ ("mov %0, #0\n\t" "0:\n\t" "ldrex r12, [%1]\n\t" @@ -116,17 +112,15 @@ FORCEINLINE int S_cas_any_fence(int load_acquire, volatile void *addr, void *old : "cc", "memory", "r12", "r7"); return ret; } -# define CAS_LOAD_ACQUIRE(a, old, new) S_cas_any_fence(1, a, old, new) -# define CAS_STORE_RELEASE(a, old, new) S_cas_any_fence(0, a, old, new) #elif (__GNUC__ >= 5) || defined(__clang__) -# define CAS_ANY_FENCE(a, old, new) __sync_bool_compare_and_swap((ptr *)(a), TO_PTR(old), TO_PTR(new)) +# define COMPARE_AND_SWAP_PTR(a, old, new) __sync_bool_compare_and_swap((ptr *)(a), TO_PTR(old), TO_PTR(new)) #elif defined(__i386__) || defined(__x86_64__) # if ptr_bits == 64 # define CAS_OP_SIZE "q" # else # define CAS_OP_SIZE "" # endif -FORCEINLINE int S_cas_any_fence(volatile void *addr, void *old_val, void *new_val) { +FORCEINLINE int COMPARE_AND_SWAP_PTR(volatile void *addr, void *old_val, void *new_val) { char result; __asm__ __volatile__("lock; cmpxchg" CAS_OP_SIZE " %3, %0; setz %1" : "=m"(*(void **)addr), "=q"(result) @@ -134,9 +128,8 @@ FORCEINLINE int S_cas_any_fence(volatile void *addr, void *old_val, void *new_va : "memory"); return (int) result; } -# define CAS_ANY_FENCE(a, old, new) S_cas_any_fence(a, old, new) #elif defined(__powerpc64__) -FORCEINLINE int S_cas_any_fence(volatile void *addr, void *old_val, void *new_val) { +FORCEINLINE int COMPARE_AND_SWAP_PTR(volatile void *addr, void *old_val, void *new_val) { int ret, tmp; __asm__ __volatile__ ("li %0, 0\n\t" "0:\n\t" @@ -152,18 +145,10 @@ FORCEINLINE int S_cas_any_fence(volatile void *addr, void *old_val, void *new_va : "cc", "memory"); return ret; } -# define CAS_ANY_FENCE(a, old, new) S_cas_any_fence(a, old, new) #elif defined(__riscv) # error expected a compiler with a CAS intrinsic for RISC-V #elif defined(__loongarch64) # error expected a compiler with a CAS intrinsic for LoongArch64 #else -# define CAS_ANY_FENCE(a, old, new) ((*(ptr *)(a) == TO_PTR(old)) ? (*(ptr *)(a) = TO_PTR(new), 1) : 0) -#endif - -#ifdef CAS_ANY_FENCE -# define CAS_LOAD_ACQUIRE(a, old, new) CAS_ANY_FENCE(a, old, new) -# define CAS_STORE_RELEASE(a, old, new) CAS_ANY_FENCE(a, old, new) -#else -# define CAS_ANY_FENCE(a, old, new) CAS_LOAD_ACQUIRE(a, old, new) +# define COMPARE_AND_SWAP_PTR(a, old, new) ((*(ptr *)(a) == TO_PTR(old)) ? (*(ptr *)(a) = TO_PTR(new), 1) : 0) #endif diff --git a/c/build.zuo b/c/build.zuo index ae20d148c..b9b5f6d67 100644 --- a/c/build.zuo +++ b/c/build.zuo @@ -504,14 +504,14 @@ " /DSCHEME_STATIC") " /Ox /W3 /DWIN32 /D_CRT_SECURE_NO_WARNINGS" " /Zi /FS /Fd" (string->shell (at-dir "scheme.pdb")) - (if (glob-match? "pb*" (hash-ref config 'm)) + (if (glob-match? "pb*" (m->arch (hash-ref config 'm))) " /DFEATURE_WINDOWS" "")))] [config (hash-set config 'LDFLAGS (if (as-runtime-dll? config) "/MD" "/MT"))] [config (hash-set config 'LIBS "rpcrt4.lib ole32.lib advapi32.lib User32.lib")]) config)] - [(and (eq? 'windows (system-type)) - (glob-match? "pb*" (hash-ref config 'm))) + [(and (glob-match? "*nt" (hash-ref config 'defaultm "")) + (glob-match? "pb*" (m->arch (hash-ref config 'm)))) (config-merge config 'CFLAGS "-DFEATURE_WINDOWS")] [else config])) diff --git a/c/pb.c b/c/pb.c index 2fe29d026..d588d4bb3 100644 --- a/c/pb.c +++ b/c/pb.c @@ -502,7 +502,7 @@ ptr *S_get_call_arena(ptr tc) { #if defined(PTHREADS) void S_pb_spinlock(void *addr) { while (1) { - if (CAS_ANY_FENCE(addr, TO_VOIDP(0), TO_VOIDP(1))) + if (COMPARE_AND_SWAP_PTR(addr, TO_VOIDP(0), TO_VOIDP(1))) break; } } @@ -511,7 +511,7 @@ int S_pb_locked_adjust(void *addr, int delta) { while (1) { uptr oldv = *(uptr *)addr; uptr newv = oldv + delta; - if (CAS_ANY_FENCE(addr, TO_VOIDP(oldv), TO_VOIDP(newv))) + if (COMPARE_AND_SWAP_PTR(addr, TO_VOIDP(oldv), TO_VOIDP(newv))) return newv == 0; } } diff --git a/c/pb.h b/c/pb.h index 4eba73d80..ddbaea03a 100644 --- a/c/pb.h +++ b/c/pb.h @@ -956,10 +956,10 @@ enum { *(float *)TO_VOIDP(regs[reg] + imm) = (float)fpregs[dest] #if defined(PTHREADS) -# define CAS_ANY_FENCE_SEQOK(addr, old_r, r) \ - CAS_ANY_FENCE(TO_VOIDP(addr), TO_VOIDP(old_r), TO_VOIDP(r)) +# define COMPARE_AND_SWAP_PTR_SEQOK(addr, old_r, r) \ + COMPARE_AND_SWAP_PTR(TO_VOIDP(addr), TO_VOIDP(old_r), TO_VOIDP(r)) #else -# define CAS_ANY_FENCE_SEQOK(addr, old_r, r) \ +# define COMPARE_AND_SWAP_PTR_SEQOK(addr, old_r, r) \ (*(uptr *)TO_VOIDP(addr) = r, 1) #endif @@ -971,7 +971,7 @@ enum { while (1) { \ uptr old_r = *(uptr *)TO_VOIDP(addr); \ uptr r = old_r + regs[reg]; \ - if (CAS_ANY_FENCE_SEQOK(addr, old_r, r)) { \ + if (COMPARE_AND_SWAP_PTR_SEQOK(addr, old_r, r)) { \ flag = (r == 0); \ break; \ } \ @@ -986,7 +986,7 @@ enum { while (1) { \ uptr old_r = *(uptr *)TO_VOIDP(addr); \ uptr r = old_r + imm; \ - if (CAS_ANY_FENCE_SEQOK(addr, old_r, r)) { \ + if (COMPARE_AND_SWAP_PTR_SEQOK(addr, old_r, r)) { \ flag = (r == 0); \ break; \ } \ @@ -999,7 +999,7 @@ enum { # define do_pb_lock(dest) \ do { \ uptr *l = TO_VOIDP(regs[dest]); \ - flag = CAS_ANY_FENCE(l, TO_VOIDP(0), TO_VOIDP(1)); \ + flag = COMPARE_AND_SWAP_PTR(l, TO_VOIDP(0), TO_VOIDP(1)); \ } while (0) #else # define doi_pb_lock(instr) \ @@ -1023,7 +1023,7 @@ enum { uptr *l = TO_VOIDP(regs[dest]); \ uptr old = regs[reg1]; \ uptr new = regs[reg2]; \ - flag = CAS_ANY_FENCE(l, TO_VOIDP(old), TO_VOIDP(new)); \ + flag = COMPARE_AND_SWAP_PTR(l, TO_VOIDP(old), TO_VOIDP(new)); \ } while (0) #else #define doi_pb_cas(instr) \ diff --git a/c/schsig.c b/c/schsig.c index 046777306..b8290d5b5 100644 --- a/c/schsig.c +++ b/c/schsig.c @@ -568,7 +568,7 @@ static BOOL WINAPI handle_signal(DWORD dwCtrlType) { case CTRL_BREAK_EVENT: { #ifdef PTHREADS /* get_thread_context() always returns 0, so assume main thread */ - ptr tc = S_G.thread_context; + ptr tc = TO_PTR(S_G.thread_context); #else ptr tc = get_thread_context(); #endif diff --git a/c/types.h b/c/types.h index 594d61959..440bd68c9 100644 --- a/c/types.h +++ b/c/types.h @@ -449,12 +449,6 @@ typedef struct { # define END_IMPLICIT_ATOMIC() do { } while (0) #endif -#define S_cas_load_acquire_voidp(a, old, new) CAS_LOAD_ACQUIRE(a, old, new) -#define S_cas_store_release_voidp(a, old, new) CAS_STORE_RELEASE(a, old, new) -#define S_cas_load_acquire_ptr(a, old, new) CAS_LOAD_ACQUIRE(a, TO_VOIDP(old), TO_VOIDP(new)) -#define S_cas_store_release_ptr(a, old, new) CAS_STORE_RELEASE(a, TO_VOIDP(old), TO_VOIDP(new)) -#define S_store_release() RELEASE_FENCE() - #else #define get_thread_context() TO_PTR(S_G.thread_context) #define deactivate_thread(tc) {} @@ -465,11 +459,6 @@ typedef struct { #define alloc_mutex_release() do {} while (0) #define IS_TC_MUTEX_OWNER() 1 #define IS_ALLOC_MUTEX_OWNER() 1 -#define S_cas_load_acquire_voidp(a, old, new) (*(a) = new, 1) -#define S_cas_store_release_voidp(a, old, new) (*(a) = new, 1) -#define S_cas_load_acquire_ptr(a, old, new) (*(a) = new, 1) -#define S_cas_store_release_ptr(a, old, new) (*(a) = new, 1) -#define S_store_release() do { } while (0) #define BEGIN_IMPLICIT_ATOMIC() do { } while (0) #define END_IMPLICIT_ATOMIC() do { } while (0) #define AS_IMPLICIT_ATOMIC(T, X) X diff --git a/mats/build.zuo b/mats/build.zuo index d355d7d48..5331c7212 100644 --- a/mats/build.zuo +++ b/mats/build.zuo @@ -363,7 +363,7 @@ "cc"))) (cons "FTYPE_CFLAGS" (build-shell (or (lookup 'CPPFLAGS) "") - (if (glob-match? "*nt" m) + (if (eq? 'windows (system-type)) "-DWIN32" "") (or (lookup 'mdinclude) "") @@ -759,7 +759,7 @@ (string->shell (at-dir "../bin" m (~a "csv" (get-dll-version) ".lib"))))] - [(glob-match? "*nt" m) + [(eq? 'windows (system-type)) (config-merge config 'LDFLAGS (string->shell (at-dir "../boot" m "schemeexe.lib")))] [else config])])