From 34b63ba83542cad8675f875c9aa849653ead378d Mon Sep 17 00:00:00 2001 From: Siddhesh Poyarekar Date: Sun, 25 Oct 2020 20:51:22 +0530 Subject: [PATCH] Detect SSE4.2 support dynamically This is a port of the dynamic SSE4.2 detection feature from moonjit. This makes luajit2 builds portable since SSE4.2 string hash functions are now built separately and chosen at runtime based on whether the CPU supports it. This patch also includes work by Thomas Fransham in moonjit to support Windows builds. --- src/Makefile | 18 ++- src/lj_arch.h | 4 + src/lj_init.c | 69 ++++++++++++ src/lj_jit.h | 1 + src/lj_str.c | 25 +++-- src/lj_str.h | 12 ++ .../src/lj_str_hash_x64.h => lj_str_hash.c} | 106 +++++++++++------- src/ljamalg.c | 1 - src/x64/test/benchmark.cxx | 13 ++- src/x64/test/test.cpp | 10 +- 10 files changed, 195 insertions(+), 64 deletions(-) create mode 100644 src/lj_init.c rename src/{x64/src/lj_str_hash_x64.h => lj_str_hash.c} (76%) diff --git a/src/Makefile b/src/Makefile index dacb95bf4..c804f6bd9 100644 --- a/src/Makefile +++ b/src/Makefile @@ -507,10 +507,16 @@ LJCORE_O= lj_assert.o lj_gc.o lj_err.o lj_char.o lj_bc.o lj_obj.o lj_buf.o \ lj_ctype.o lj_cdata.o lj_cconv.o lj_ccall.o lj_ccallback.o \ lj_carith.o lj_clib.o lj_cparse.o \ lj_lib.o lj_alloc.o lib_aux.o \ - $(LJLIB_O) lib_init.o + $(LJLIB_O) lib_init.o lj_str_hash.o + +ifeq (x64,$(TARGET_LJARCH)) + lj_str_hash-CFLAGS = -msse4.2 +endif + +F_CFLAGS = $($(patsubst %.c,%-CFLAGS,$<)) LJVMCORE_O= $(LJVM_O) $(LJCORE_O) -LJVMCORE_DYNO= $(LJVMCORE_O:.o=_dyn.o) +LJVMCORE_DYNO= $(LJVMCORE_O:.o=_dyn.o) lj_init_dyn.o LIB_VMDEF= jit/vmdef.lua LIB_VMDEFP= $(LIB_VMDEF) @@ -532,7 +538,7 @@ ALL_RM= $(ALL_T) $(ALL_GEN) *.o host/*.o $(WIN_RM) ############################################################################## # Mixed mode defaults. -TARGET_O= $(LUAJIT_A) +TARGET_O= lj_init.o $(LUAJIT_A) TARGET_T= $(LUAJIT_T) $(LUAJIT_SO) TARGET_DEP= $(LIB_VMDEF) $(LUAJIT_SO) @@ -614,7 +620,7 @@ E= @echo default all: $(TARGET_T) amalg: - $(MAKE) all "LJCORE_O=ljamalg.o" + $(MAKE) all "LJCORE_O=ljamalg.o lj_str_hash.o" clean: $(HOST_RM) $(ALL_RM) @@ -691,8 +697,8 @@ lj_folddef.h: $(BUILDVM_T) lj_opt_fold.c %.o: %.c $(E) "CC $@" - $(Q)$(TARGET_DYNCC) $(TARGET_ACFLAGS) -c -o $(@:.o=_dyn.o) $< - $(Q)$(TARGET_CC) $(TARGET_ACFLAGS) -c -o $@ $< + $(Q)$(TARGET_DYNCC) $(TARGET_ACFLAGS) $(F_CFLAGS) -c -o $(@:.o=_dyn.o) $< + $(Q)$(TARGET_CC) $(TARGET_ACFLAGS) $(F_CFLAGS) -c -o $@ $< %.o: %.S $(E) "ASM $@" diff --git a/src/lj_arch.h b/src/lj_arch.h index b72b07283..327696760 100644 --- a/src/lj_arch.h +++ b/src/lj_arch.h @@ -209,6 +209,10 @@ #define LJ_TARGET_GC64 1 #endif +#ifdef __GNUC__ +#define LJ_HAS_OPTIMISED_HASH 1 +#endif + #elif LUAJIT_TARGET == LUAJIT_ARCH_ARM #define LJ_ARCH_NAME "arm" diff --git a/src/lj_init.c b/src/lj_init.c new file mode 100644 index 000000000..a6816e1e6 --- /dev/null +++ b/src/lj_init.c @@ -0,0 +1,69 @@ +#include +#include "lj_arch.h" +#include "lj_jit.h" +#include "lj_vm.h" +#include "lj_str.h" + +#if LJ_TARGET_ARM && LJ_TARGET_LINUX +#include +#endif + +#ifdef _MSC_VER +/* +** Append a function pointer to the static constructor table executed by +** the C runtime. +** Based on https://stackoverflow.com/questions/1113409/attribute-constructor-equivalent-in-vc +** see also https://docs.microsoft.com/en-us/cpp/c-runtime-library/crt-initialization. +*/ +#pragma section(".CRT$XCU",read) +#define LJ_INITIALIZER2_(f,p) \ + static void f(void); \ + __declspec(allocate(".CRT$XCU")) void (*f##_)(void) = f; \ + __pragma(comment(linker,"/include:" p #f "_")) \ + static void f(void) +#ifdef _WIN64 +#define LJ_INITIALIZER(f) LJ_INITIALIZER2_(f,"") +#else +#define LJ_INITIALIZER(f) LJ_INITIALIZER2_(f,"_") +#endif + +#else +#define LJ_INITIALIZER(f) static void __attribute__((constructor)) f(void) +#endif + + +#ifdef LJ_HAS_OPTIMISED_HASH +static void str_hash_init(uint32_t flags) +{ + if (flags & JIT_F_SSE4_2) + str_hash_init_sse42 (); +} + +/* CPU detection for interpreter features such as string hash function + selection. We choose to cherry-pick from lj_cpudetect and not have a single + initializer to make sure that merges with LuaJIT/LuaJIT remain + convenient. */ +LJ_INITIALIZER(lj_init_cpuflags) +{ + uint32_t flags = 0; +#if LJ_TARGET_X86ORX64 + + uint32_t vendor[4]; + uint32_t features[4]; + if (lj_vm_cpuid(0, vendor) && lj_vm_cpuid(1, features)) { + flags |= ((features[2] >> 0)&1) * JIT_F_SSE3; + flags |= ((features[2] >> 19)&1) * JIT_F_SSE4_1; + flags |= ((features[2] >> 20)&1) * JIT_F_SSE4_2; + if (vendor[0] >= 7) { + uint32_t xfeatures[4]; + lj_vm_cpuid(7, xfeatures); + flags |= ((xfeatures[1] >> 8)&1) * JIT_F_BMI2; + } + } + +#endif + + /* The reason why we initialized early: select our string hash functions. */ + str_hash_init (flags); +} +#endif diff --git a/src/lj_jit.h b/src/lj_jit.h index 0061c25d3..504ed96b3 100644 --- a/src/lj_jit.h +++ b/src/lj_jit.h @@ -22,6 +22,7 @@ #define JIT_F_SSE3 (JIT_F_CPU << 0) #define JIT_F_SSE4_1 (JIT_F_CPU << 1) #define JIT_F_BMI2 (JIT_F_CPU << 2) +#define JIT_F_SSE4_2 (JIT_F_CPU << 3) #define JIT_F_CPUSTRING "\4SSE3\6SSE4.1\4BMI2" diff --git a/src/lj_str.c b/src/lj_str.c index 9d6943ade..6a8378c2b 100644 --- a/src/lj_str.c +++ b/src/lj_str.c @@ -12,7 +12,6 @@ #include "lj_str.h" #include "lj_char.h" #include "lj_prng.h" -#include "x64/src/lj_str_hash_x64.h" /* -- String helpers ------------------------------------------------------ */ @@ -83,9 +82,22 @@ int lj_str_haspattern(GCstr *s) /* -- String hashing ------------------------------------------------------ */ -#ifndef ARCH_HASH_SPARSE +#ifdef LJ_HAS_OPTIMISED_HASH +static StrHash hash_sparse_def (uint64_t, const char *, MSize); +str_sparse_hashfn hash_sparse = hash_sparse_def; +#if LUAJIT_SECURITY_STRHASH +static StrHash hash_dense_def(uint64_t, StrHash, const char *, MSize); +str_dense_hashfn hash_dense = hash_dense_def; +#endif +#else +#define hash_sparse hash_sparse_def +#if LUAJIT_SECURITY_STRHASH +#define hash_dense hash_dense_def +#endif +#endif + /* Keyed sparse ARX string hash. Constant time. */ -static StrHash hash_sparse(uint64_t seed, const char *str, MSize len) +static StrHash hash_sparse_def(uint64_t seed, const char *str, MSize len) { /* Constants taken from lookup3 hash by Bob Jenkins. */ StrHash a, b, h = len ^ (StrHash)seed; @@ -106,12 +118,11 @@ static StrHash hash_sparse(uint64_t seed, const char *str, MSize len) h ^= b; h -= lj_rol(b, 16); return h; } -#endif -#if LUAJIT_SECURITY_STRHASH && !defined(ARCH_HASH_DENSE) +#if LUAJIT_SECURITY_STRHASH /* Keyed dense ARX string hash. Linear time. */ -static LJ_NOINLINE StrHash hash_dense(uint64_t seed, StrHash h, - const char *str, MSize len) +static LJ_NOINLINE StrHash hash_dense_def(uint64_t seed, StrHash h, + const char *str, MSize len) { StrHash b = lj_bswap(lj_rol(h ^ (StrHash)(seed >> 32), 4)); if (len > 12) { diff --git a/src/lj_str.h b/src/lj_str.h index 01c6ba6b5..c9bc28642 100644 --- a/src/lj_str.h +++ b/src/lj_str.h @@ -28,4 +28,16 @@ LJ_FUNC void LJ_FASTCALL lj_str_init(lua_State *L); #define lj_str_newlit(L, s) (lj_str_new(L, "" s, sizeof(s)-1)) #define lj_str_size(len) (sizeof(GCstr) + (((len)+4) & ~(MSize)3)) +#ifdef LJ_HAS_OPTIMISED_HASH +typedef StrHash (*str_sparse_hashfn) (uint64_t, const char *, MSize); +extern str_sparse_hashfn hash_sparse; + +#if LUAJIT_SECURITY_STRHASH +typedef StrHash (*str_dense_hashfn) (uint64_t, StrHash, const char *, MSize); +extern str_dense_hashfn hash_dense; +#endif + +extern void str_hash_init_sse42 (void); +#endif + #endif diff --git a/src/x64/src/lj_str_hash_x64.h b/src/lj_str_hash.c similarity index 76% rename from src/x64/src/lj_str_hash_x64.h rename to src/lj_str_hash.c index e6538953a..0ee4b5f6a 100644 --- a/src/x64/src/lj_str_hash_x64.h +++ b/src/lj_str_hash.c @@ -5,23 +5,48 @@ * to 128 bytes of given string. */ -#ifndef _LJ_STR_HASH_X64_H_ -#define _LJ_STR_HASH_X64_H_ - -#if defined(__SSE4_2__) && defined(__x86_64) && defined(__GNUC__) +#include "lj_arch.h" +#if LJ_HAS_OPTIMISED_HASH == 1 || defined(SMOKETEST) #include #include -#include #include #include -#include "../../lj_def.h" +#if defined(_MSC_VER) +#include +/* Silence deprecated name warning */ +#define getpid _getpid +#else +#include +#endif + +#include "lj_def.h" +#include "lj_str.h" +#include "lj_jit.h" + + +#if defined(_MSC_VER) +/* + * MSVC doesn't seem to restrict intrinsics used based on /arch: value set + * while clang-cl will error on it. + */ +#if defined(__clang__) && !defined(__SSE4_2__) +#error "This file must be built with /arch:AVX1 or higher" +#endif +#else +#if !defined(__SSE4_2__) +#error "This file must be built with -msse4.2" +#endif +#endif + +#define lj_crc32_u32 _mm_crc32_u32 +#define lj_crc32_u64 _mm_crc32_u64 #undef LJ_AINLINE #define LJ_AINLINE -#ifdef __MINGW32__ +#if defined(__MINGW32__) || defined(_MSC_VER) #define random() ((long) rand()) #define srandom(seed) srand(seed) #endif @@ -49,7 +74,7 @@ static LJ_AINLINE uint32_t hash_sparse_1_4(uint64_t seed, const char* str, v = (v << 8) | str[len >> 1]; v = (v << 8) | str[len - 1]; v = (v << 8) | len; - return _mm_crc32_u32(0, v); + return lj_crc32_u32(0, v); #else uint32_t a, b, h = len ^ seed; @@ -80,9 +105,9 @@ static LJ_AINLINE uint32_t hash_sparse_4_16(uint64_t seed, const char* str, v2 = *cast_uint32p(str + len - 4); } - h = _mm_crc32_u32(0, len ^ seed); - h = _mm_crc32_u64(h, v1); - h = _mm_crc32_u64(h, v2); + h = lj_crc32_u32(0, len ^ seed); + h = lj_crc32_u64(h, v1); + h = lj_crc32_u64(h, v2); return h; } @@ -93,18 +118,18 @@ static uint32_t hash_16_128(uint64_t seed, const char* str, uint64_t h1, h2; uint32_t i; - h1 = _mm_crc32_u32(0, len ^ seed); + h1 = lj_crc32_u32(0, len ^ seed); h2 = 0; for (i = 0; i < len - 16; i += 16) { - h1 += _mm_crc32_u64(h1, *cast_uint64p(str + i)); - h2 += _mm_crc32_u64(h2, *cast_uint64p(str + i + 8)); + h1 += lj_crc32_u64(h1, *cast_uint64p(str + i)); + h2 += lj_crc32_u64(h2, *cast_uint64p(str + i + 8)); }; - h1 = _mm_crc32_u64(h1, *cast_uint64p(str + len - 16)); - h2 = _mm_crc32_u64(h2, *cast_uint64p(str + len - 8)); + h1 = lj_crc32_u64(h1, *cast_uint64p(str + len - 16)); + h2 = lj_crc32_u64(h2, *cast_uint64p(str + len - 8)); - return _mm_crc32_u32(h1, h2); + return lj_crc32_u32(h1, h2); } /* ************************************************************************** @@ -147,7 +172,7 @@ static LJ_AINLINE uint32_t log2_floor(uint32_t n) /* This function is to populate `random_pos` such that random_pos[i][*] * contains random value in the range of [2**i, 2**(i+1)). */ -static void x64_init_random(void) +static void str_hash_init_random(void) { int i, seed, rml; @@ -158,8 +183,8 @@ static void x64_init_random(void) } /* Init seed */ - seed = _mm_crc32_u32(0, getpid()); - seed = _mm_crc32_u32(seed, time(NULL)); + seed = lj_crc32_u32(0, getpid()); + seed = lj_crc32_u32(seed, time(NULL)); srandom(seed); /* Now start to populate the random_pos[][]. */ @@ -188,11 +213,6 @@ static void x64_init_random(void) } #undef POW2_MASK -void __attribute__((constructor)) x64_init_random_constructor() -{ - x64_init_random(); -} - /* Return a pre-computed random number in the range of [1**chunk_sz_order, * 1**(chunk_sz_order+1)). It is "unsafe" in the sense that the return value * may be greater than chunk-size; it is up to the caller to make sure @@ -219,7 +239,7 @@ static LJ_NOINLINE uint32_t hash_128_above(uint64_t seed, const char* str, pos1 = get_random_pos_unsafe(chunk_sz_log2, 0); pos2 = get_random_pos_unsafe(chunk_sz_log2, 1); - h1 = _mm_crc32_u32(0, len ^ seed); + h1 = lj_crc32_u32(0, len ^ seed); h2 = 0; /* loop over 14 chunks, 2 chunks at a time */ @@ -227,29 +247,29 @@ static LJ_NOINLINE uint32_t hash_128_above(uint64_t seed, const char* str, chunk_ptr += chunk_sz, i++) { v = *cast_uint64p(chunk_ptr + pos1); - h1 = _mm_crc32_u64(h1, v); + h1 = lj_crc32_u64(h1, v); v = *cast_uint64p(chunk_ptr + chunk_sz + pos2); - h2 = _mm_crc32_u64(h2, v); + h2 = lj_crc32_u64(h2, v); } /* the last two chunks */ v = *cast_uint64p(chunk_ptr + pos1); - h1 = _mm_crc32_u64(h1, v); + h1 = lj_crc32_u64(h1, v); v = *cast_uint64p(chunk_ptr + chunk_sz - 8 - pos2); - h2 = _mm_crc32_u64(h2, v); + h2 = lj_crc32_u64(h2, v); /* process the trailing part */ - h1 = _mm_crc32_u64(h1, *cast_uint64p(str)); - h2 = _mm_crc32_u64(h2, *cast_uint64p(str + len - 8)); + h1 = lj_crc32_u64(h1, *cast_uint64p(str)); + h2 = lj_crc32_u64(h2, *cast_uint64p(str + len - 8)); - h1 = _mm_crc32_u32(h1, h2); + h1 = lj_crc32_u32(h1, h2); return h1; } /* NOTE: the "len" should not be zero */ -static uint32_t hash_sparse(uint64_t seed, const char* str, size_t len) +static StrHash hash_sparse_sse42(uint64_t seed, const char* str, MSize len) { if (len < 4 || len >= 128) return hash_sparse_1_4(seed, str, len); @@ -260,11 +280,10 @@ static uint32_t hash_sparse(uint64_t seed, const char* str, size_t len) /* [4, 16) */ return hash_sparse_4_16(seed, str, len); } -#define ARCH_HASH_SPARSE hash_sparse #if LUAJIT_SECURITY_STRHASH -static uint32_t hash_dense(uint64_t seed, uint32_t h, const char* str, - size_t len) +static StrHash hash_dense_sse42(uint64_t seed, uint32_t h, const char* str, + MSize len) { uint32_t b = lj_bswap(lj_rol(h ^ (uint32_t)(seed >> 32), 4)); @@ -277,11 +296,14 @@ static uint32_t hash_dense(uint64_t seed, uint32_t h, const char* str, /* Otherwise, do the slow crc32 randomization for long strings. */ return hash_128_above(b, str, len); } -#define ARCH_HASH_DENSE hash_dense #endif -#else -#undef ARCH_HASH_SPARSE -#undef ARCH_HASH_DENSE +void str_hash_init_sse42(void) +{ + hash_sparse = hash_sparse_sse42; +#if LUAJIT_SECURITY_STRHASH + hash_dense = hash_dense_sse42; +#endif + str_hash_init_random(); +} #endif -#endif /*_LJ_STR_HASH_X64_H_*/ diff --git a/src/ljamalg.c b/src/ljamalg.c index 56585e6dd..38c8ed608 100644 --- a/src/ljamalg.c +++ b/src/ljamalg.c @@ -86,4 +86,3 @@ #include "lib_jit.c" #include "lib_ffi.c" #include "lib_init.c" - diff --git a/src/x64/test/benchmark.cxx b/src/x64/test/benchmark.cxx index ee247c1ce..1ea8fb6bb 100644 --- a/src/x64/test/benchmark.cxx +++ b/src/x64/test/benchmark.cxx @@ -1,7 +1,10 @@ #include // for gettimeofday() extern "C" { #define LUAJIT_SECURITY_STRHASH 1 -#include "lj_str_hash_x64.h" +#include "../../lj_str.h" +str_sparse_hashfn hash_sparse; +str_dense_hashfn hash_dense; +#include "../../lj_str_hash.c" } #include #include @@ -97,7 +100,7 @@ struct TestFuncWasSparse struct TestFuncIsSparse { uint32_t operator()(uint64_t seed, const char* buf, uint32_t len) { - return hash_sparse(seed, buf, len); + return hash_sparse_sse42(seed, buf, len); } }; @@ -111,7 +114,7 @@ struct TestFuncWasDense struct TestFuncIsDense { uint32_t operator()(uint64_t seed, const char* buf, uint32_t len) { - return hash_dense(seed, 42, buf, len); + return hash_dense_sse42(seed, 42, buf, len); } }; @@ -268,9 +271,9 @@ benchmarkConflictHelper(uint64_t seed, uint32_t bucketNum, for (vector::const_iterator i = strs.begin(), e = strs.end(); i != e; ++i) { uint32_t h1 = original_hash_sparse(seed, i->c_str(), i->size()); - uint32_t h2 = hash_sparse(seed, i->c_str(), i->size()); + uint32_t h2 = hash_sparse_sse42(seed, i->c_str(), i->size()); uint32_t h3 = original_hash_dense(seed, h1, i->c_str(), i->size()); - uint32_t h4 = hash_dense(seed, h2, i->c_str(), i->size()); + uint32_t h4 = hash_dense_sse42(seed, h2, i->c_str(), i->size()); conflictWasSparse[h1 & mask]++; conflictIsSparse[h2 & mask]++; diff --git a/src/x64/test/test.cpp b/src/x64/test/test.cpp index 75f34e9fa..432c7bbbd 100644 --- a/src/x64/test/test.cpp +++ b/src/x64/test/test.cpp @@ -4,10 +4,14 @@ #include #define LUAJIT_SECURITY_STRHASH 1 #include "test_util.hpp" -#include "lj_str_hash_x64.h" +#include "../../lj_str.h" +str_sparse_hashfn hash_sparse; +str_dense_hashfn hash_dense; +#include "../../lj_str_hash.c" using namespace std; + static bool smoke_test() { @@ -24,9 +28,9 @@ smoke_test() 255, 256, 257}; for (unsigned i = 0; i < sizeof(lens)/sizeof(lens[0]); i++) { string s(buf, lens[i]); - uint32_t h = hash_sparse(rand(), s.c_str(), lens[i]); + uint32_t h = hash_sparse_sse42(rand(), s.c_str(), lens[i]); test_printf("%d", h); - test_printf("%d", hash_dense(rand(), h, s.c_str(), lens[i])); + test_printf("%d", hash_dense_sse42(rand(), h, s.c_str(), lens[i])); } return true;