From a83fb24e1cb0ec6b6fd53446c941013edf055192 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 1 Mar 2024 20:49:08 -0800 Subject: [PATCH 01/15] Drop support for very old versions of gcc and clang Drop support for gcc before v4.9 and clang before v3.9. This will allow simplifying some code. For example, we'll be able to assume that if the compiler is gcc or clang, then on x86 the target function attribute will work properly and intrinsics up to AVX2 will be supported. Document and start explicitly enforcing the minimum compiler versions. For MSVC, start enforcing a minimum of Visual Studio 2015. However, I believe this was already required due to the use of stdbool.h. --- README.md | 21 +++++++++++++++++++-- common_defs.h | 14 ++++++++++++++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 81cfc867..d7fa34cb 100644 --- a/README.md +++ b/README.md @@ -77,11 +77,28 @@ You should compile both `lib/*.c` and `lib/*/*.c`. You don't need to worry about excluding irrelevant architecture-specific code, as this is already handled in the source files themselves using `#ifdef`s. -It is strongly recommended to use either gcc or clang, and to use `-O2`. - If you are doing a freestanding build with `-ffreestanding`, you must add `-DFREESTANDING` as well (matching what the `CMakeLists.txt` does). +## Supported compilers + +- gcc: v4.9 and later +- clang: v3.9 and later (upstream), Xcode 8 and later (Apple) +- MSVC: Visual Studio 2015 and later +- Other compilers: any other C99-compatible compiler should work, though if your + compiler pretends to be gcc, clang, or MSVC, it needs to be sufficiently + compatible with the compiler it pretends to be. + +The above are the minimums, but using a newer compiler allows more of the +architecture-optimized code to be built. libdeflate is most heavily optimized +for gcc and clang, but MSVC is supported fairly well now too. + +The recommended optimization flag is `-O2`, and the `CMakeLists.txt` sets this +for release builds. `-O3` is fine too, but often `-O2` actually gives better +results. It's unnecessary to add flags such as `-mavx2` or `/arch:AVX2`, though +you can do so if you want to. Most of the relevant optimized functions are +built regardless of such flags, and appropriate ones are selected at runtime. + # API libdeflate has a simple API that is not zlib-compatible. You can create diff --git a/common_defs.h b/common_defs.h index 0a155371..6f82b321 100644 --- a/common_defs.h +++ b/common_defs.h @@ -135,6 +135,9 @@ typedef size_t machine_word_t; # define GCC_PREREQ(major, minor) \ (__GNUC__ > (major) || \ (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor))) +# if !GCC_PREREQ(4, 9) +# error "gcc versions older than 4.9 are no longer supported" +# endif #else # define GCC_PREREQ(major, minor) 0 #endif @@ -147,9 +150,20 @@ typedef size_t machine_word_t; (__clang_major__ > (major) || \ (__clang_major__ == (major) && __clang_minor__ >= (minor))) # endif +# if !CLANG_PREREQ(3, 9, 8000000) +# error "clang versions older than 3.9 are no longer supported" +# endif #else # define CLANG_PREREQ(major, minor, apple_version) 0 #endif +#ifdef _MSC_VER +# define MSVC_PREREQ(version) (_MSC_VER >= (version)) +# if !MSVC_PREREQ(1900) +# error "MSVC versions older than Visual Studio 2015 are no longer supported" +# endif +#else +# define MSVC_PREREQ(version) 0 +#endif /* * Macros to check for compiler support for attributes and builtins. clang From 92ccd11d53a55860ed98d09a06ac47b45b030431 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 1 Mar 2024 20:49:08 -0800 Subject: [PATCH 02/15] Get rid of COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE is being misused to check not just whether the target function attribute is supported, but also whether inline assembly is supported (for cpuid). Meanwhile, we no longer support gcc versions that lack support for the target function attribute, which simplifies things somewhat. Therefore, replace COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE with simply checking __GNUC__ || __clang__ directly. --- common_defs.h | 2 -- lib/arm/cpu_features.h | 2 +- lib/x86/cpu_features.h | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/common_defs.h b/common_defs.h index 6f82b321..75e0bc12 100644 --- a/common_defs.h +++ b/common_defs.h @@ -282,10 +282,8 @@ typedef size_t machine_word_t; */ #if GCC_PREREQ(4, 4) || __has_attribute(target) # define _target_attribute(attrs) __attribute__((target(attrs))) -# define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 1 #else # define _target_attribute(attrs) -# define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 0 #endif /* ========================================================================== */ diff --git a/lib/arm/cpu_features.h b/lib/arm/cpu_features.h index c55f007c..d1629c55 100644 --- a/lib/arm/cpu_features.h +++ b/lib/arm/cpu_features.h @@ -35,7 +35,7 @@ #if defined(ARCH_ARM32) || defined(ARCH_ARM64) #if !defined(FREESTANDING) && \ - (COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE || defined(_MSC_VER)) && \ + (defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)) && \ (defined(__linux__) || \ (defined(__APPLE__) && defined(ARCH_ARM64)) || \ (defined(_WIN32) && defined(ARCH_ARM64))) diff --git a/lib/x86/cpu_features.h b/lib/x86/cpu_features.h index 4e14f2a8..f613d8d9 100644 --- a/lib/x86/cpu_features.h +++ b/lib/x86/cpu_features.h @@ -34,7 +34,7 @@ #if defined(ARCH_X86_32) || defined(ARCH_X86_64) -#if COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE || defined(_MSC_VER) +#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) # undef HAVE_DYNAMIC_X86_CPU_FEATURES # define HAVE_DYNAMIC_X86_CPU_FEATURES 1 #endif From 237f79cb075563ab42f2a61bdc2700c302f74074 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 1 Mar 2024 20:49:08 -0800 Subject: [PATCH 03/15] common_defs.h: fix docs for __has_attribute and __has_builtin --- common_defs.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/common_defs.h b/common_defs.h index 75e0bc12..af429a2d 100644 --- a/common_defs.h +++ b/common_defs.h @@ -166,13 +166,19 @@ typedef size_t machine_word_t; #endif /* - * Macros to check for compiler support for attributes and builtins. clang - * implements these macros, but gcc doesn't, so generally any use of one of - * these macros must also be combined with a gcc version check. + * __has_attribute(attribute) - check whether the compiler supports the given + * attribute (and also supports doing the check in the first place). Mostly + * useful just for clang, since gcc didn't add this macro until gcc 5. */ #ifndef __has_attribute # define __has_attribute(attribute) 0 #endif + +/* + * __has_builtin(builtin) - check whether the compiler supports the given + * builtin (and also supports doing the check in the first place). Mostly + * useful just for clang, since gcc didn't add this macro until gcc 10. + */ #ifndef __has_builtin # define __has_builtin(builtin) 0 #endif From fa35680f7a0e22dae09a37af603a800ae3130074 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 1 Mar 2024 20:49:08 -0800 Subject: [PATCH 04/15] common_defs.h: simplify checks for very old gcc versions Since we no longer support gcc or clang versions that lack the target function attribute or bswap builtins, replace the corresponding gcc version checks with simply __GNUC__. (__GNUC__ means gcc-compatible, not gcc per se, but this is fine.) --- common_defs.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/common_defs.h b/common_defs.h index af429a2d..a3773c4e 100644 --- a/common_defs.h +++ b/common_defs.h @@ -286,7 +286,7 @@ typedef size_t machine_word_t; * code as well as the corresponding intrinsics. On other compilers this macro * expands to nothing, though MSVC allows intrinsics to be used anywhere anyway. */ -#if GCC_PREREQ(4, 4) || __has_attribute(target) +#if defined(__GNUC__) || __has_attribute(target) # define _target_attribute(attrs) __attribute__((target(attrs))) #else # define _target_attribute(attrs) @@ -334,7 +334,7 @@ static forceinline bool CPU_IS_LITTLE_ENDIAN(void) /* bswap16(v) - swap the bytes of a 16-bit integer */ static forceinline u16 bswap16(u16 v) { -#if GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16) +#if defined(__GNUC__) || __has_builtin(__builtin_bswap16) return __builtin_bswap16(v); #elif defined(_MSC_VER) return _byteswap_ushort(v); @@ -346,7 +346,7 @@ static forceinline u16 bswap16(u16 v) /* bswap32(v) - swap the bytes of a 32-bit integer */ static forceinline u32 bswap32(u32 v) { -#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap32) +#if defined(__GNUC__) || __has_builtin(__builtin_bswap32) return __builtin_bswap32(v); #elif defined(_MSC_VER) return _byteswap_ulong(v); @@ -361,7 +361,7 @@ static forceinline u32 bswap32(u32 v) /* bswap64(v) - swap the bytes of a 64-bit integer */ static forceinline u64 bswap64(u64 v) { -#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap64) +#if defined(__GNUC__) || __has_builtin(__builtin_bswap64) return __builtin_bswap64(v); #elif defined(_MSC_VER) return _byteswap_uint64(v); From 243a05fb153407425d3394b24403cbb92d8d43f5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 1 Mar 2024 20:49:08 -0800 Subject: [PATCH 05/15] lib/arm: simplify checks for very old clang versions Since we now only support clang 3.9 and later, the checks for clang 3.4 and clang 3.5 can simply use defined(__clang__). --- lib/arm/cpu_features.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/lib/arm/cpu_features.h b/lib/arm/cpu_features.h index d1629c55..4fa6e43a 100644 --- a/lib/arm/cpu_features.h +++ b/lib/arm/cpu_features.h @@ -98,8 +98,7 @@ static inline u32 get_arm_cpu_features(void) { return 0; } #if HAVE_PMULL_NATIVE || \ (HAVE_DYNAMIC_ARM_CPU_FEATURES && \ HAVE_NEON_INTRIN /* needed to exclude soft float arm32 case */ && \ - (GCC_PREREQ(6, 1) || CLANG_PREREQ(3, 5, 6010000) || \ - defined(_MSC_VER)) && \ + (GCC_PREREQ(6, 1) || defined(__clang__) || defined(_MSC_VER)) && \ /* * On arm32 with clang, the crypto intrinsics (which include pmull) * are not defined, even when using -mfpu=crypto-neon-fp-armv8, @@ -179,9 +178,7 @@ static inline u32 get_arm_cpu_features(void) { return 0; } !defined(__ARM_ARCH_7EM__) # define HAVE_CRC32_INTRIN 1 # endif -# elif CLANG_PREREQ(3, 4, 6000000) -# define HAVE_CRC32_INTRIN 1 -# elif defined(_MSC_VER) +# elif defined(__clang__) || defined(_MSC_VER) # define HAVE_CRC32_INTRIN 1 # endif #endif From 189a997b481f291eec5343c0aa5d5ac508c42d94 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 1 Mar 2024 20:49:08 -0800 Subject: [PATCH 06/15] lib/x86: centralize the intrinsic header inclusions On all the compilers that we support using x86 intrinsics with, we no longer support very old versions that don't have immintrin.h. Therefore, just include immintrin.h from lib/x86/cpu_features.h, making it available to all code in lib/x86/. Handle the workaround for clang in MSVC compatibility mode in the same place. --- lib/x86/adler32_impl.h | 38 --------------------------------- lib/x86/cpu_features.h | 27 +++++++++++++++++++++++ lib/x86/crc32_pclmul_template.h | 16 -------------- lib/x86/decompress_impl.h | 1 - lib/x86/matchfinder_impl.h | 2 -- 5 files changed, 27 insertions(+), 57 deletions(-) diff --git a/lib/x86/adler32_impl.h b/lib/x86/adler32_impl.h index 618c30cc..564d6826 100644 --- a/lib/x86/adler32_impl.h +++ b/lib/x86/adler32_impl.h @@ -122,7 +122,6 @@ # else # define ATTRIBUTES _target_attribute("sse2") # endif -# include static forceinline ATTRIBUTES void adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2) { @@ -220,15 +219,6 @@ adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2) # else # define ATTRIBUTES _target_attribute("avx2") # endif -# include - /* - * With clang in MSVC compatibility mode, immintrin.h incorrectly skips - * including some sub-headers. - */ -# if defined(__clang__) && defined(_MSC_VER) -# include -# include -# endif static forceinline ATTRIBUTES void adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2) { @@ -301,19 +291,6 @@ adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2) # else # define ATTRIBUTES _target_attribute("avx2,avxvnni") # endif -# include - /* - * With clang in MSVC compatibility mode, immintrin.h incorrectly skips - * including some sub-headers. - */ -# if defined(__clang__) && defined(_MSC_VER) -# include -# include -# include -# include -# include -# include -# endif static forceinline ATTRIBUTES void adler32_avx2_vnni_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2) @@ -390,21 +367,6 @@ adler32_avx2_vnni_chunk(const __m256i *p, const __m256i *const end, # else # define ATTRIBUTES _target_attribute("avx512bw,avx512vnni") # endif -# include - /* - * With clang in MSVC compatibility mode, immintrin.h incorrectly skips - * including some sub-headers. - */ -# if defined(__clang__) && defined(_MSC_VER) -# include -# include -# include -# include -# include -# include -# include -# include -# endif static forceinline ATTRIBUTES void adler32_avx512_vnni_chunk(const __m512i *p, const __m512i *const end, u32 *s1, u32 *s2) diff --git a/lib/x86/cpu_features.h b/lib/x86/cpu_features.h index f613d8d9..d6b84975 100644 --- a/lib/x86/cpu_features.h +++ b/lib/x86/cpu_features.h @@ -253,6 +253,33 @@ static inline u32 get_x86_cpu_features(void) { return 0; } # define HAVE_AVXVNNI_INTRIN 0 #endif +#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) +# include +#endif +#if defined(_MSC_VER) && defined(__clang__) + /* + * With clang in MSVC compatibility mode, immintrin.h incorrectly skips + * including some sub-headers. + */ +# include +# include +# include +# include +# include +# include +# include +# include +# if __has_include() +# include +# endif +# if __has_include() +# include +# endif +# if __has_include() +# include +# endif +#endif + #endif /* ARCH_X86_32 || ARCH_X86_64 */ #endif /* LIB_X86_CPU_FEATURES_H */ diff --git a/lib/x86/crc32_pclmul_template.h b/lib/x86/crc32_pclmul_template.h index eb06262b..4257d449 100644 --- a/lib/x86/crc32_pclmul_template.h +++ b/lib/x86/crc32_pclmul_template.h @@ -62,22 +62,6 @@ * or AVX512VL, or four in combination with AVX512F. */ -#include -/* - * With clang in MSVC compatibility mode, immintrin.h incorrectly skips - * including some sub-headers. - */ -#if defined(__clang__) && defined(_MSC_VER) -# include -# include -# include -# include -# include -# include -# include -# include -#endif - #undef fold_vec128 static forceinline ATTRIBUTES __m128i ADD_SUFFIX(fold_vec128)(__m128i src, __m128i dst, __m128i multipliers) diff --git a/lib/x86/decompress_impl.h b/lib/x86/decompress_impl.h index 3e2ec37e..a52305fa 100644 --- a/lib/x86/decompress_impl.h +++ b/lib/x86/decompress_impl.h @@ -24,7 +24,6 @@ * as the bzhi instruction truncates the count to 8 bits implicitly. */ # ifndef __clang__ -# include # ifdef ARCH_X86_64 # define EXTRACT_VARBITS(word, count) _bzhi_u64((word), (count)) # define EXTRACT_VARBITS8(word, count) _bzhi_u64((word), (count)) diff --git a/lib/x86/matchfinder_impl.h b/lib/x86/matchfinder_impl.h index 8433b9b1..21b463a7 100644 --- a/lib/x86/matchfinder_impl.h +++ b/lib/x86/matchfinder_impl.h @@ -31,7 +31,6 @@ #include "cpu_features.h" #if HAVE_AVX2_NATIVE -# include static forceinline void matchfinder_init_avx2(mf_pos_t *data, size_t size) { @@ -76,7 +75,6 @@ matchfinder_rebase_avx2(mf_pos_t *data, size_t size) #define matchfinder_rebase matchfinder_rebase_avx2 #elif HAVE_SSE2_NATIVE -# include static forceinline void matchfinder_init_sse2(mf_pos_t *data, size_t size) { From 44a6510c0d94bd4737253eada712402033cec01e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 1 Mar 2024 20:49:08 -0800 Subject: [PATCH 07/15] lib/x86: remove a workaround for very old gcc versions The bug that the EBX_CONSTRAINT hack was working around was fixed in gcc 4.6, and our minimum gcc version is now 4.9. --- lib/x86/cpu_features.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/lib/x86/cpu_features.c b/lib/x86/cpu_features.c index 227fd221..ebe5c569 100644 --- a/lib/x86/cpu_features.c +++ b/lib/x86/cpu_features.c @@ -30,16 +30,6 @@ #if HAVE_DYNAMIC_X86_CPU_FEATURES -/* - * With old GCC versions we have to manually save and restore the x86_32 PIC - * register (ebx). See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602 - */ -#if defined(ARCH_X86_32) && defined(__PIC__) -# define EBX_CONSTRAINT "=&r" -#else -# define EBX_CONSTRAINT "=b" -#endif - /* Execute the CPUID instruction. */ static inline void cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d) @@ -56,7 +46,7 @@ cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d) __asm__ volatile(".ifnc %%ebx, %1; mov %%ebx, %1; .endif\n" "cpuid \n" ".ifnc %%ebx, %1; xchg %%ebx, %1; .endif\n" - : "=a" (*a), EBX_CONSTRAINT (*b), "=c" (*c), "=d" (*d) + : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d) : "a" (leaf), "c" (subleaf)); #endif } From 2c850ba7eff1feebd5517213464fea4d6a0bb3b6 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 1 Mar 2024 20:49:08 -0800 Subject: [PATCH 08/15] lib/x86: get rid of HAVE_TARGET_INTRINSICS Since we no longer support the very old versions of gcc or clang where x86 intrinsics don't work properly, the HAVE_TARGET_INTRINSICS macro is always true when the compiler is gcc, clang, or MSVC. As a result, it's now redundant with other conditions checked, except for HAVE_SSE2_INTRIN. Just remove it. --- lib/x86/cpu_features.h | 40 ++++++++++++++-------------------------- 1 file changed, 14 insertions(+), 26 deletions(-) diff --git a/lib/x86/cpu_features.h b/lib/x86/cpu_features.h index d6b84975..91ee2ccd 100644 --- a/lib/x86/cpu_features.h +++ b/lib/x86/cpu_features.h @@ -86,19 +86,6 @@ static inline u32 get_x86_cpu_features(void) static inline u32 get_x86_cpu_features(void) { return 0; } #endif /* !HAVE_DYNAMIC_X86_CPU_FEATURES */ -/* - * Prior to gcc 4.9 (r200349) and clang 3.8 (r239883), x86 intrinsics not - * available in the main target couldn't be used in 'target' attribute - * functions. Unfortunately clang has no feature test macro for this, so we - * have to check its version. - */ -#if HAVE_DYNAMIC_X86_CPU_FEATURES && \ - (GCC_PREREQ(4, 9) || CLANG_PREREQ(3, 8, 7030000) || defined(_MSC_VER)) -# define HAVE_TARGET_INTRINSICS 1 -#else -# define HAVE_TARGET_INTRINSICS 0 -#endif - /* SSE2 */ #if defined(__SSE2__) || \ (defined(_MSC_VER) && \ @@ -107,7 +94,12 @@ static inline u32 get_x86_cpu_features(void) { return 0; } #else # define HAVE_SSE2_NATIVE 0 #endif -#define HAVE_SSE2_INTRIN (HAVE_SSE2_NATIVE || HAVE_TARGET_INTRINSICS) +#if HAVE_SSE2_NATIVE || defined(__GNUC__) || defined(__clang__) || \ + defined(_MSC_VER) +# define HAVE_SSE2_INTRIN 1 +#else +# define HAVE_SSE2_INTRIN 0 +#endif /* PCLMULQDQ */ #if defined(__PCLMUL__) || (defined(_MSC_VER) && defined(__AVX2__)) @@ -115,9 +107,8 @@ static inline u32 get_x86_cpu_features(void) { return 0; } #else # define HAVE_PCLMULQDQ_NATIVE 0 #endif -#if HAVE_PCLMULQDQ_NATIVE || (HAVE_TARGET_INTRINSICS && \ - (GCC_PREREQ(4, 4) || CLANG_PREREQ(3, 2, 0) || \ - defined(_MSC_VER))) +#if HAVE_PCLMULQDQ_NATIVE || GCC_PREREQ(4, 4) || CLANG_PREREQ(3, 2, 0) || \ + defined(_MSC_VER) # define HAVE_PCLMULQDQ_INTRIN 1 #else # define HAVE_PCLMULQDQ_INTRIN 0 @@ -129,9 +120,8 @@ static inline u32 get_x86_cpu_features(void) { return 0; } #else # define HAVE_AVX_NATIVE 0 #endif -#if HAVE_AVX_NATIVE || (HAVE_TARGET_INTRINSICS && \ - (GCC_PREREQ(4, 6) || CLANG_PREREQ(3, 0, 0) || \ - defined(_MSC_VER))) +#if HAVE_AVX_NATIVE || GCC_PREREQ(4, 6) || CLANG_PREREQ(3, 0, 0) || \ + defined(_MSC_VER) # define HAVE_AVX_INTRIN 1 #else # define HAVE_AVX_INTRIN 0 @@ -143,9 +133,8 @@ static inline u32 get_x86_cpu_features(void) { return 0; } #else # define HAVE_AVX2_NATIVE 0 #endif -#if HAVE_AVX2_NATIVE || (HAVE_TARGET_INTRINSICS && \ - (GCC_PREREQ(4, 7) || CLANG_PREREQ(3, 1, 0) || \ - defined(_MSC_VER))) +#if HAVE_AVX2_NATIVE || GCC_PREREQ(4, 7) || CLANG_PREREQ(3, 1, 0) || \ + defined(_MSC_VER) # define HAVE_AVX2_INTRIN 1 #else # define HAVE_AVX2_INTRIN 0 @@ -157,9 +146,8 @@ static inline u32 get_x86_cpu_features(void) { return 0; } #else # define HAVE_BMI2_NATIVE 0 #endif -#if HAVE_BMI2_NATIVE || (HAVE_TARGET_INTRINSICS && \ - (GCC_PREREQ(4, 7) || CLANG_PREREQ(3, 1, 0) || \ - defined(_MSC_VER))) +#if HAVE_BMI2_NATIVE || GCC_PREREQ(4, 7) || CLANG_PREREQ(3, 1, 0) || \ + defined(_MSC_VER) # define HAVE_BMI2_INTRIN 1 #else # define HAVE_BMI2_INTRIN 0 From 2de290dc69d1f4a67c5b691dc310709e604254de Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 1 Mar 2024 20:49:08 -0800 Subject: [PATCH 09/15] lib/x86: simplify checks for very old gcc and clang versions Since we've bumped up the minimum gcc and clang versions to 4.9 and 3.9 respectively, when checking for support for intrinsics that were already present in those versions simply use __GNUC__ || __clang__. (__GNUC__ means gcc-compatible, not gcc per se, but this is fine.) --- lib/x86/cpu_features.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/lib/x86/cpu_features.h b/lib/x86/cpu_features.h index 91ee2ccd..0ddb2334 100644 --- a/lib/x86/cpu_features.h +++ b/lib/x86/cpu_features.h @@ -107,7 +107,7 @@ static inline u32 get_x86_cpu_features(void) { return 0; } #else # define HAVE_PCLMULQDQ_NATIVE 0 #endif -#if HAVE_PCLMULQDQ_NATIVE || GCC_PREREQ(4, 4) || CLANG_PREREQ(3, 2, 0) || \ +#if HAVE_PCLMULQDQ_NATIVE || defined(__GNUC__) || defined(__clang__) || \ defined(_MSC_VER) # define HAVE_PCLMULQDQ_INTRIN 1 #else @@ -120,7 +120,7 @@ static inline u32 get_x86_cpu_features(void) { return 0; } #else # define HAVE_AVX_NATIVE 0 #endif -#if HAVE_AVX_NATIVE || GCC_PREREQ(4, 6) || CLANG_PREREQ(3, 0, 0) || \ +#if HAVE_AVX_NATIVE || defined(__GNUC__) || defined(__clang__) || \ defined(_MSC_VER) # define HAVE_AVX_INTRIN 1 #else @@ -133,7 +133,7 @@ static inline u32 get_x86_cpu_features(void) { return 0; } #else # define HAVE_AVX2_NATIVE 0 #endif -#if HAVE_AVX2_NATIVE || GCC_PREREQ(4, 7) || CLANG_PREREQ(3, 1, 0) || \ +#if HAVE_AVX2_NATIVE || defined(__GNUC__) || defined(__clang__) || \ defined(_MSC_VER) # define HAVE_AVX2_INTRIN 1 #else @@ -146,7 +146,7 @@ static inline u32 get_x86_cpu_features(void) { return 0; } #else # define HAVE_BMI2_NATIVE 0 #endif -#if HAVE_BMI2_NATIVE || GCC_PREREQ(4, 7) || CLANG_PREREQ(3, 1, 0) || \ +#if HAVE_BMI2_NATIVE || defined(__GNUC__) || defined(__clang__) || \ defined(_MSC_VER) # define HAVE_BMI2_INTRIN 1 #else @@ -169,7 +169,7 @@ static inline u32 get_x86_cpu_features(void) { return 0; } #else # define HAVE_AVX512F_NATIVE 0 #endif -#if HAVE_AVX512F_NATIVE || GCC_PREREQ(5, 1) || CLANG_PREREQ(3, 8, 0) || \ +#if HAVE_AVX512F_NATIVE || GCC_PREREQ(5, 1) || defined(__clang__) || \ defined(_MSC_VER) # define HAVE_AVX512F_INTRIN 1 #else @@ -182,7 +182,7 @@ static inline u32 get_x86_cpu_features(void) { return 0; } #else # define HAVE_AVX512BW_NATIVE 0 #endif -#if HAVE_AVX512BW_NATIVE || GCC_PREREQ(5, 1) || CLANG_PREREQ(3, 9, 0) || \ +#if HAVE_AVX512BW_NATIVE || GCC_PREREQ(5, 1) || defined(__clang__) || \ defined(_MSC_VER) # define HAVE_AVX512BW_INTRIN 1 #else @@ -195,7 +195,7 @@ static inline u32 get_x86_cpu_features(void) { return 0; } #else # define HAVE_AVX512VL_NATIVE 0 #endif -#if HAVE_AVX512VL_NATIVE || GCC_PREREQ(5, 1) || CLANG_PREREQ(3, 8, 0) || \ +#if HAVE_AVX512VL_NATIVE || GCC_PREREQ(5, 1) || defined(__clang__) || \ defined(_MSC_VER) # define HAVE_AVX512VL_INTRIN 1 #else From 0c6b4cda05a1446c65e5acd269afffb62ef196d8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 1 Mar 2024 20:49:08 -0800 Subject: [PATCH 10/15] lib/x86: simplify by not forcing *_INTRIN on when *_NATIVE It's probably safe to assume that if e.g. __AVX2__ is defined then the AVX2 intrinsics are available. However, it doesn't seem worthwhile for the code to check for this because there's always already a compiler version check for using the intrinsics which is sufficient by itself. --- lib/x86/cpu_features.h | 33 +++++++++++---------------------- 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/lib/x86/cpu_features.h b/lib/x86/cpu_features.h index 0ddb2334..d19c8d34 100644 --- a/lib/x86/cpu_features.h +++ b/lib/x86/cpu_features.h @@ -94,8 +94,7 @@ static inline u32 get_x86_cpu_features(void) { return 0; } #else # define HAVE_SSE2_NATIVE 0 #endif -#if HAVE_SSE2_NATIVE || defined(__GNUC__) || defined(__clang__) || \ - defined(_MSC_VER) +#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) # define HAVE_SSE2_INTRIN 1 #else # define HAVE_SSE2_INTRIN 0 @@ -107,8 +106,7 @@ static inline u32 get_x86_cpu_features(void) { return 0; } #else # define HAVE_PCLMULQDQ_NATIVE 0 #endif -#if HAVE_PCLMULQDQ_NATIVE || defined(__GNUC__) || defined(__clang__) || \ - defined(_MSC_VER) +#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) # define HAVE_PCLMULQDQ_INTRIN 1 #else # define HAVE_PCLMULQDQ_INTRIN 0 @@ -120,8 +118,7 @@ static inline u32 get_x86_cpu_features(void) { return 0; } #else # define HAVE_AVX_NATIVE 0 #endif -#if HAVE_AVX_NATIVE || defined(__GNUC__) || defined(__clang__) || \ - defined(_MSC_VER) +#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) # define HAVE_AVX_INTRIN 1 #else # define HAVE_AVX_INTRIN 0 @@ -133,8 +130,7 @@ static inline u32 get_x86_cpu_features(void) { return 0; } #else # define HAVE_AVX2_NATIVE 0 #endif -#if HAVE_AVX2_NATIVE || defined(__GNUC__) || defined(__clang__) || \ - defined(_MSC_VER) +#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) # define HAVE_AVX2_INTRIN 1 #else # define HAVE_AVX2_INTRIN 0 @@ -146,8 +142,7 @@ static inline u32 get_x86_cpu_features(void) { return 0; } #else # define HAVE_BMI2_NATIVE 0 #endif -#if HAVE_BMI2_NATIVE || defined(__GNUC__) || defined(__clang__) || \ - defined(_MSC_VER) +#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) # define HAVE_BMI2_INTRIN 1 #else # define HAVE_BMI2_INTRIN 0 @@ -169,8 +164,7 @@ static inline u32 get_x86_cpu_features(void) { return 0; } #else # define HAVE_AVX512F_NATIVE 0 #endif -#if HAVE_AVX512F_NATIVE || GCC_PREREQ(5, 1) || defined(__clang__) || \ - defined(_MSC_VER) +#if GCC_PREREQ(5, 1) || defined(__clang__) || defined(_MSC_VER) # define HAVE_AVX512F_INTRIN 1 #else # define HAVE_AVX512F_INTRIN 0 @@ -182,8 +176,7 @@ static inline u32 get_x86_cpu_features(void) { return 0; } #else # define HAVE_AVX512BW_NATIVE 0 #endif -#if HAVE_AVX512BW_NATIVE || GCC_PREREQ(5, 1) || defined(__clang__) || \ - defined(_MSC_VER) +#if GCC_PREREQ(5, 1) || defined(__clang__) || defined(_MSC_VER) # define HAVE_AVX512BW_INTRIN 1 #else # define HAVE_AVX512BW_INTRIN 0 @@ -195,8 +188,7 @@ static inline u32 get_x86_cpu_features(void) { return 0; } #else # define HAVE_AVX512VL_NATIVE 0 #endif -#if HAVE_AVX512VL_NATIVE || GCC_PREREQ(5, 1) || defined(__clang__) || \ - defined(_MSC_VER) +#if GCC_PREREQ(5, 1) || defined(__clang__) || defined(_MSC_VER) # define HAVE_AVX512VL_INTRIN 1 #else # define HAVE_AVX512VL_INTRIN 0 @@ -208,8 +200,7 @@ static inline u32 get_x86_cpu_features(void) { return 0; } #else # define HAVE_VPCLMULQDQ_NATIVE 0 #endif -#if HAVE_VPCLMULQDQ_NATIVE || GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0) || \ - defined(_MSC_VER) +#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0) || defined(_MSC_VER) # define HAVE_VPCLMULQDQ_INTRIN 1 #else # define HAVE_VPCLMULQDQ_INTRIN 0 @@ -221,8 +212,7 @@ static inline u32 get_x86_cpu_features(void) { return 0; } #else # define HAVE_AVX512VNNI_NATIVE 0 #endif -#if HAVE_AVX512VNNI_NATIVE || GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0) || \ - defined(_MSC_VER) +#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0) || defined(_MSC_VER) # define HAVE_AVX512VNNI_INTRIN 1 #else # define HAVE_AVX512VNNI_INTRIN 0 @@ -234,8 +224,7 @@ static inline u32 get_x86_cpu_features(void) { return 0; } #else # define HAVE_AVXVNNI_NATIVE 0 #endif -#if HAVE_AVXVNNI_NATIVE || GCC_PREREQ(11, 1) || CLANG_PREREQ(12, 0, 0) || \ - defined(_MSC_VER) +#if GCC_PREREQ(11, 1) || CLANG_PREREQ(12, 0, 0) || defined(_MSC_VER) # define HAVE_AVXVNNI_INTRIN 1 #else # define HAVE_AVXVNNI_INTRIN 0 From c67ec2619b25fe6885511905b0c67c853001ea49 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 1 Mar 2024 20:49:08 -0800 Subject: [PATCH 11/15] lib/x86: simplify by not trying to skip target attributes It's true that if e.g. __AVX2__ is defined then it's unnecessary to use __attribute__((target("avx2"))) on functions that use AVX-2 intrinsics. But it seems to be harmless to add it. All the target options we use on x86 seem to work additively, so at worst it's a no-op. We also no longer support any gcc or clang versions that don't support the target function attribute. Therefore, simplify the code by not skipping using the target function attribute when all the options are already enabled in the main target. Note that this change has no effect on MSVC builds, as MSVC doesn't support the target function attribute anyway. --- lib/x86/adler32_impl.h | 24 ++++-------------------- lib/x86/crc32_impl.h | 32 +++++--------------------------- lib/x86/decompress_impl.h | 4 +--- 3 files changed, 10 insertions(+), 50 deletions(-) diff --git a/lib/x86/adler32_impl.h b/lib/x86/adler32_impl.h index 564d6826..6aae8b8b 100644 --- a/lib/x86/adler32_impl.h +++ b/lib/x86/adler32_impl.h @@ -117,11 +117,7 @@ * would behave incorrectly. */ # define IMPL_MAX_CHUNK_LEN (32 * (0x7FFF / 0xFF)) -# if HAVE_SSE2_NATIVE -# define ATTRIBUTES -# else -# define ATTRIBUTES _target_attribute("sse2") -# endif +# define ATTRIBUTES _target_attribute("sse2") static forceinline ATTRIBUTES void adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2) { @@ -214,11 +210,7 @@ adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2) # define IMPL_ALIGNMENT 32 # define IMPL_SEGMENT_LEN 64 # define IMPL_MAX_CHUNK_LEN (64 * (0x7FFF / 0xFF)) -# if HAVE_AVX2_NATIVE -# define ATTRIBUTES -# else -# define ATTRIBUTES _target_attribute("avx2") -# endif +# define ATTRIBUTES _target_attribute("avx2") static forceinline ATTRIBUTES void adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2) { @@ -286,11 +278,7 @@ adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2) # define IMPL_ALIGNMENT 32 # define IMPL_SEGMENT_LEN 128 # define IMPL_MAX_CHUNK_LEN MAX_CHUNK_LEN -# if HAVE_AVX2_NATIVE && HAVE_AVXVNNI_NATIVE -# define ATTRIBUTES -# else -# define ATTRIBUTES _target_attribute("avx2,avxvnni") -# endif +# define ATTRIBUTES _target_attribute("avx2,avxvnni") static forceinline ATTRIBUTES void adler32_avx2_vnni_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2) @@ -362,11 +350,7 @@ adler32_avx2_vnni_chunk(const __m256i *p, const __m256i *const end, # define IMPL_ALIGNMENT 64 # define IMPL_SEGMENT_LEN 128 # define IMPL_MAX_CHUNK_LEN MAX_CHUNK_LEN -# if HAVE_AVX512BW_NATIVE && HAVE_AVX512VNNI_NATIVE -# define ATTRIBUTES -# else -# define ATTRIBUTES _target_attribute("avx512bw,avx512vnni") -# endif +# define ATTRIBUTES _target_attribute("avx512bw,avx512vnni") static forceinline ATTRIBUTES void adler32_avx512_vnni_chunk(const __m512i *p, const __m512i *const end, u32 *s1, u32 *s2) diff --git a/lib/x86/crc32_impl.h b/lib/x86/crc32_impl.h index d50547bd..baee423a 100644 --- a/lib/x86/crc32_impl.h +++ b/lib/x86/crc32_impl.h @@ -34,11 +34,7 @@ #if HAVE_PCLMULQDQ_INTRIN # define crc32_x86_pclmulqdq crc32_x86_pclmulqdq # define SUFFIX _pclmulqdq -# if HAVE_PCLMULQDQ_NATIVE -# define ATTRIBUTES -# else -# define ATTRIBUTES _target_attribute("pclmul") -# endif +# define ATTRIBUTES _target_attribute("pclmul") # define VL 16 # define FOLD_LESSTHAN16BYTES 0 # define USE_TERNARYLOGIC 0 @@ -62,11 +58,7 @@ #if HAVE_PCLMULQDQ_INTRIN && HAVE_AVX_INTRIN # define crc32_x86_pclmulqdq_avx crc32_x86_pclmulqdq_avx # define SUFFIX _pclmulqdq_avx -# if HAVE_PCLMULQDQ_NATIVE && HAVE_AVX_NATIVE -# define ATTRIBUTES -# else -# define ATTRIBUTES _target_attribute("pclmul,avx") -# endif +# define ATTRIBUTES _target_attribute("pclmul,avx") # define VL 16 # define FOLD_LESSTHAN16BYTES 1 # define USE_TERNARYLOGIC 0 @@ -83,11 +75,7 @@ !(defined(_MSC_VER) && !defined(__clang__)) # define crc32_x86_vpclmulqdq_avx2 crc32_x86_vpclmulqdq_avx2 # define SUFFIX _vpclmulqdq_avx2 -# if HAVE_VPCLMULQDQ_NATIVE && HAVE_PCLMULQDQ_NATIVE && HAVE_AVX2_NATIVE -# define ATTRIBUTES -# else -# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx2") -# endif +# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx2") # define VL 32 # define FOLD_LESSTHAN16BYTES 1 # define USE_TERNARYLOGIC 0 @@ -103,12 +91,7 @@ HAVE_AVX512F_INTRIN && HAVE_AVX512VL_INTRIN # define crc32_x86_vpclmulqdq_avx512_vl256 crc32_x86_vpclmulqdq_avx512_vl256 # define SUFFIX _vpclmulqdq_avx512_vl256 -# if HAVE_VPCLMULQDQ_NATIVE && HAVE_PCLMULQDQ_NATIVE && \ - HAVE_AVX512F_NATIVE && HAVE_AVX512VL_NATIVE -# define ATTRIBUTES -# else -# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl") -# endif +# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl") # define VL 32 # define FOLD_LESSTHAN16BYTES 1 # define USE_TERNARYLOGIC 1 @@ -120,12 +103,7 @@ HAVE_AVX512F_INTRIN && HAVE_AVX512VL_INTRIN # define crc32_x86_vpclmulqdq_avx512_vl512 crc32_x86_vpclmulqdq_avx512_vl512 # define SUFFIX _vpclmulqdq_avx512_vl512 -# if HAVE_VPCLMULQDQ_NATIVE && HAVE_PCLMULQDQ_NATIVE && \ - HAVE_AVX512F_NATIVE && HAVE_AVX512VL_NATIVE -# define ATTRIBUTES -# else -# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl") -# endif +# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl") # define VL 64 # define FOLD_LESSTHAN16BYTES 1 # define USE_TERNARYLOGIC 1 diff --git a/lib/x86/decompress_impl.h b/lib/x86/decompress_impl.h index a52305fa..341ba885 100644 --- a/lib/x86/decompress_impl.h +++ b/lib/x86/decompress_impl.h @@ -12,9 +12,7 @@ #if HAVE_BMI2_INTRIN # define deflate_decompress_bmi2 deflate_decompress_bmi2 # define FUNCNAME deflate_decompress_bmi2 -# if !HAVE_BMI2_NATIVE -# define ATTRIBUTES _target_attribute("bmi2") -# endif +# define ATTRIBUTES _target_attribute("bmi2") /* * Even with __attribute__((target("bmi2"))), gcc doesn't reliably use the * bzhi instruction for 'word & BITMASK(count)'. So use the bzhi intrinsic From 032740f2650885cb27998c0a782c48125b3e7f3d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 1 Mar 2024 20:49:08 -0800 Subject: [PATCH 12/15] lib/x86: remove unnecessary NATIVE macros Since most of the uses of the HAVE_*_NATIVE macros have been removed, and many of them provide no additional value over the original compiler-provided macro like __AVX2__ anyway, there's not much point in having them anymore. Remove them, except for HAVE_SSE2_NATIVE and HAVE_BMI2_NATIVE which are still worthwhile to have. --- lib/x86/cpu_features.h | 60 +++++++++++++++++--------------------- lib/x86/matchfinder_impl.h | 2 +- 2 files changed, 27 insertions(+), 35 deletions(-) diff --git a/lib/x86/cpu_features.h b/lib/x86/cpu_features.h index d19c8d34..36537c8d 100644 --- a/lib/x86/cpu_features.h +++ b/lib/x86/cpu_features.h @@ -58,18 +58,6 @@ #define X86_CPU_FEATURE_AVX512VNNI 0x00000400 #define X86_CPU_FEATURE_AVXVNNI 0x00000800 -#define HAVE_SSE2(features) (HAVE_SSE2_NATIVE || ((features) & X86_CPU_FEATURE_SSE2)) -#define HAVE_PCLMULQDQ(features) (HAVE_PCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_PCLMULQDQ)) -#define HAVE_AVX(features) (HAVE_AVX_NATIVE || ((features) & X86_CPU_FEATURE_AVX)) -#define HAVE_AVX2(features) (HAVE_AVX2_NATIVE || ((features) & X86_CPU_FEATURE_AVX2)) -#define HAVE_BMI2(features) (HAVE_BMI2_NATIVE || ((features) & X86_CPU_FEATURE_BMI2)) -#define HAVE_AVX512F(features) (HAVE_AVX512F_NATIVE || ((features) & X86_CPU_FEATURE_AVX512F)) -#define HAVE_AVX512BW(features) (HAVE_AVX512BW_NATIVE || ((features) & X86_CPU_FEATURE_AVX512BW)) -#define HAVE_AVX512VL(features) (HAVE_AVX512VL_NATIVE || ((features) & X86_CPU_FEATURE_AVX512VL)) -#define HAVE_VPCLMULQDQ(features) (HAVE_VPCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_VPCLMULQDQ)) -#define HAVE_AVX512VNNI(features) (HAVE_AVX512VNNI_NATIVE || ((features) & X86_CPU_FEATURE_AVX512VNNI)) -#define HAVE_AVXVNNI(features) (HAVE_AVXVNNI_NATIVE || ((features) & X86_CPU_FEATURE_AVXVNNI)) - #if HAVE_DYNAMIC_X86_CPU_FEATURES #define X86_CPU_FEATURES_KNOWN 0x80000000 extern volatile u32 libdeflate_x86_cpu_features; @@ -90,9 +78,11 @@ static inline u32 get_x86_cpu_features(void) { return 0; } #if defined(__SSE2__) || \ (defined(_MSC_VER) && \ (defined(ARCH_X86_64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))) -# define HAVE_SSE2_NATIVE 1 +# define HAVE_SSE2(features) 1 +# define HAVE_SSE2_NATIVE 1 #else -# define HAVE_SSE2_NATIVE 0 +# define HAVE_SSE2(features) ((features) & X86_CPU_FEATURE_SSE2) +# define HAVE_SSE2_NATIVE 0 #endif #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) # define HAVE_SSE2_INTRIN 1 @@ -102,9 +92,9 @@ static inline u32 get_x86_cpu_features(void) { return 0; } /* PCLMULQDQ */ #if defined(__PCLMUL__) || (defined(_MSC_VER) && defined(__AVX2__)) -# define HAVE_PCLMULQDQ_NATIVE 1 +# define HAVE_PCLMULQDQ(features) 1 #else -# define HAVE_PCLMULQDQ_NATIVE 0 +# define HAVE_PCLMULQDQ(features) ((features) & X86_CPU_FEATURE_PCLMULQDQ) #endif #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) # define HAVE_PCLMULQDQ_INTRIN 1 @@ -114,9 +104,9 @@ static inline u32 get_x86_cpu_features(void) { return 0; } /* AVX */ #ifdef __AVX__ -# define HAVE_AVX_NATIVE 1 +# define HAVE_AVX(features) 1 #else -# define HAVE_AVX_NATIVE 0 +# define HAVE_AVX(features) ((features) & X86_CPU_FEATURE_AVX) #endif #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) # define HAVE_AVX_INTRIN 1 @@ -126,9 +116,9 @@ static inline u32 get_x86_cpu_features(void) { return 0; } /* AVX2 */ #ifdef __AVX2__ -# define HAVE_AVX2_NATIVE 1 +# define HAVE_AVX2(features) 1 #else -# define HAVE_AVX2_NATIVE 0 +# define HAVE_AVX2(features) ((features) & X86_CPU_FEATURE_AVX2) #endif #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) # define HAVE_AVX2_INTRIN 1 @@ -138,9 +128,11 @@ static inline u32 get_x86_cpu_features(void) { return 0; } /* BMI2 */ #if defined(__BMI2__) || (defined(_MSC_VER) && defined(__AVX2__)) -# define HAVE_BMI2_NATIVE 1 +# define HAVE_BMI2(features) 1 +# define HAVE_BMI2_NATIVE 1 #else -# define HAVE_BMI2_NATIVE 0 +# define HAVE_BMI2(features) ((features) & X86_CPU_FEATURE_BMI2) +# define HAVE_BMI2_NATIVE 0 #endif #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) # define HAVE_BMI2_INTRIN 1 @@ -160,9 +152,9 @@ static inline u32 get_x86_cpu_features(void) { return 0; } /* AVX512F */ #ifdef __AVX512F__ -# define HAVE_AVX512F_NATIVE 1 +# define HAVE_AVX512F(features) 1 #else -# define HAVE_AVX512F_NATIVE 0 +# define HAVE_AVX512F(features) ((features) & X86_CPU_FEATURE_AVX512F) #endif #if GCC_PREREQ(5, 1) || defined(__clang__) || defined(_MSC_VER) # define HAVE_AVX512F_INTRIN 1 @@ -172,9 +164,9 @@ static inline u32 get_x86_cpu_features(void) { return 0; } /* AVX512BW */ #ifdef __AVX512BW__ -# define HAVE_AVX512BW_NATIVE 1 +# define HAVE_AVX512BW(features) 1 #else -# define HAVE_AVX512BW_NATIVE 0 +# define HAVE_AVX512BW(features) ((features) & X86_CPU_FEATURE_AVX512BW) #endif #if GCC_PREREQ(5, 1) || defined(__clang__) || defined(_MSC_VER) # define HAVE_AVX512BW_INTRIN 1 @@ -184,9 +176,9 @@ static inline u32 get_x86_cpu_features(void) { return 0; } /* AVX512VL */ #ifdef __AVX512VL__ -# define HAVE_AVX512VL_NATIVE 1 +# define HAVE_AVX512VL(features) 1 #else -# define HAVE_AVX512VL_NATIVE 0 +# define HAVE_AVX512VL(features) ((features) & X86_CPU_FEATURE_AVX512VL) #endif #if GCC_PREREQ(5, 1) || defined(__clang__) || defined(_MSC_VER) # define HAVE_AVX512VL_INTRIN 1 @@ -196,9 +188,9 @@ static inline u32 get_x86_cpu_features(void) { return 0; } /* VPCLMULQDQ */ #ifdef __VPCLMULQDQ__ -# define HAVE_VPCLMULQDQ_NATIVE 1 +# define HAVE_VPCLMULQDQ(features) 1 #else -# define HAVE_VPCLMULQDQ_NATIVE 0 +# define HAVE_VPCLMULQDQ(features) ((features) & X86_CPU_FEATURE_VPCLMULQDQ) #endif #if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0) || defined(_MSC_VER) # define HAVE_VPCLMULQDQ_INTRIN 1 @@ -208,9 +200,9 @@ static inline u32 get_x86_cpu_features(void) { return 0; } /* AVX512VNNI */ #ifdef __AVX512VNNI__ -# define HAVE_AVX512VNNI_NATIVE 1 +# define HAVE_AVX512VNNI(features) 1 #else -# define HAVE_AVX512VNNI_NATIVE 0 +# define HAVE_AVX512VNNI(features) ((features) & X86_CPU_FEATURE_AVX512VNNI) #endif #if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0) || defined(_MSC_VER) # define HAVE_AVX512VNNI_INTRIN 1 @@ -220,9 +212,9 @@ static inline u32 get_x86_cpu_features(void) { return 0; } /* AVX-VNNI */ #ifdef __AVXVNNI__ -# define HAVE_AVXVNNI_NATIVE 1 +# define HAVE_AVXVNNI(features) 1 #else -# define HAVE_AVXVNNI_NATIVE 0 +# define HAVE_AVXVNNI(features) ((features) & X86_CPU_FEATURE_AVXVNNI) #endif #if GCC_PREREQ(11, 1) || CLANG_PREREQ(12, 0, 0) || defined(_MSC_VER) # define HAVE_AVXVNNI_INTRIN 1 diff --git a/lib/x86/matchfinder_impl.h b/lib/x86/matchfinder_impl.h index 21b463a7..37a4960a 100644 --- a/lib/x86/matchfinder_impl.h +++ b/lib/x86/matchfinder_impl.h @@ -30,7 +30,7 @@ #include "cpu_features.h" -#if HAVE_AVX2_NATIVE +#ifdef __AVX2__ static forceinline void matchfinder_init_avx2(mf_pos_t *data, size_t size) { From 5e002653abc44f8b63aaabd74140fb8ea9284a60 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 1 Mar 2024 20:49:08 -0800 Subject: [PATCH 13/15] lib/x86: simplify checks for intrinsics Since all the x86 HAVE_*_INTRIN values are now, in practice, determined by compiler version checks, it's simpler to just fold these checks directly into the code that actually uses the intrinsics (*_impl.h). This allows many of the conditions to be merged together. This approach does mean that the conditions for using a particular type of intrinsic may get duplicated in different places as more functions get added. This is partly why I used the layer of indirection originally. But in practice this hasn't been much of a problem. For crc32_x86_vpclmulqdq_avx2() we also have to check the compiler version anyway because MSVC supports both VPCLMULQDQ and AVX2, but not together. --- lib/x86/adler32_impl.h | 23 ++++++------ lib/x86/cpu_features.h | 76 --------------------------------------- lib/x86/crc32_impl.h | 30 ++++++---------- lib/x86/decompress_impl.h | 16 ++++++--- 4 files changed, 31 insertions(+), 114 deletions(-) diff --git a/lib/x86/adler32_impl.h b/lib/x86/adler32_impl.h index 6aae8b8b..7d7ea9d4 100644 --- a/lib/x86/adler32_impl.h +++ b/lib/x86/adler32_impl.h @@ -104,8 +104,11 @@ (void)a, (void)b, (void)c, (void)d, (void)e, (void)f #endif -/* SSE2 implementation */ -#if HAVE_SSE2_INTRIN +/* + * SSE2 and AVX2 implementations. They are very similar; the AVX2 + * implementation just uses twice the vector width as the SSE2 one. + */ +#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) # define adler32_sse2 adler32_sse2 # define FUNCNAME adler32_sse2 # define FUNCNAME_CHUNK adler32_sse2_chunk @@ -197,13 +200,7 @@ adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2) ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2, 1); } # include "../adler32_vec_template.h" -#endif /* HAVE_SSE2_INTRIN */ -/* - * AVX2 implementation. Basically the same as the SSE2 one, but with the vector - * width doubled. - */ -#if HAVE_AVX2_INTRIN # define adler32_avx2 adler32_avx2 # define FUNCNAME adler32_avx2 # define FUNCNAME_CHUNK adler32_avx2_chunk @@ -264,14 +261,14 @@ adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2) ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2, 1); } # include "../adler32_vec_template.h" -#endif /* HAVE_AVX2_INTRIN */ +#endif /* * AVX2/AVX-VNNI implementation. This is similar to the AVX512BW/AVX512VNNI * implementation, but instead of using AVX-512 it uses AVX2 plus AVX-VNNI. * AVX-VNNI adds dot product instructions to CPUs without AVX-512. */ -#if HAVE_AVX2_INTRIN && HAVE_AVXVNNI_INTRIN +#if GCC_PREREQ(11, 1) || CLANG_PREREQ(12, 0, 0) || defined(_MSC_VER) # define adler32_avx2_vnni adler32_avx2_vnni # define FUNCNAME adler32_avx2_vnni # define FUNCNAME_CHUNK adler32_avx2_vnni_chunk @@ -337,13 +334,13 @@ adler32_avx2_vnni_chunk(const __m256i *p, const __m256i *const end, ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1_a, v_s2_a, 1); } # include "../adler32_vec_template.h" -#endif /* HAVE_AVX2_INTRIN && HAVE_AVXVNNI_INTRIN */ +#endif /* * AVX512BW/AVX512VNNI implementation. Uses the vpdpbusd (dot product) * instruction from AVX512VNNI. */ -#if HAVE_AVX512BW_INTRIN && HAVE_AVX512VNNI_INTRIN +#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0) || defined(_MSC_VER) # define adler32_avx512_vnni adler32_avx512_vnni # define FUNCNAME adler32_avx512_vnni # define FUNCNAME_CHUNK adler32_avx512_vnni_chunk @@ -398,7 +395,7 @@ adler32_avx512_vnni_chunk(const __m512i *p, const __m512i *const end, ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1_a, v_s2_a, 0); } # include "../adler32_vec_template.h" -#endif /* HAVE_AVX512BW_INTRIN && HAVE_AVX512VNNI_INTRIN */ +#endif static inline adler32_func_t arch_select_adler32_func(void) diff --git a/lib/x86/cpu_features.h b/lib/x86/cpu_features.h index 36537c8d..a8159964 100644 --- a/lib/x86/cpu_features.h +++ b/lib/x86/cpu_features.h @@ -74,7 +74,6 @@ static inline u32 get_x86_cpu_features(void) static inline u32 get_x86_cpu_features(void) { return 0; } #endif /* !HAVE_DYNAMIC_X86_CPU_FEATURES */ -/* SSE2 */ #if defined(__SSE2__) || \ (defined(_MSC_VER) && \ (defined(ARCH_X86_64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))) @@ -84,49 +83,25 @@ static inline u32 get_x86_cpu_features(void) { return 0; } # define HAVE_SSE2(features) ((features) & X86_CPU_FEATURE_SSE2) # define HAVE_SSE2_NATIVE 0 #endif -#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) -# define HAVE_SSE2_INTRIN 1 -#else -# define HAVE_SSE2_INTRIN 0 -#endif -/* PCLMULQDQ */ #if defined(__PCLMUL__) || (defined(_MSC_VER) && defined(__AVX2__)) # define HAVE_PCLMULQDQ(features) 1 #else # define HAVE_PCLMULQDQ(features) ((features) & X86_CPU_FEATURE_PCLMULQDQ) #endif -#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) -# define HAVE_PCLMULQDQ_INTRIN 1 -#else -# define HAVE_PCLMULQDQ_INTRIN 0 -#endif -/* AVX */ #ifdef __AVX__ # define HAVE_AVX(features) 1 #else # define HAVE_AVX(features) ((features) & X86_CPU_FEATURE_AVX) #endif -#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) -# define HAVE_AVX_INTRIN 1 -#else -# define HAVE_AVX_INTRIN 0 -#endif -/* AVX2 */ #ifdef __AVX2__ # define HAVE_AVX2(features) 1 #else # define HAVE_AVX2(features) ((features) & X86_CPU_FEATURE_AVX2) #endif -#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) -# define HAVE_AVX2_INTRIN 1 -#else -# define HAVE_AVX2_INTRIN 0 -#endif -/* BMI2 */ #if defined(__BMI2__) || (defined(_MSC_VER) && defined(__AVX2__)) # define HAVE_BMI2(features) 1 # define HAVE_BMI2_NATIVE 1 @@ -134,93 +109,42 @@ static inline u32 get_x86_cpu_features(void) { return 0; } # define HAVE_BMI2(features) ((features) & X86_CPU_FEATURE_BMI2) # define HAVE_BMI2_NATIVE 0 #endif -#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) -# define HAVE_BMI2_INTRIN 1 -#else -# define HAVE_BMI2_INTRIN 0 -#endif -/* - * MSVC from VS2017 (toolset v141) apparently miscompiles the _bzhi_*() - * intrinsics. It seems to be fixed in VS2022. - */ -#if defined(_MSC_VER) && _MSC_VER < 1930 /* older than VS2022 (toolset v143) */ -# undef HAVE_BMI2_NATIVE -# undef HAVE_BMI2_INTRIN -# define HAVE_BMI2_NATIVE 0 -# define HAVE_BMI2_INTRIN 0 -#endif -/* AVX512F */ #ifdef __AVX512F__ # define HAVE_AVX512F(features) 1 #else # define HAVE_AVX512F(features) ((features) & X86_CPU_FEATURE_AVX512F) #endif -#if GCC_PREREQ(5, 1) || defined(__clang__) || defined(_MSC_VER) -# define HAVE_AVX512F_INTRIN 1 -#else -# define HAVE_AVX512F_INTRIN 0 -#endif -/* AVX512BW */ #ifdef __AVX512BW__ # define HAVE_AVX512BW(features) 1 #else # define HAVE_AVX512BW(features) ((features) & X86_CPU_FEATURE_AVX512BW) #endif -#if GCC_PREREQ(5, 1) || defined(__clang__) || defined(_MSC_VER) -# define HAVE_AVX512BW_INTRIN 1 -#else -# define HAVE_AVX512BW_INTRIN 0 -#endif -/* AVX512VL */ #ifdef __AVX512VL__ # define HAVE_AVX512VL(features) 1 #else # define HAVE_AVX512VL(features) ((features) & X86_CPU_FEATURE_AVX512VL) #endif -#if GCC_PREREQ(5, 1) || defined(__clang__) || defined(_MSC_VER) -# define HAVE_AVX512VL_INTRIN 1 -#else -# define HAVE_AVX512VL_INTRIN 0 -#endif -/* VPCLMULQDQ */ #ifdef __VPCLMULQDQ__ # define HAVE_VPCLMULQDQ(features) 1 #else # define HAVE_VPCLMULQDQ(features) ((features) & X86_CPU_FEATURE_VPCLMULQDQ) #endif -#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0) || defined(_MSC_VER) -# define HAVE_VPCLMULQDQ_INTRIN 1 -#else -# define HAVE_VPCLMULQDQ_INTRIN 0 -#endif -/* AVX512VNNI */ #ifdef __AVX512VNNI__ # define HAVE_AVX512VNNI(features) 1 #else # define HAVE_AVX512VNNI(features) ((features) & X86_CPU_FEATURE_AVX512VNNI) #endif -#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0) || defined(_MSC_VER) -# define HAVE_AVX512VNNI_INTRIN 1 -#else -# define HAVE_AVX512VNNI_INTRIN 0 -#endif -/* AVX-VNNI */ #ifdef __AVXVNNI__ # define HAVE_AVXVNNI(features) 1 #else # define HAVE_AVXVNNI(features) ((features) & X86_CPU_FEATURE_AVXVNNI) #endif -#if GCC_PREREQ(11, 1) || CLANG_PREREQ(12, 0, 0) || defined(_MSC_VER) -# define HAVE_AVXVNNI_INTRIN 1 -#else -# define HAVE_AVXVNNI_INTRIN 0 -#endif #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) # include diff --git a/lib/x86/crc32_impl.h b/lib/x86/crc32_impl.h index baee423a..bbe95fe6 100644 --- a/lib/x86/crc32_impl.h +++ b/lib/x86/crc32_impl.h @@ -30,8 +30,8 @@ #include "cpu_features.h" +#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) /* PCLMULQDQ implementation */ -#if HAVE_PCLMULQDQ_INTRIN # define crc32_x86_pclmulqdq crc32_x86_pclmulqdq # define SUFFIX _pclmulqdq # define ATTRIBUTES _target_attribute("pclmul") @@ -39,7 +39,6 @@ # define FOLD_LESSTHAN16BYTES 0 # define USE_TERNARYLOGIC 0 # include "crc32_pclmul_template.h" -#endif /* * PCLMULQDQ/AVX implementation. Compared to the regular PCLMULQDQ @@ -51,11 +50,7 @@ * and SSE4.1 support, and we can use SSSE3 and SSE4.1 intrinsics for efficient * handling of partial blocks. (We *could* compile a variant with * PCLMULQDQ+SSE4.1 without AVX, but for simplicity we don't currently bother.) - * - * FIXME: with MSVC, this isn't actually compiled with AVX code generation - * enabled yet. That would require that this be moved to its own .c file. */ -#if HAVE_PCLMULQDQ_INTRIN && HAVE_AVX_INTRIN # define crc32_x86_pclmulqdq_avx crc32_x86_pclmulqdq_avx # define SUFFIX _pclmulqdq_avx # define ATTRIBUTES _target_attribute("pclmul,avx") @@ -65,14 +60,14 @@ # include "crc32_pclmul_template.h" #endif -/* VPCLMULQDQ/AVX2 implementation. Uses 256-bit vectors. */ -#if HAVE_VPCLMULQDQ_INTRIN && HAVE_PCLMULQDQ_INTRIN && HAVE_AVX2_INTRIN && \ - /* - * This has to be disabled on MSVC because MSVC has a bug where it - * incorrectly assumes that VPCLMULQDQ implies AVX-512: - * https://developercommunity.visualstudio.com/t/Compiler-incorrectly-assumes-VAES-and-VP/10578785?space=62&q=AVX512&sort=newest - */ \ - !(defined(_MSC_VER) && !defined(__clang__)) +/* + * VPCLMULQDQ/AVX2 implementation. Uses 256-bit vectors. + * + * Currently this can't be enabled with MSVC because MSVC has a bug where it + * incorrectly assumes that VPCLMULQDQ implies AVX-512: + * https://developercommunity.visualstudio.com/t/Compiler-incorrectly-assumes-VAES-and-VP/10578785?space=62&q=AVX512&sort=newest + */ +#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0) # define crc32_x86_vpclmulqdq_avx2 crc32_x86_vpclmulqdq_avx2 # define SUFFIX _vpclmulqdq_avx2 # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx2") @@ -82,13 +77,12 @@ # include "crc32_pclmul_template.h" #endif +#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0) || defined(_MSC_VER) /* * VPCLMULQDQ/AVX512 implementation with 256-bit vectors. This takes advantage * of some AVX-512 instructions but uses 256-bit vectors rather than 512-bit. * This can be useful on CPUs where 512-bit vectors cause downclocking. */ -#if HAVE_VPCLMULQDQ_INTRIN && HAVE_PCLMULQDQ_INTRIN && \ - HAVE_AVX512F_INTRIN && HAVE_AVX512VL_INTRIN # define crc32_x86_vpclmulqdq_avx512_vl256 crc32_x86_vpclmulqdq_avx512_vl256 # define SUFFIX _vpclmulqdq_avx512_vl256 # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl") @@ -96,11 +90,8 @@ # define FOLD_LESSTHAN16BYTES 1 # define USE_TERNARYLOGIC 1 # include "crc32_pclmul_template.h" -#endif /* VPCLMULQDQ/AVX512 implementation with 512-bit vectors */ -#if HAVE_VPCLMULQDQ_INTRIN && HAVE_PCLMULQDQ_INTRIN && \ - HAVE_AVX512F_INTRIN && HAVE_AVX512VL_INTRIN # define crc32_x86_vpclmulqdq_avx512_vl512 crc32_x86_vpclmulqdq_avx512_vl512 # define SUFFIX _vpclmulqdq_avx512_vl512 # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl") @@ -110,7 +101,6 @@ # include "crc32_pclmul_template.h" #endif -/* Choose the best implementation at runtime. */ static inline crc32_func_t arch_select_crc32_func(void) { diff --git a/lib/x86/decompress_impl.h b/lib/x86/decompress_impl.h index 341ba885..daedcf2d 100644 --- a/lib/x86/decompress_impl.h +++ b/lib/x86/decompress_impl.h @@ -4,12 +4,18 @@ #include "cpu_features.h" /* - * BMI2 optimized version + * BMI2 optimized decompression function. * - * FIXME: with MSVC, this isn't actually compiled with BMI2 code generation - * enabled yet. That would require that this be moved to its own .c file. + * With gcc and clang we just compile the whole function with + * __attribute__((target("bmi2"))), and the compiler uses bmi2 automatically. + * + * With MSVC, there is no target function attribute, but it's still possible to + * use bmi2 intrinsics explicitly. Currently we mostly don't, but there's a + * case in which we do (see below), so we at least take advantage of that. + * However, MSVC from VS2017 (toolset v141) apparently miscompiles the _bzhi_*() + * intrinsics. It seems to be fixed in VS2022. Hence, use MSVC_PREREQ(1930). */ -#if HAVE_BMI2_INTRIN +#if defined(__GNUC__) || defined(__clang__) || MSVC_PREREQ(1930) # define deflate_decompress_bmi2 deflate_decompress_bmi2 # define FUNCNAME deflate_decompress_bmi2 # define ATTRIBUTES _target_attribute("bmi2") @@ -31,7 +37,7 @@ # endif # endif # include "../decompress_template.h" -#endif /* HAVE_BMI2_INTRIN */ +#endif #if defined(deflate_decompress_bmi2) && HAVE_BMI2_NATIVE #define DEFAULT_IMPL deflate_decompress_bmi2 From 962735b9341efa1a0c3308bf498104b611e42f6b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 1 Mar 2024 20:49:08 -0800 Subject: [PATCH 14/15] lib/x86: adding missing MSVC and Apple clang version checks Most of the recently added code that uses newer x86 intrinsics (AVX-512, VPCLMULQDQ, AVX-VNNI) is being compiled unconditionally when the compiler is MSVC or Apple clang. This is inconsistent with the minimum versions of these compilers that we claim to support. Therefore, add the needed checks against _MSC_VER and __apple_build_version__. --- lib/x86/adler32_impl.h | 4 ++-- lib/x86/crc32_impl.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/x86/adler32_impl.h b/lib/x86/adler32_impl.h index 7d7ea9d4..7b3f02ac 100644 --- a/lib/x86/adler32_impl.h +++ b/lib/x86/adler32_impl.h @@ -268,7 +268,7 @@ adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2) * implementation, but instead of using AVX-512 it uses AVX2 plus AVX-VNNI. * AVX-VNNI adds dot product instructions to CPUs without AVX-512. */ -#if GCC_PREREQ(11, 1) || CLANG_PREREQ(12, 0, 0) || defined(_MSC_VER) +#if GCC_PREREQ(11, 1) || CLANG_PREREQ(12, 0, 13000000) || MSVC_PREREQ(1930) # define adler32_avx2_vnni adler32_avx2_vnni # define FUNCNAME adler32_avx2_vnni # define FUNCNAME_CHUNK adler32_avx2_vnni_chunk @@ -340,7 +340,7 @@ adler32_avx2_vnni_chunk(const __m256i *p, const __m256i *const end, * AVX512BW/AVX512VNNI implementation. Uses the vpdpbusd (dot product) * instruction from AVX512VNNI. */ -#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0) || defined(_MSC_VER) +#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920) # define adler32_avx512_vnni adler32_avx512_vnni # define FUNCNAME adler32_avx512_vnni # define FUNCNAME_CHUNK adler32_avx512_vnni_chunk diff --git a/lib/x86/crc32_impl.h b/lib/x86/crc32_impl.h index bbe95fe6..3d8e254d 100644 --- a/lib/x86/crc32_impl.h +++ b/lib/x86/crc32_impl.h @@ -67,7 +67,7 @@ * incorrectly assumes that VPCLMULQDQ implies AVX-512: * https://developercommunity.visualstudio.com/t/Compiler-incorrectly-assumes-VAES-and-VP/10578785?space=62&q=AVX512&sort=newest */ -#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0) +#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) # define crc32_x86_vpclmulqdq_avx2 crc32_x86_vpclmulqdq_avx2 # define SUFFIX _vpclmulqdq_avx2 # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx2") @@ -77,7 +77,7 @@ # include "crc32_pclmul_template.h" #endif -#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0) || defined(_MSC_VER) +#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920) /* * VPCLMULQDQ/AVX512 implementation with 256-bit vectors. This takes advantage * of some AVX-512 instructions but uses 256-bit vectors rather than 512-bit. From 6d307081c62a60636589d3d9a37b412c409a1256 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 1 Mar 2024 20:49:08 -0800 Subject: [PATCH 15/15] lib: make lists of CPU feature bits easier to read --- lib/arm/cpu_features.h | 12 ++++++------ lib/x86/cpu_features.h | 26 +++++++++++++------------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/lib/arm/cpu_features.h b/lib/arm/cpu_features.h index 4fa6e43a..30920c6b 100644 --- a/lib/arm/cpu_features.h +++ b/lib/arm/cpu_features.h @@ -43,11 +43,11 @@ # define HAVE_DYNAMIC_ARM_CPU_FEATURES 1 #endif -#define ARM_CPU_FEATURE_NEON 0x00000001 -#define ARM_CPU_FEATURE_PMULL 0x00000002 -#define ARM_CPU_FEATURE_CRC32 0x00000004 -#define ARM_CPU_FEATURE_SHA3 0x00000008 -#define ARM_CPU_FEATURE_DOTPROD 0x00000010 +#define ARM_CPU_FEATURE_NEON (1 << 0) +#define ARM_CPU_FEATURE_PMULL (1 << 1) +#define ARM_CPU_FEATURE_CRC32 (1 << 2) +#define ARM_CPU_FEATURE_SHA3 (1 << 3) +#define ARM_CPU_FEATURE_DOTPROD (1 << 4) #define HAVE_NEON(features) (HAVE_NEON_NATIVE || ((features) & ARM_CPU_FEATURE_NEON)) #define HAVE_PMULL(features) (HAVE_PMULL_NATIVE || ((features) & ARM_CPU_FEATURE_PMULL)) @@ -56,7 +56,7 @@ #define HAVE_DOTPROD(features) (HAVE_DOTPROD_NATIVE || ((features) & ARM_CPU_FEATURE_DOTPROD)) #if HAVE_DYNAMIC_ARM_CPU_FEATURES -#define ARM_CPU_FEATURES_KNOWN 0x80000000 +#define ARM_CPU_FEATURES_KNOWN (1U << 31) extern volatile u32 libdeflate_arm_cpu_features; void libdeflate_init_arm_cpu_features(void); diff --git a/lib/x86/cpu_features.h b/lib/x86/cpu_features.h index a8159964..b4c00118 100644 --- a/lib/x86/cpu_features.h +++ b/lib/x86/cpu_features.h @@ -39,27 +39,27 @@ # define HAVE_DYNAMIC_X86_CPU_FEATURES 1 #endif -#define X86_CPU_FEATURE_SSE2 0x00000001 -#define X86_CPU_FEATURE_PCLMULQDQ 0x00000002 -#define X86_CPU_FEATURE_AVX 0x00000004 -#define X86_CPU_FEATURE_AVX2 0x00000008 -#define X86_CPU_FEATURE_BMI2 0x00000010 +#define X86_CPU_FEATURE_SSE2 (1 << 0) +#define X86_CPU_FEATURE_PCLMULQDQ (1 << 1) +#define X86_CPU_FEATURE_AVX (1 << 2) +#define X86_CPU_FEATURE_AVX2 (1 << 3) +#define X86_CPU_FEATURE_BMI2 (1 << 4) /* * ZMM indicates whether 512-bit vectors (zmm registers) should be used. On * some CPUs, to avoid downclocking issues we don't set ZMM even if the CPU * supports it, i.e. even if AVX512F is set. On these CPUs, we may still use * AVX-512 instructions, but only with ymm and xmm registers. */ -#define X86_CPU_FEATURE_ZMM 0x00000020 -#define X86_CPU_FEATURE_AVX512F 0x00000040 -#define X86_CPU_FEATURE_AVX512BW 0x00000080 -#define X86_CPU_FEATURE_AVX512VL 0x00000100 -#define X86_CPU_FEATURE_VPCLMULQDQ 0x00000200 -#define X86_CPU_FEATURE_AVX512VNNI 0x00000400 -#define X86_CPU_FEATURE_AVXVNNI 0x00000800 +#define X86_CPU_FEATURE_ZMM (1 << 5) +#define X86_CPU_FEATURE_AVX512F (1 << 6) +#define X86_CPU_FEATURE_AVX512BW (1 << 7) +#define X86_CPU_FEATURE_AVX512VL (1 << 8) +#define X86_CPU_FEATURE_VPCLMULQDQ (1 << 9) +#define X86_CPU_FEATURE_AVX512VNNI (1 << 10) +#define X86_CPU_FEATURE_AVXVNNI (1 << 11) #if HAVE_DYNAMIC_X86_CPU_FEATURES -#define X86_CPU_FEATURES_KNOWN 0x80000000 +#define X86_CPU_FEATURES_KNOWN (1U << 31) extern volatile u32 libdeflate_x86_cpu_features; void libdeflate_init_x86_cpu_features(void);