From d4bd6b294cd4a2741a03e7a26a9621eeeeb1ae1f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 21 Feb 2024 00:17:43 -0800 Subject: [PATCH] lib/x86/adler32: add back an AVX-512BW implementation libdeflate used to (before commit 416bac37a0c4) have an AVX-512BW implementation of Adler-32, but I removed it due to AVX-512's downclocking issues. Since then, newer Intel and AMD CPUs have come out with better AVX-512 implementations. I also recently added AVX-512 implementations of CRC-32. An exclusion list is used to prevent 512-bit vectors from being used on older Intel CPUs. Therefore, add back an AVX-512BW implementation of Adler32. Unlike the original implementation, I went with the unpack-based approach, for consistency with the current adler32_avx2(). The new code is also MSVC-compatible. --- lib/x86/adler32_impl.h | 107 +++++++++++++++++++++++++++++++-- lib/x86/cpu_features.c | 3 + lib/x86/cpu_features.h | 19 +++++- scripts/checksum_benchmarks.sh | 4 ++ scripts/run_tests.sh | 2 +- 5 files changed, 128 insertions(+), 7 deletions(-) diff --git a/lib/x86/adler32_impl.h b/lib/x86/adler32_impl.h index 6285dc80..c018dc6f 100644 --- a/lib/x86/adler32_impl.h +++ b/lib/x86/adler32_impl.h @@ -67,6 +67,19 @@ ADLER32_FINISH_VEC_CHUNK_128((s1), (s2), s1_128bit, s2_128bit); \ } +#define ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2) \ +{ \ + __m256i /* __v8su */ s1_256bit, s2_256bit; \ + \ + /* 512 => 256 bits */ \ + s1_256bit = _mm256_add_epi32(_mm512_extracti64x4_epi64((v_s1), 0), \ + _mm512_extracti64x4_epi64((v_s1), 1)); \ + s2_256bit = _mm256_add_epi32(_mm512_extracti64x4_epi64((v_s2), 0), \ + _mm512_extracti64x4_epi64((v_s2), 1)); \ + \ + ADLER32_FINISH_VEC_CHUNK_256((s1), (s2), s1_256bit, s2_256bit); \ +} + /* * This is a very silly partial workaround for gcc bug * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892. 
The bug causes gcc to @@ -263,14 +276,101 @@ adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2) # include "../adler32_vec_template.h" #endif /* HAVE_AVX2_INTRIN */ -#if defined(adler32_avx2) && HAVE_AVX2_NATIVE -#define DEFAULT_IMPL adler32_avx2 -#else +/* AVX-512BW implementation */ +#if HAVE_AVX512BW_INTRIN +# define adler32_avx512bw adler32_avx512bw +# define FUNCNAME adler32_avx512bw +# define FUNCNAME_CHUNK adler32_avx512bw_chunk +# define IMPL_ALIGNMENT 64 +# define IMPL_SEGMENT_LEN 128 +# define IMPL_MAX_CHUNK_LEN (128 * (0x7FFF / 0xFF)) +# if HAVE_AVX512BW_NATIVE +# define ATTRIBUTES +# else +# define ATTRIBUTES _target_attribute("avx512bw") +# endif +# include <immintrin.h> + /* + * With clang in MSVC compatibility mode, immintrin.h incorrectly skips + * including some sub-headers. + */ +# if defined(__clang__) && defined(_MSC_VER) +# include <tmmintrin.h> +# include <smmintrin.h> +# include <wmmintrin.h> +# include <avxintrin.h> +# include <avx2intrin.h> +# include <avx512fintrin.h> +# include <avx512bwintrin.h> +# endif +static forceinline ATTRIBUTES void +adler32_avx512bw_chunk(const __m512i *p, const __m512i *const end, + u32 *s1, u32 *s2) +{ + static const u16 _aligned_attribute(64) mults[128] = { + 128, 127, 126, 125, 124, 123, 122, 121, 112, 111, 110, 109, 108, 107, 106, 105, + 96, 95, 94, 93, 92, 91, 90, 89, 80, 79, 78, 77, 76, 75, 74, 73, + + 120, 119, 118, 117, 116, 115, 114, 113, 104, 103, 102, 101, 100, 99, 98, 97, + 88, 87, 86, 85, 84, 83, 82, 81, 72, 71, 70, 69, 68, 67, 66, 65, + + 64, 63, 62, 61, 60, 59, 58, 57, 48, 47, 46, 45, 44, 43, 42, 41, + 32, 31, 30, 29, 28, 27, 26, 25, 16, 15, 14, 13, 12, 11, 10, 9, + + 56, 55, 54, 53, 52, 51, 50, 49, 40, 39, 38, 37, 36, 35, 34, 33, + 24, 23, 22, 21, 20, 19, 18, 17, 8, 7, 6, 5, 4, 3, 2, 1, + }; + const __m512i zeroes = _mm512_setzero_si512(); + const __m512i /* __v32hu */ mults_a = _mm512_loadu_si512(&mults[0]); + const __m512i /* __v32hu */ mults_b = _mm512_loadu_si512(&mults[32]); + const __m512i /* __v32hu */ mults_c = _mm512_loadu_si512(&mults[64]); + const __m512i /* __v32hu */ mults_d
= _mm512_loadu_si512(&mults[96]); + __m512i /* __v16su */ v_s1 = zeroes; + __m512i /* __v16su */ v_s2 = zeroes; + __m512i /* __v32hu */ v_byte_sums_a = zeroes; + __m512i /* __v32hu */ v_byte_sums_b = zeroes; + __m512i /* __v32hu */ v_byte_sums_c = zeroes; + __m512i /* __v32hu */ v_byte_sums_d = zeroes; + + do { + const __m512i bytes1 = *p++; + const __m512i bytes2 = *p++; + + v_s2 = _mm512_add_epi32(v_s2, v_s1); + v_s1 = _mm512_add_epi32(v_s1, _mm512_sad_epu8(bytes1, zeroes)); + v_s1 = _mm512_add_epi32(v_s1, _mm512_sad_epu8(bytes2, zeroes)); + v_byte_sums_a = _mm512_add_epi16( + v_byte_sums_a, _mm512_unpacklo_epi8(bytes1, zeroes)); + v_byte_sums_b = _mm512_add_epi16( + v_byte_sums_b, _mm512_unpackhi_epi8(bytes1, zeroes)); + v_byte_sums_c = _mm512_add_epi16( + v_byte_sums_c, _mm512_unpacklo_epi8(bytes2, zeroes)); + v_byte_sums_d = _mm512_add_epi16( + v_byte_sums_d, _mm512_unpackhi_epi8(bytes2, zeroes)); + + GCC_UPDATE_VARS(v_s1, v_s2, v_byte_sums_a, v_byte_sums_b, + v_byte_sums_c, v_byte_sums_d); + } while (p != end); + + v_s2 = _mm512_slli_epi32(v_s2, 7); + v_s2 = _mm512_add_epi32(v_s2, _mm512_madd_epi16(v_byte_sums_a, mults_a)); + v_s2 = _mm512_add_epi32(v_s2, _mm512_madd_epi16(v_byte_sums_b, mults_b)); + v_s2 = _mm512_add_epi32(v_s2, _mm512_madd_epi16(v_byte_sums_c, mults_c)); + v_s2 = _mm512_add_epi32(v_s2, _mm512_madd_epi16(v_byte_sums_d, mults_d)); + ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2); +} +# include "../adler32_vec_template.h" +#endif /* HAVE_AVX512BW_INTRIN */ + static inline adler32_func_t arch_select_adler32_func(void) { const u32 features MAYBE_UNUSED = get_x86_cpu_features(); +#ifdef adler32_avx512bw + if ((features & X86_CPU_FEATURE_ZMM) && HAVE_AVX512BW(features)) + return adler32_avx512bw; +#endif #ifdef adler32_avx2 if (HAVE_AVX2(features)) return adler32_avx2; @@ -282,6 +382,5 @@ arch_select_adler32_func(void) return NULL; } #define arch_select_adler32_func arch_select_adler32_func -#endif #endif /* LIB_X86_ADLER32_IMPL_H */ diff --git 
a/lib/x86/cpu_features.c b/lib/x86/cpu_features.c index ba7a39ff..193f3409 100644 --- a/lib/x86/cpu_features.c +++ b/lib/x86/cpu_features.c @@ -92,6 +92,7 @@ static const struct cpu_feature x86_cpu_feature_table[] = { {X86_CPU_FEATURE_BMI2, "bmi2"}, {X86_CPU_FEATURE_ZMM, "zmm"}, {X86_CPU_FEATURE_AVX512F, "avx512f"}, + {X86_CPU_FEATURE_AVX512BW, "avx512bw"}, {X86_CPU_FEATURE_AVX512VL, "avx512vl"}, {X86_CPU_FEATURE_VPCLMULQDQ, "vpclmulqdq"}, }; @@ -171,6 +172,8 @@ void libdeflate_init_x86_cpu_features(void) features |= X86_CPU_FEATURE_ZMM; if ((b & (1 << 16)) && ((xcr0 & 0xe6) == 0xe6)) features |= X86_CPU_FEATURE_AVX512F; + if ((b & (1 << 30)) && ((xcr0 & 0xe6) == 0xe6)) + features |= X86_CPU_FEATURE_AVX512BW; if ((b & (1U << 31)) && ((xcr0 & 0xe6) == 0xe6)) features |= X86_CPU_FEATURE_AVX512VL; if ((c & (1 << 10)) && ((xcr0 & 0x6) == 0x6)) diff --git a/lib/x86/cpu_features.h b/lib/x86/cpu_features.h index 6f68e032..852b16da 100644 --- a/lib/x86/cpu_features.h +++ b/lib/x86/cpu_features.h @@ -52,8 +52,9 @@ */ #define X86_CPU_FEATURE_ZMM 0x00000020 #define X86_CPU_FEATURE_AVX512F 0x00000040 -#define X86_CPU_FEATURE_AVX512VL 0x00000080 -#define X86_CPU_FEATURE_VPCLMULQDQ 0x00000100 +#define X86_CPU_FEATURE_AVX512BW 0x00000080 +#define X86_CPU_FEATURE_AVX512VL 0x00000100 +#define X86_CPU_FEATURE_VPCLMULQDQ 0x00000200 #define HAVE_SSE2(features) (HAVE_SSE2_NATIVE || ((features) & X86_CPU_FEATURE_SSE2)) #define HAVE_PCLMULQDQ(features) (HAVE_PCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_PCLMULQDQ)) @@ -61,6 +62,7 @@ #define HAVE_AVX2(features) (HAVE_AVX2_NATIVE || ((features) & X86_CPU_FEATURE_AVX2)) #define HAVE_BMI2(features) (HAVE_BMI2_NATIVE || ((features) & X86_CPU_FEATURE_BMI2)) #define HAVE_AVX512F(features) (HAVE_AVX512F_NATIVE || ((features) & X86_CPU_FEATURE_AVX512F)) +#define HAVE_AVX512BW(features) (HAVE_AVX512BW_NATIVE || ((features) & X86_CPU_FEATURE_AVX512BW)) #define HAVE_AVX512VL(features) (HAVE_AVX512VL_NATIVE || ((features) & 
X86_CPU_FEATURE_AVX512VL)) #define HAVE_VPCLMULQDQ(features) (HAVE_VPCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_VPCLMULQDQ)) @@ -182,6 +184,19 @@ static inline u32 get_x86_cpu_features(void) { return 0; } # define HAVE_AVX512F_INTRIN 0 #endif +/* AVX-512BW */ +#ifdef __AVX512BW__ +# define HAVE_AVX512BW_NATIVE 1 +#else +# define HAVE_AVX512BW_NATIVE 0 +#endif +#if HAVE_AVX512BW_NATIVE || GCC_PREREQ(5, 1) || CLANG_PREREQ(3, 9, 0) || \ + defined(_MSC_VER) +# define HAVE_AVX512BW_INTRIN 1 +#else +# define HAVE_AVX512BW_INTRIN 0 +#endif + /* AVX-512VL */ #ifdef __AVX512VL__ # define HAVE_AVX512VL_NATIVE 1 diff --git a/scripts/checksum_benchmarks.sh b/scripts/checksum_benchmarks.sh index b9c38698..eb1de5d5 100755 --- a/scripts/checksum_benchmarks.sh +++ b/scripts/checksum_benchmarks.sh @@ -157,6 +157,10 @@ echo { case $ARCH in i386|x86_64) + if have_cpu_feature avx512bw; then + do_benchmark "AVX512BW" + disable_cpu_feature "avx512bw" "-mno-avx512bw" + fi if have_cpu_feature avx2; then do_benchmark "AVX2" disable_cpu_feature "avx2" "-mno-avx2" diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh index b07f71a7..7b5f7465 100755 --- a/scripts/run_tests.sh +++ b/scripts/run_tests.sh @@ -142,7 +142,7 @@ build_and_run_tests() if ! [[ "$CFLAGS" =~ "-march=native" ]] && ! $quick; then case "$ARCH" in i386|x86_64) - features+=(zmm vpclmulqdq avx512vl avx512f + features+=(zmm vpclmulqdq avx512vl avx512bw avx512f avx2 avx bmi2 pclmulqdq sse2) ;; arm*|aarch*)