From a32774467a9acee4a8e3f0c9bae365d6706bbef7 Mon Sep 17 00:00:00 2001
From: Eric Biggers
Date: Wed, 21 Feb 2024 22:47:06 -0800
Subject: [PATCH] lib/x86/adler32: add an AVX-512 implementation

libdeflate used to (before commit 416bac37a0c4) have an AVX512BW
implementation of Adler-32, but I removed it due to AVX-512's
downclocking issues.  Since then, newer Intel and AMD CPUs have come out
with better AVX-512 implementations, and these CPUs tend to have
AVX512VNNI, which includes a dot product instruction that is useful for
Adler-32.  Therefore, add an AVX512VNNI/AVX512BW implementation.
---
 lib/x86/adler32_impl.h         | 103 +++++++++++++++++++++++++++++++--
 lib/x86/cpu_features.c         |   6 ++
 lib/x86/cpu_features.h         |  34 ++++++++++-
 scripts/checksum_benchmarks.sh |   5 ++
 scripts/run_tests.sh           |   4 +-
 5 files changed, 144 insertions(+), 8 deletions(-)

diff --git a/lib/x86/adler32_impl.h b/lib/x86/adler32_impl.h
index 6285dc80..79888ae4 100644
--- a/lib/x86/adler32_impl.h
+++ b/lib/x86/adler32_impl.h
@@ -67,6 +67,19 @@
 	ADLER32_FINISH_VEC_CHUNK_128((s1), (s2), s1_128bit, s2_128bit);	\
 }
 
+#define ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2)		    \
+{									    \
+	__m256i /* __v8su */ s1_256bit, s2_256bit;			    \
+									    \
+	/* 512 => 256 bits */						    \
+	s1_256bit = _mm256_add_epi32(_mm512_extracti64x4_epi64((v_s1), 0),  \
+				     _mm512_extracti64x4_epi64((v_s1), 1)); \
+	s2_256bit = _mm256_add_epi32(_mm512_extracti64x4_epi64((v_s2), 0),  \
+				     _mm512_extracti64x4_epi64((v_s2), 1)); \
+									    \
+	ADLER32_FINISH_VEC_CHUNK_256((s1), (s2), s1_256bit, s2_256bit);	    \
+}
+
 /*
  * This is a very silly partial workaround for gcc bug
  * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892.  The bug causes gcc to
@@ -263,14 +276,97 @@ adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
 # include "../adler32_vec_template.h"
 #endif /* HAVE_AVX2_INTRIN */
 
-#if defined(adler32_avx2) && HAVE_AVX2_NATIVE
-#define DEFAULT_IMPL	adler32_avx2
-#else
+/*
+ * AVX512VNNI/AVX512BW implementation.  Uses the dot product instruction
+ * vpdpbusd from AVX512VNNI and the vpsadbw instruction from AVX512BW.
+ * (vpdpbusd with ones could be used instead of vpsadbw, but it's slower.)
+ *
+ * We currently don't include an implementation using AVX512VNNI or AVX512BW
+ * alone, since CPUs with good AVX-512 implementations tend to have both.
+ */
+#if HAVE_AVX512VNNI_INTRIN && HAVE_AVX512BW_INTRIN
+# define adler32_avx512vnni_avx512bw	adler32_avx512vnni_avx512bw
+# define FUNCNAME	adler32_avx512vnni_avx512bw
+# define FUNCNAME_CHUNK	adler32_avx512vnni_avx512bw_chunk
+# define IMPL_ALIGNMENT	64
+# define IMPL_SEGMENT_LEN	128
+# define IMPL_MAX_CHUNK_LEN	(128 * (0x7FFF / 0xFF))
+# if HAVE_AVX512VNNI_NATIVE && HAVE_AVX512BW_NATIVE
+#  define ATTRIBUTES
+# else
+#  define ATTRIBUTES	_target_attribute("avx512vnni,avx512bw")
+# endif
+# include <immintrin.h>
+  /*
+   * With clang in MSVC compatibility mode, immintrin.h incorrectly skips
+   * including some sub-headers.
+   */
+# if defined(__clang__) && defined(_MSC_VER)
+#  include <tmmintrin.h>
+#  include <smmintrin.h>
+#  include <wmmintrin.h>
+#  include <avxintrin.h>
+#  include <avx2intrin.h>
+#  include <avx512fintrin.h>
+#  include <avx512bwintrin.h>
+#  include <avx512vnniintrin.h>
+# endif
+static forceinline ATTRIBUTES void
+adler32_avx512vnni_avx512bw_chunk(const __m512i *p, const __m512i *const end,
+				  u32 *s1, u32 *s2)
+{
+	static const u8 _aligned_attribute(64) raw_mults[64] = {
+		64, 63, 62, 61, 60, 59, 58, 57,
+		56, 55, 54, 53, 52, 51, 50, 49,
+		48, 47, 46, 45, 44, 43, 42, 41,
+		40, 39, 38, 37, 36, 35, 34, 33,
+		32, 31, 30, 29, 28, 27, 26, 25,
+		24, 23, 22, 21, 20, 19, 18, 17,
+		16, 15, 14, 13, 12, 11, 10, 9,
+		8, 7, 6, 5, 4, 3, 2, 1,
+	};
+	const __m512i zeroes = _mm512_setzero_si512();
+	const __m512i /* __v64qu */ mults = _mm512_loadu_si512(raw_mults);
+	__m512i /* __v16su */ v_s1 = zeroes;
+	__m512i /* __v16su */ v_s1_sums_a = zeroes;
+	__m512i /* __v16su */ v_s1_sums_b = zeroes;
+	__m512i /* __v16su */ v_s2_a = zeroes;
+	__m512i /* __v16su */ v_s2_b = zeroes;
+
+	do {
+		const __m512i bytes_a = *p++;
+		const __m512i bytes_b = *p++;
+		__m512i v_sad_a, v_sad_b;
+
+		v_s2_a = _mm512_dpbusd_epi32(v_s2_a, bytes_a, mults);
+		v_s2_b = _mm512_dpbusd_epi32(v_s2_b, bytes_b, mults);
+		v_sad_a = _mm512_sad_epu8(bytes_a, zeroes);
+		v_sad_b = _mm512_sad_epu8(bytes_b, zeroes);
+		v_s1_sums_a = _mm512_add_epi32(v_s1_sums_a, v_s1);
+		v_s1 = _mm512_add_epi32(v_s1, v_sad_a);
+		v_s1_sums_b = _mm512_add_epi32(v_s1_sums_b, v_s1);
+		v_s1 = _mm512_add_epi32(v_s1, v_sad_b);
+	} while (p != end);
+
+	v_s1_sums_a = _mm512_add_epi32(v_s1_sums_a, v_s1_sums_b);
+	v_s1_sums_a = _mm512_slli_epi32(v_s1_sums_a, 6);
+	v_s2_a = _mm512_add_epi32(v_s2_a, v_s2_b);
+	v_s2_a = _mm512_add_epi32(v_s2_a, v_s1_sums_a);
+	ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2_a);
+}
+# include "../adler32_vec_template.h"
+#endif /* HAVE_AVX512VNNI_INTRIN && HAVE_AVX512BW_INTRIN */
+
 static inline adler32_func_t
 arch_select_adler32_func(void)
 {
 	const u32 features MAYBE_UNUSED = get_x86_cpu_features();
 
+#ifdef adler32_avx512vnni_avx512bw
+	if ((features & X86_CPU_FEATURE_ZMM) &&
+	    HAVE_AVX512VNNI(features) && HAVE_AVX512BW(features))
+		return adler32_avx512vnni_avx512bw;
+#endif
 #ifdef adler32_avx2
 	if (HAVE_AVX2(features))
 		return adler32_avx2;
@@ -282,6 +378,5 @@ arch_select_adler32_func(void)
 	return NULL;
 }
 #define arch_select_adler32_func	arch_select_adler32_func
-#endif
 
 #endif /* LIB_X86_ADLER32_IMPL_H */
diff --git a/lib/x86/cpu_features.c b/lib/x86/cpu_features.c
index ba7a39ff..750bf66c 100644
--- a/lib/x86/cpu_features.c
+++ b/lib/x86/cpu_features.c
@@ -92,8 +92,10 @@ static const struct cpu_feature x86_cpu_feature_table[] = {
 	{X86_CPU_FEATURE_BMI2, "bmi2"},
 	{X86_CPU_FEATURE_ZMM, "zmm"},
 	{X86_CPU_FEATURE_AVX512F, "avx512f"},
+	{X86_CPU_FEATURE_AVX512BW, "avx512bw"},
 	{X86_CPU_FEATURE_AVX512VL, "avx512vl"},
 	{X86_CPU_FEATURE_VPCLMULQDQ, "vpclmulqdq"},
+	{X86_CPU_FEATURE_AVX512VNNI, "avx512vnni"},
 };
 
 volatile u32 libdeflate_x86_cpu_features = 0;
@@ -171,10 +173,14 @@ void libdeflate_init_x86_cpu_features(void)
 			features |= X86_CPU_FEATURE_ZMM;
 		if ((b & (1 << 16)) && ((xcr0 & 0xe6) == 0xe6))
 			features |= X86_CPU_FEATURE_AVX512F;
+		if ((b & (1 << 30)) && ((xcr0 & 0xe6) == 0xe6))
+			features |= X86_CPU_FEATURE_AVX512BW;
 		if ((b & (1U << 31)) && ((xcr0 & 0xe6) == 0xe6))
 			features |= X86_CPU_FEATURE_AVX512VL;
 		if ((c & (1 << 10)) && ((xcr0 & 0x6) == 0x6))
 			features |= X86_CPU_FEATURE_VPCLMULQDQ;
+		if ((c & (1 << 11)) && ((xcr0 & 0xe6) == 0xe6))
+			features |= X86_CPU_FEATURE_AVX512VNNI;
 
 out:
 	disable_cpu_features_for_testing(&features, x86_cpu_feature_table,
diff --git a/lib/x86/cpu_features.h b/lib/x86/cpu_features.h
index 6f68e032..bd725416 100644
--- a/lib/x86/cpu_features.h
+++ b/lib/x86/cpu_features.h
@@ -52,8 +52,10 @@
  */
 #define X86_CPU_FEATURE_ZMM		0x00000020
 #define X86_CPU_FEATURE_AVX512F		0x00000040
-#define X86_CPU_FEATURE_AVX512VL	0x00000080
-#define X86_CPU_FEATURE_VPCLMULQDQ	0x00000100
+#define X86_CPU_FEATURE_AVX512BW	0x00000080
+#define X86_CPU_FEATURE_AVX512VL	0x00000100
+#define X86_CPU_FEATURE_VPCLMULQDQ	0x00000200
+#define X86_CPU_FEATURE_AVX512VNNI	0x00000400
 
 #define HAVE_SSE2(features)	(HAVE_SSE2_NATIVE || ((features) & X86_CPU_FEATURE_SSE2))
 #define HAVE_PCLMULQDQ(features) (HAVE_PCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_PCLMULQDQ))
@@ -61,8 +63,10 @@
 #define HAVE_AVX2(features)	(HAVE_AVX2_NATIVE || ((features) & X86_CPU_FEATURE_AVX2))
 #define HAVE_BMI2(features)	(HAVE_BMI2_NATIVE || ((features) & X86_CPU_FEATURE_BMI2))
 #define HAVE_AVX512F(features)	(HAVE_AVX512F_NATIVE || ((features) & X86_CPU_FEATURE_AVX512F))
+#define HAVE_AVX512BW(features)	(HAVE_AVX512BW_NATIVE || ((features) & X86_CPU_FEATURE_AVX512BW))
 #define HAVE_AVX512VL(features)	(HAVE_AVX512VL_NATIVE || ((features) & X86_CPU_FEATURE_AVX512VL))
 #define HAVE_VPCLMULQDQ(features) (HAVE_VPCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_VPCLMULQDQ))
+#define HAVE_AVX512VNNI(features) (HAVE_AVX512VNNI_NATIVE || ((features) & X86_CPU_FEATURE_AVX512VNNI))
 
 #if HAVE_DYNAMIC_X86_CPU_FEATURES
 #define X86_CPU_FEATURES_KNOWN		0x80000000
@@ -182,6 +186,19 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
 # define HAVE_AVX512F_INTRIN	0
 #endif
 
+/* AVX-512BW */
+#ifdef __AVX512BW__
+# define HAVE_AVX512BW_NATIVE	1
+#else
+# define HAVE_AVX512BW_NATIVE	0
+#endif
+#if HAVE_AVX512BW_NATIVE || GCC_PREREQ(5, 1) || CLANG_PREREQ(3, 9, 0) || \
+	defined(_MSC_VER)
+# define HAVE_AVX512BW_INTRIN	1
+#else
+# define HAVE_AVX512BW_INTRIN	0
+#endif
+
 /* AVX-512VL */
 #ifdef __AVX512VL__
 # define HAVE_AVX512VL_NATIVE	1
@@ -208,6 +225,19 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
 # define HAVE_VPCLMULQDQ_INTRIN	0
 #endif
 
+/* AVX512VNNI */
+#ifdef __AVX512VNNI__
+# define HAVE_AVX512VNNI_NATIVE	1
+#else
+# define HAVE_AVX512VNNI_NATIVE	0
+#endif
+#if HAVE_AVX512VNNI_NATIVE || (GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0) || \
+	defined(_MSC_VER))
+# define HAVE_AVX512VNNI_INTRIN	1
+#else
+# define HAVE_AVX512VNNI_INTRIN	0
+#endif
+
 #endif /* ARCH_X86_32 || ARCH_X86_64 */
 
 #endif /* LIB_X86_CPU_FEATURES_H */
diff --git a/scripts/checksum_benchmarks.sh b/scripts/checksum_benchmarks.sh
index b9c38698..8833b08a 100755
--- a/scripts/checksum_benchmarks.sh
+++ b/scripts/checksum_benchmarks.sh
@@ -157,6 +157,11 @@ echo
 {
 case $ARCH in
 i386|x86_64)
+	if have_cpu_feature avx512vnni avx512bw; then
+		do_benchmark "AVX512VNNI/AVX512BW"
+		disable_cpu_feature "avx512bw" "-mno-avx512bw"
+		disable_cpu_feature "avx512vnni" "-mno-avx512vnni"
+	fi
 	if have_cpu_feature avx2; then
 		do_benchmark "AVX2"
 		disable_cpu_feature "avx2" "-mno-avx2"
diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh
index b07f71a7..9bca9677 100755
--- a/scripts/run_tests.sh
+++ b/scripts/run_tests.sh
@@ -142,8 +142,8 @@ build_and_run_tests()
 	if ! [[ "$CFLAGS" =~ "-march=native" ]] && ! $quick; then
 		case "$ARCH" in
 		i386|x86_64)
-			features+=(zmm vpclmulqdq avx512vl avx512f
-				   avx2 avx bmi2 pclmulqdq sse2)
+			features+=(zmm avx512vnni vpclmulqdq avx512vl avx512bw
+				   avx512f avx2 avx bmi2 pclmulqdq sse2)
 			;;
 		arm*|aarch*)
 			features+=(dotprod sha3 crc32 pmull neon)
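
For reference, the vectorized chunk loop above implements the standard
Adler-32 block recurrence.  Consuming one 64-byte vector starting from
running sums (s1, s2) is equivalent to:

	s2 += 64*s1 + sum of (64 - i) * p[i],  for i = 0..63
	s1 += sum of p[i]

which is why mults holds {64, 63, ..., 1}, why v_s1_sums accumulates a copy
of v_s1 before each vector is folded in, and why that accumulator is shifted
left by 6 (multiplied by 64) at the end.  Below is a minimal scalar sketch of
this per-block step.  It is illustrative only and not part of the patch: the
helper name is made up, and the deferred modular reduction by 65521 (the
chunk length is capped via IMPL_MAX_CHUNK_LEN so the 32-bit accumulators
cannot overflow before the caller reduces) is omitted.

#include <stdint.h>

/* Illustrative scalar model of one 64-byte Adler-32 block (hypothetical
 * helper, not part of the patch). */
static void
adler32_block64_sketch(const uint8_t p[64], uint32_t *s1, uint32_t *s2)
{
	uint32_t byte_sum = 0;	/* what vpsadbw accumulates into v_s1 */
	uint32_t dot = 0;	/* what vpdpbusd accumulates into v_s2 */
	int i;

	for (i = 0; i < 64; i++) {
		byte_sum += p[i];
		dot += (uint32_t)(64 - i) * p[i];	/* weights 64..1 */
	}
	*s2 += (*s1 << 6) + dot;	/* 64*s1 term, cf. the final shift by 6 */
	*s1 += byte_sum;
}

Interleaving two such blocks per loop iteration (the _a/_b pairs above) gives
the CPU two independent dependency chains to execute in parallel.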