From d4bd6b294cd4a2741a03e7a26a9621eeeeb1ae1f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 21 Feb 2024 00:17:43 -0800 Subject: [PATCH] lib/x86/adler32: add back an AVX-512BW implementation libdeflate used to (before commit 416bac37a0c4) have an AVX-512BW implementation of Adler-32, but I removed it due to AVX-512's downclocking issues. Since then, newer Intel and AMD CPUs have come out with better AVX-512 implementations. I also recently added AVX-512 implementations of CRC-32. An exclusion list is used to prevent 512-bit vectors from being used on older Intel CPUs. Therefore, add back an AVX-512BW implementation of Adler32. Unlike the original implementation, I went with the unpack-based approach, for consistency with the current adler32_avx2(). The new code is also MSVC-compatible. --- lib/x86/adler32_impl.h | 107 +++++++++++++++++++++++++++++++-- lib/x86/cpu_features.c | 3 + lib/x86/cpu_features.h | 19 +++++- scripts/checksum_benchmarks.sh | 4 ++ scripts/run_tests.sh | 2 +- 5 files changed, 128 insertions(+), 7 deletions(-) diff --git a/lib/x86/adler32_impl.h b/lib/x86/adler32_impl.h index 6285dc80..c018dc6f 100644 --- a/lib/x86/adler32_impl.h +++ b/lib/x86/adler32_impl.h @@ -67,6 +67,19 @@ ADLER32_FINISH_VEC_CHUNK_128((s1), (s2), s1_128bit, s2_128bit); \ } +#define ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2) \ +{ \ + __m256i /* __v8su */ s1_256bit, s2_256bit; \ + \ + /* 512 => 256 bits */ \ + s1_256bit = _mm256_add_epi32(_mm512_extracti64x4_epi64((v_s1), 0), \ + _mm512_extracti64x4_epi64((v_s1), 1)); \ + s2_256bit = _mm256_add_epi32(_mm512_extracti64x4_epi64((v_s2), 0), \ + _mm512_extracti64x4_epi64((v_s2), 1)); \ + \ + ADLER32_FINISH_VEC_CHUNK_256((s1), (s2), s1_256bit, s2_256bit); \ +} + /* * This is a very silly partial workaround for gcc bug * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892. 
The bug causes gcc to @@ -263,14 +276,101 @@ adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2) # include "../adler32_vec_template.h" #endif /* HAVE_AVX2_INTRIN */ -#if defined(adler32_avx2) && HAVE_AVX2_NATIVE -#define DEFAULT_IMPL adler32_avx2 -#else +/* AVX-512BW implementation */ +#if HAVE_AVX512BW_INTRIN +# define adler32_avx512bw adler32_avx512bw +# define FUNCNAME adler32_avx512bw +# define FUNCNAME_CHUNK adler32_avx512bw_chunk +# define IMPL_ALIGNMENT 64 +# define IMPL_SEGMENT_LEN 128 +# define IMPL_MAX_CHUNK_LEN (128 * (0x7FFF / 0xFF)) +# if HAVE_AVX512BW_NATIVE +# define ATTRIBUTES +# else +# define ATTRIBUTES _target_attribute("avx512bw") +# endif +# include <immintrin.h> + /* + * With clang in MSVC compatibility mode, immintrin.h incorrectly skips + * including some sub-headers. + */ +# if defined(__clang__) && defined(_MSC_VER) +# include <tmmintrin.h> +# include <smmintrin.h> +# include <wmmintrin.h> +# include <avxintrin.h> +# include <avx2intrin.h> +# include <avx512fintrin.h> +# include <avx512bwintrin.h> +# endif +static forceinline ATTRIBUTES void +adler32_avx512bw_chunk(const __m512i *p, const __m512i *const end, + u32 *s1, u32 *s2) +{ + static const u16 _aligned_attribute(64) mults[128] = { + 128, 127, 126, 125, 124, 123, 122, 121, 112, 111, 110, 109, 108, 107, 106, 105, + 96, 95, 94, 93, 92, 91, 90, 89, 80, 79, 78, 77, 76, 75, 74, 73, + + 120, 119, 118, 117, 116, 115, 114, 113, 104, 103, 102, 101, 100, 99, 98, 97, + 88, 87, 86, 85, 84, 83, 82, 81, 72, 71, 70, 69, 68, 67, 66, 65, + + 64, 63, 62, 61, 60, 59, 58, 57, 48, 47, 46, 45, 44, 43, 42, 41, + 32, 31, 30, 29, 28, 27, 26, 25, 16, 15, 14, 13, 12, 11, 10, 9, + + 56, 55, 54, 53, 52, 51, 50, 49, 40, 39, 38, 37, 36, 35, 34, 33, + 24, 23, 22, 21, 20, 19, 18, 17, 8, 7, 6, 5, 4, 3, 2, 1, + }; + const __m512i zeroes = _mm512_setzero_si512(); + const __m512i /* __v32hu */ mults_a = _mm512_loadu_si512(&mults[0]); + const __m512i /* __v32hu */ mults_b = _mm512_loadu_si512(&mults[32]); + const __m512i /* __v32hu */ mults_c = _mm512_loadu_si512(&mults[64]); + const __m512i /* __v32hu */ mults_d
= _mm512_loadu_si512(&mults[96]); + __m512i /* __v16su */ v_s1 = zeroes; + __m512i /* __v16su */ v_s2 = zeroes; + __m512i /* __v32hu */ v_byte_sums_a = zeroes; + __m512i /* __v32hu */ v_byte_sums_b = zeroes; + __m512i /* __v32hu */ v_byte_sums_c = zeroes; + __m512i /* __v32hu */ v_byte_sums_d = zeroes; + + do { + const __m512i bytes1 = *p++; + const __m512i bytes2 = *p++; + + v_s2 = _mm512_add_epi32(v_s2, v_s1); + v_s1 = _mm512_add_epi32(v_s1, _mm512_sad_epu8(bytes1, zeroes)); + v_s1 = _mm512_add_epi32(v_s1, _mm512_sad_epu8(bytes2, zeroes)); + v_byte_sums_a = _mm512_add_epi16( + v_byte_sums_a, _mm512_unpacklo_epi8(bytes1, zeroes)); + v_byte_sums_b = _mm512_add_epi16( + v_byte_sums_b, _mm512_unpackhi_epi8(bytes1, zeroes)); + v_byte_sums_c = _mm512_add_epi16( + v_byte_sums_c, _mm512_unpacklo_epi8(bytes2, zeroes)); + v_byte_sums_d = _mm512_add_epi16( + v_byte_sums_d, _mm512_unpackhi_epi8(bytes2, zeroes)); + + GCC_UPDATE_VARS(v_s1, v_s2, v_byte_sums_a, v_byte_sums_b, + v_byte_sums_c, v_byte_sums_d); + } while (p != end); + + v_s2 = _mm512_slli_epi32(v_s2, 7); + v_s2 = _mm512_add_epi32(v_s2, _mm512_madd_epi16(v_byte_sums_a, mults_a)); + v_s2 = _mm512_add_epi32(v_s2, _mm512_madd_epi16(v_byte_sums_b, mults_b)); + v_s2 = _mm512_add_epi32(v_s2, _mm512_madd_epi16(v_byte_sums_c, mults_c)); + v_s2 = _mm512_add_epi32(v_s2, _mm512_madd_epi16(v_byte_sums_d, mults_d)); + ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2); +} +# include "../adler32_vec_template.h" +#endif /* HAVE_AVX512BW_INTRIN */ + static inline adler32_func_t arch_select_adler32_func(void) { const u32 features MAYBE_UNUSED = get_x86_cpu_features(); +#ifdef adler32_avx512bw + if ((features & X86_CPU_FEATURE_ZMM) && HAVE_AVX512BW(features)) + return adler32_avx512bw; +#endif #ifdef adler32_avx2 if (HAVE_AVX2(features)) return adler32_avx2; @@ -282,6 +382,5 @@ arch_select_adler32_func(void) return NULL; } #define arch_select_adler32_func arch_select_adler32_func -#endif #endif /* LIB_X86_ADLER32_IMPL_H */ diff --git 
a/lib/x86/cpu_features.c b/lib/x86/cpu_features.c index ba7a39ff..193f3409 100644 --- a/lib/x86/cpu_features.c +++ b/lib/x86/cpu_features.c @@ -92,6 +92,7 @@ static const struct cpu_feature x86_cpu_feature_table[] = { {X86_CPU_FEATURE_BMI2, "bmi2"}, {X86_CPU_FEATURE_ZMM, "zmm"}, {X86_CPU_FEATURE_AVX512F, "avx512f"}, + {X86_CPU_FEATURE_AVX512BW, "avx512bw"}, {X86_CPU_FEATURE_AVX512VL, "avx512vl"}, {X86_CPU_FEATURE_VPCLMULQDQ, "vpclmulqdq"}, }; @@ -171,6 +172,8 @@ void libdeflate_init_x86_cpu_features(void) features |= X86_CPU_FEATURE_ZMM; if ((b & (1 << 16)) && ((xcr0 & 0xe6) == 0xe6)) features |= X86_CPU_FEATURE_AVX512F; + if ((b & (1 << 30)) && ((xcr0 & 0xe6) == 0xe6)) + features |= X86_CPU_FEATURE_AVX512BW; if ((b & (1U << 31)) && ((xcr0 & 0xe6) == 0xe6)) features |= X86_CPU_FEATURE_AVX512VL; if ((c & (1 << 10)) && ((xcr0 & 0x6) == 0x6)) diff --git a/lib/x86/cpu_features.h b/lib/x86/cpu_features.h index 6f68e032..852b16da 100644 --- a/lib/x86/cpu_features.h +++ b/lib/x86/cpu_features.h @@ -52,8 +52,9 @@ */ #define X86_CPU_FEATURE_ZMM 0x00000020 #define X86_CPU_FEATURE_AVX512F 0x00000040 -#define X86_CPU_FEATURE_AVX512VL 0x00000080 -#define X86_CPU_FEATURE_VPCLMULQDQ 0x00000100 +#define X86_CPU_FEATURE_AVX512BW 0x00000080 +#define X86_CPU_FEATURE_AVX512VL 0x00000100 +#define X86_CPU_FEATURE_VPCLMULQDQ 0x00000200 #define HAVE_SSE2(features) (HAVE_SSE2_NATIVE || ((features) & X86_CPU_FEATURE_SSE2)) #define HAVE_PCLMULQDQ(features) (HAVE_PCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_PCLMULQDQ)) @@ -61,6 +62,7 @@ #define HAVE_AVX2(features) (HAVE_AVX2_NATIVE || ((features) & X86_CPU_FEATURE_AVX2)) #define HAVE_BMI2(features) (HAVE_BMI2_NATIVE || ((features) & X86_CPU_FEATURE_BMI2)) #define HAVE_AVX512F(features) (HAVE_AVX512F_NATIVE || ((features) & X86_CPU_FEATURE_AVX512F)) +#define HAVE_AVX512BW(features) (HAVE_AVX512BW_NATIVE || ((features) & X86_CPU_FEATURE_AVX512BW)) #define HAVE_AVX512VL(features) (HAVE_AVX512VL_NATIVE || ((features) & 
X86_CPU_FEATURE_AVX512VL)) #define HAVE_VPCLMULQDQ(features) (HAVE_VPCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_VPCLMULQDQ)) @@ -182,6 +184,19 @@ static inline u32 get_x86_cpu_features(void) { return 0; } # define HAVE_AVX512F_INTRIN 0 #endif +/* AVX-512BW */ +#ifdef __AVX512BW__ +# define HAVE_AVX512BW_NATIVE 1 +#else +# define HAVE_AVX512BW_NATIVE 0 +#endif +#if HAVE_AVX512BW_NATIVE || GCC_PREREQ(5, 1) || CLANG_PREREQ(3, 9, 0) || \ + defined(_MSC_VER) +# define HAVE_AVX512BW_INTRIN 1 +#else +# define HAVE_AVX512BW_INTRIN 0 +#endif + /* AVX-512VL */ #ifdef __AVX512VL__ # define HAVE_AVX512VL_NATIVE 1 diff --git a/scripts/checksum_benchmarks.sh b/scripts/checksum_benchmarks.sh index b9c38698..eb1de5d5 100755 --- a/scripts/checksum_benchmarks.sh +++ b/scripts/checksum_benchmarks.sh @@ -157,6 +157,10 @@ echo { case $ARCH in i386|x86_64) + if have_cpu_feature avx512bw; then + do_benchmark "AVX512BW" + disable_cpu_feature "avx512bw" "-mno-avx512bw" + fi if have_cpu_feature avx2; then do_benchmark "AVX2" disable_cpu_feature "avx2" "-mno-avx2" diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh index b07f71a7..7b5f7465 100755 --- a/scripts/run_tests.sh +++ b/scripts/run_tests.sh @@ -142,7 +142,7 @@ build_and_run_tests() if ! [[ "$CFLAGS" =~ "-march=native" ]] && ! $quick; then case "$ARCH" in i386|x86_64) - features+=(zmm vpclmulqdq avx512vl avx512f + features+=(zmm vpclmulqdq avx512vl avx512bw avx512f avx2 avx bmi2 pclmulqdq sse2) ;; arm*|aarch*)