Skip to content

Commit

Permalink
lib/x86/adler32: add an AVX-512 implementation
Browse files Browse the repository at this point in the history
libdeflate used to (before commit 416bac3) have an AVX512BW
implementation of Adler-32, but I removed it due to AVX-512's
downclocking issues.  Since then, newer Intel and AMD CPUs have come out
with better AVX-512 implementations, and these CPUs tend to have
AVX512VNNI which includes a dot product instruction which is useful for
Adler-32.  Therefore, add an AVX512VNNI/AVX512BW implementation.
  • Loading branch information
ebiggers committed Feb 24, 2024
1 parent 5b217a1 commit a026a04
Show file tree
Hide file tree
Showing 5 changed files with 163 additions and 31 deletions.
141 changes: 116 additions & 25 deletions lib/x86/adler32_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,19 @@
ADLER32_FINISH_VEC_CHUNK_128((s1), (s2), s1_128bit, s2_128bit); \
}

#define ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2) \
{ \
__m256i /* __v8su */ s1_256bit, s2_256bit; \
\
/* 512 => 256 bits */ \
s1_256bit = _mm256_add_epi32(_mm512_extracti64x4_epi64((v_s1), 0), \
_mm512_extracti64x4_epi64((v_s1), 1)); \
s2_256bit = _mm256_add_epi32(_mm512_extracti64x4_epi64((v_s2), 0), \
_mm512_extracti64x4_epi64((v_s2), 1)); \
\
ADLER32_FINISH_VEC_CHUNK_256((s1), (s2), s1_256bit, s2_256bit); \
}

/*
* This is a very silly partial workaround for gcc bug
* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892. The bug causes gcc to
Expand Down Expand Up @@ -105,15 +118,17 @@
static forceinline ATTRIBUTES void
adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2)
{
static const u16 _aligned_attribute(16) mults[4][16] = {
{ 32, 31, 30, 29, 28, 27, 26, 25 },
{ 24, 23, 22, 21, 20, 19, 18, 17 },
{ 16, 15, 14, 13, 12, 11, 10, 9 },
{ 8, 7, 6, 5, 4, 3, 2, 1 },
};
const __m128i /* __v8hu */ mults_a = _mm_load_si128((const __m128i *)mults[0]);
const __m128i /* __v8hu */ mults_b = _mm_load_si128((const __m128i *)mults[1]);
const __m128i /* __v8hu */ mults_c = _mm_load_si128((const __m128i *)mults[2]);
const __m128i /* __v8hu */ mults_d = _mm_load_si128((const __m128i *)mults[3]);
const __m128i zeroes = _mm_setzero_si128();
const __m128i /* __v8hu */ mults_a =
_mm_setr_epi16(32, 31, 30, 29, 28, 27, 26, 25);
const __m128i /* __v8hu */ mults_b =
_mm_setr_epi16(24, 23, 22, 21, 20, 19, 18, 17);
const __m128i /* __v8hu */ mults_c =
_mm_setr_epi16(16, 15, 14, 13, 12, 11, 10, 9);
const __m128i /* __v8hu */ mults_d =
_mm_setr_epi16(8, 7, 6, 5, 4, 3, 2, 1);

/* s1 counters: 32-bit, sum of bytes */
__m128i /* __v4su */ v_s1 = zeroes;
Expand Down Expand Up @@ -209,23 +224,21 @@ adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2)
static forceinline ATTRIBUTES void
adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
{
const __m256i zeroes = _mm256_setzero_si256();
/*
* Note, the multipliers have to be in this order because
* _mm256_unpack{lo,hi}_epi8 work on each 128-bit lane separately.
*/
const __m256i /* __v16hu */ mults_a =
_mm256_setr_epi16(64, 63, 62, 61, 60, 59, 58, 57,
48, 47, 46, 45, 44, 43, 42, 41);
const __m256i /* __v16hu */ mults_b =
_mm256_setr_epi16(56, 55, 54, 53, 52, 51, 50, 49,
40, 39, 38, 37, 36, 35, 34, 33);
const __m256i /* __v16hu */ mults_c =
_mm256_setr_epi16(32, 31, 30, 29, 28, 27, 26, 25,
16, 15, 14, 13, 12, 11, 10, 9);
const __m256i /* __v16hu */ mults_d =
_mm256_setr_epi16(24, 23, 22, 21, 20, 19, 18, 17,
8, 7, 6, 5, 4, 3, 2, 1);
static const u16 _aligned_attribute(32) mults[4][16] = {
{ 64, 63, 62, 61, 60, 59, 58, 57, 48, 47, 46, 45, 44, 43, 42, 41 },
{ 56, 55, 54, 53, 52, 51, 50, 49, 40, 39, 38, 37, 36, 35, 34, 33 },
{ 32, 31, 30, 29, 28, 27, 26, 25, 16, 15, 14, 13, 12, 11, 10, 9 },
{ 24, 23, 22, 21, 20, 19, 18, 17, 8, 7, 6, 5, 4, 3, 2, 1 },
};
const __m256i /* __v16hu */ mults_a = _mm256_load_si256((const __m256i *)mults[0]);
const __m256i /* __v16hu */ mults_b = _mm256_load_si256((const __m256i *)mults[1]);
const __m256i /* __v16hu */ mults_c = _mm256_load_si256((const __m256i *)mults[2]);
const __m256i /* __v16hu */ mults_d = _mm256_load_si256((const __m256i *)mults[3]);
const __m256i zeroes = _mm256_setzero_si256();
__m256i /* __v8su */ v_s1 = zeroes;
__m256i /* __v8su */ v_s2 = zeroes;
__m256i /* __v16hu */ v_byte_sums_a = zeroes;
Expand Down Expand Up @@ -263,14 +276,93 @@ adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
# include "../adler32_vec_template.h"
#endif /* HAVE_AVX2_INTRIN */

#if defined(adler32_avx2) && HAVE_AVX2_NATIVE
#define DEFAULT_IMPL adler32_avx2
#else
/*
* AVX512VNNI/AVX512BW implementation. Uses the dot product instruction
* vpdpbusd from AVX512VNNI and the vpsadbw instruction from AVX512BW.
* (vpdpbusd with ones could be used instead of vpsadbw, but it's slower.)
*
* We currently don't include an implementation using AVX512VNNI or AVX512BW
* alone, since CPUs with good AVX-512 implementations tend to have both.
*/
#if HAVE_AVX512VNNI_INTRIN && HAVE_AVX512BW_INTRIN
# define adler32_avx512vnni_avx512bw adler32_avx512vnni_avx512bw
# define FUNCNAME adler32_avx512vnni_avx512bw
# define FUNCNAME_CHUNK adler32_avx512vnni_avx512bw_chunk
# define IMPL_ALIGNMENT 64
# define IMPL_SEGMENT_LEN 128
# define IMPL_MAX_CHUNK_LEN (128 * (0xFFFF / 0xFF))
# if HAVE_AVX512VNNI_NATIVE && HAVE_AVX512BW_NATIVE
# define ATTRIBUTES
# else
# define ATTRIBUTES _target_attribute("avx512vnni,avx512bw")
# endif
# include <immintrin.h>
/*
* With clang in MSVC compatibility mode, immintrin.h incorrectly skips
* including some sub-headers.
*/
# if defined(__clang__) && defined(_MSC_VER)
# include <tmmintrin.h>
# include <smmintrin.h>
# include <wmmintrin.h>
# include <avxintrin.h>
# include <avx2intrin.h>
# include <avx512fintrin.h>
# include <avx512bwintrin.h>
# include <avx512vnniintrin.h>
# endif
static forceinline ATTRIBUTES void
adler32_avx512vnni_avx512bw_chunk(const __m512i *p, const __m512i *const end,
u32 *s1, u32 *s2)
{
static const u8 _aligned_attribute(64) mults[64] = {
64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33,
32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
};
const __m512i /* __v64qu */ mults_a = _mm512_load_si512((const __m512i *)mults);
const __m512i zeroes = _mm512_setzero_si512();
__m512i /* __v16su */ v_s1 = zeroes;
__m512i /* __v16su */ v_s1_sums_a = zeroes;
__m512i /* __v16su */ v_s1_sums_b = zeroes;
__m512i /* __v16su */ v_s2_a = zeroes;
__m512i /* __v16su */ v_s2_b = zeroes;

do {
const __m512i bytes_a = *p++;
const __m512i bytes_b = *p++;
__m512i v_sad_a, v_sad_b;

v_s2_a = _mm512_dpbusd_epi32(v_s2_a, bytes_a, mults_a);
v_s2_b = _mm512_dpbusd_epi32(v_s2_b, bytes_b, mults_a);
v_sad_a = _mm512_sad_epu8(bytes_a, zeroes);
v_sad_b = _mm512_sad_epu8(bytes_b, zeroes);
v_s1_sums_a = _mm512_add_epi32(v_s1_sums_a, v_s1);
v_s1 = _mm512_add_epi32(v_s1, v_sad_a);
v_s1_sums_b = _mm512_add_epi32(v_s1_sums_b, v_s1);
v_s1 = _mm512_add_epi32(v_s1, v_sad_b);
} while (p != end);

v_s1_sums_a = _mm512_add_epi32(v_s1_sums_a, v_s1_sums_b);
v_s1_sums_a = _mm512_slli_epi32(v_s1_sums_a, 6);
v_s2_a = _mm512_add_epi32(v_s2_a, v_s2_b);
v_s2_a = _mm512_add_epi32(v_s2_a, v_s1_sums_a);
ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2_a);
}
# include "../adler32_vec_template.h"
#endif /* HAVE_AVX512VNNI_INTRIN && HAVE_AVX512BW_INTRIN */

static inline adler32_func_t
arch_select_adler32_func(void)
{
const u32 features MAYBE_UNUSED = get_x86_cpu_features();

#ifdef adler32_avx512vnni_avx512bw
if ((features & X86_CPU_FEATURE_ZMM) &&
HAVE_AVX512VNNI(features) && HAVE_AVX512BW(features))
return adler32_avx512vnni_avx512bw;
#endif
#ifdef adler32_avx2
if (HAVE_AVX2(features))
return adler32_avx2;
Expand All @@ -282,6 +374,5 @@ arch_select_adler32_func(void)
return NULL;
}
#define arch_select_adler32_func arch_select_adler32_func
#endif

#endif /* LIB_X86_ADLER32_IMPL_H */
6 changes: 6 additions & 0 deletions lib/x86/cpu_features.c
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,10 @@ static const struct cpu_feature x86_cpu_feature_table[] = {
{X86_CPU_FEATURE_BMI2, "bmi2"},
{X86_CPU_FEATURE_ZMM, "zmm"},
{X86_CPU_FEATURE_AVX512F, "avx512f"},
{X86_CPU_FEATURE_AVX512BW, "avx512bw"},
{X86_CPU_FEATURE_AVX512VL, "avx512vl"},
{X86_CPU_FEATURE_VPCLMULQDQ, "vpclmulqdq"},
{X86_CPU_FEATURE_AVX512VNNI, "avx512vnni"},
};

volatile u32 libdeflate_x86_cpu_features = 0;
Expand Down Expand Up @@ -171,10 +173,14 @@ void libdeflate_init_x86_cpu_features(void)
features |= X86_CPU_FEATURE_ZMM;
if ((b & (1 << 16)) && ((xcr0 & 0xe6) == 0xe6))
features |= X86_CPU_FEATURE_AVX512F;
if ((b & (1 << 30)) && ((xcr0 & 0xe6) == 0xe6))
features |= X86_CPU_FEATURE_AVX512BW;
if ((b & (1U << 31)) && ((xcr0 & 0xe6) == 0xe6))
features |= X86_CPU_FEATURE_AVX512VL;
if ((c & (1 << 10)) && ((xcr0 & 0x6) == 0x6))
features |= X86_CPU_FEATURE_VPCLMULQDQ;
if ((c & (1 << 11)) && ((xcr0 & 0xe6) == 0xe6))
features |= X86_CPU_FEATURE_AVX512VNNI;

out:
disable_cpu_features_for_testing(&features, x86_cpu_feature_table,
Expand Down
38 changes: 34 additions & 4 deletions lib/x86/cpu_features.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,17 +52,21 @@
*/
#define X86_CPU_FEATURE_ZMM 0x00000020
#define X86_CPU_FEATURE_AVX512F 0x00000040
#define X86_CPU_FEATURE_AVX512VL 0x00000080
#define X86_CPU_FEATURE_VPCLMULQDQ 0x00000100
#define X86_CPU_FEATURE_AVX512BW 0x00000080
#define X86_CPU_FEATURE_AVX512VL 0x00000100
#define X86_CPU_FEATURE_VPCLMULQDQ 0x00000200
#define X86_CPU_FEATURE_AVX512VNNI 0x00000400

#define HAVE_SSE2(features) (HAVE_SSE2_NATIVE || ((features) & X86_CPU_FEATURE_SSE2))
#define HAVE_PCLMULQDQ(features) (HAVE_PCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_PCLMULQDQ))
#define HAVE_AVX(features) (HAVE_AVX_NATIVE || ((features) & X86_CPU_FEATURE_AVX))
#define HAVE_AVX2(features) (HAVE_AVX2_NATIVE || ((features) & X86_CPU_FEATURE_AVX2))
#define HAVE_BMI2(features) (HAVE_BMI2_NATIVE || ((features) & X86_CPU_FEATURE_BMI2))
#define HAVE_AVX512F(features) (HAVE_AVX512F_NATIVE || ((features) & X86_CPU_FEATURE_AVX512F))
#define HAVE_AVX512BW(features) (HAVE_AVX512BW_NATIVE || ((features) & X86_CPU_FEATURE_AVX512BW))
#define HAVE_AVX512VL(features) (HAVE_AVX512VL_NATIVE || ((features) & X86_CPU_FEATURE_AVX512VL))
#define HAVE_VPCLMULQDQ(features) (HAVE_VPCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_VPCLMULQDQ))
#define HAVE_AVX512VNNI(features) (HAVE_AVX512VNNI_NATIVE || ((features) & X86_CPU_FEATURE_AVX512VNNI))

#if HAVE_DYNAMIC_X86_CPU_FEATURES
#define X86_CPU_FEATURES_KNOWN 0x80000000
Expand Down Expand Up @@ -182,6 +186,19 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
# define HAVE_AVX512F_INTRIN 0
#endif

/* AVX-512BW */
#ifdef __AVX512BW__
# define HAVE_AVX512BW_NATIVE 1
#else
# define HAVE_AVX512BW_NATIVE 0
#endif
#if HAVE_AVX512BW_NATIVE || GCC_PREREQ(5, 1) || CLANG_PREREQ(3, 9, 0) || \
defined(_MSC_VER)
# define HAVE_AVX512BW_INTRIN 1
#else
# define HAVE_AVX512BW_INTRIN 0
#endif

/* AVX-512VL */
#ifdef __AVX512VL__
# define HAVE_AVX512VL_NATIVE 1
Expand All @@ -201,13 +218,26 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
#else
# define HAVE_VPCLMULQDQ_NATIVE 0
#endif
#if HAVE_VPCLMULQDQ_NATIVE || (GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0) || \
defined(_MSC_VER))
#if HAVE_VPCLMULQDQ_NATIVE || GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0) || \
defined(_MSC_VER)
# define HAVE_VPCLMULQDQ_INTRIN 1
#else
# define HAVE_VPCLMULQDQ_INTRIN 0
#endif

/* AVX512VNNI */
#ifdef __AVX512VNNI__
# define HAVE_AVX512VNNI_NATIVE 1
#else
# define HAVE_AVX512VNNI_NATIVE 0
#endif
#if HAVE_AVX512VNNI_NATIVE || GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0) || \
defined(_MSC_VER)
# define HAVE_AVX512VNNI_INTRIN 1
#else
# define HAVE_AVX512VNNI_INTRIN 0
#endif

#endif /* ARCH_X86_32 || ARCH_X86_64 */

#endif /* LIB_X86_CPU_FEATURES_H */
5 changes: 5 additions & 0 deletions scripts/checksum_benchmarks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,11 @@ echo
{
case $ARCH in
i386|x86_64)
if have_cpu_features avx512_vnni avx512bw; then
do_benchmark "AVX512VNNI/AVX512BW"
disable_cpu_feature "avx512vnni" "-mno-avx512vnni"
disable_cpu_feature "avx512bw" "-mno-avx512bw"
fi
if have_cpu_feature avx2; then
do_benchmark "AVX2"
disable_cpu_feature "avx2" "-mno-avx2"
Expand Down
4 changes: 2 additions & 2 deletions scripts/run_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,8 @@ build_and_run_tests()
if ! [[ "$CFLAGS" =~ "-march=native" ]] && ! $quick; then
case "$ARCH" in
i386|x86_64)
features+=(zmm vpclmulqdq avx512vl avx512f
avx2 avx bmi2 pclmulqdq sse2)
features+=(zmm avx512vnni vpclmulqdq avx512vl avx512bw
avx512f avx2 avx bmi2 pclmulqdq sse2)
;;
arm*|aarch*)
features+=(dotprod sha3 crc32 pmull neon)
Expand Down

0 comments on commit a026a04

Please sign in to comment.