diff --git a/lib/x86/cpu_features.c b/lib/x86/cpu_features.c
index 56601764..fa04d646 100644
--- a/lib/x86/cpu_features.c
+++ b/lib/x86/cpu_features.c
@@ -82,7 +82,6 @@ static const struct cpu_feature x86_cpu_feature_table[] = {
 	{X86_CPU_FEATURE_AVX2, "avx2"},
 	{X86_CPU_FEATURE_BMI2, "bmi2"},
 	{X86_CPU_FEATURE_ZMM, "zmm"},
-	{X86_CPU_FEATURE_AVX512F, "avx512f"},
 	{X86_CPU_FEATURE_AVX512BW, "avx512bw"},
 	{X86_CPU_FEATURE_AVX512VL, "avx512vl"},
 	{X86_CPU_FEATURE_VPCLMULQDQ, "vpclmulqdq"},
@@ -163,8 +162,6 @@ void libdeflate_init_x86_cpu_features(void)
 		if (((xcr0 & 0xe6) == 0xe6) &&
 		    allow_512bit_vectors(manufacturer, family, model))
 			features |= X86_CPU_FEATURE_ZMM;
-		if ((b & (1 << 16)) && ((xcr0 & 0xe6) == 0xe6))
-			features |= X86_CPU_FEATURE_AVX512F;
 		if ((b & (1 << 30)) && ((xcr0 & 0xe6) == 0xe6))
 			features |= X86_CPU_FEATURE_AVX512BW;
 		if ((b & (1U << 31)) && ((xcr0 & 0xe6) == 0xe6))
diff --git a/lib/x86/cpu_features.h b/lib/x86/cpu_features.h
index d5d3f2ac..8dda21fd 100644
--- a/lib/x86/cpu_features.h
+++ b/lib/x86/cpu_features.h
@@ -39,17 +39,16 @@
 #define X86_CPU_FEATURE_BMI2 (1 << 4)
 /*
  * ZMM indicates whether 512-bit vectors (zmm registers) should be used. On
- * some CPUs, to avoid downclocking issues we don't set ZMM even if the CPU
- * supports it, i.e. even if AVX512F is set. On these CPUs, we may still use
- * AVX-512 instructions, but only with ymm and xmm registers.
+ * some CPUs, to avoid downclocking issues we don't set ZMM even if the CPU and
+ * operating system support AVX-512. On these CPUs, we may still use AVX-512
+ * instructions, but only with xmm and ymm registers.
  */
 #define X86_CPU_FEATURE_ZMM (1 << 5)
-#define X86_CPU_FEATURE_AVX512F (1 << 6)
-#define X86_CPU_FEATURE_AVX512BW (1 << 7)
-#define X86_CPU_FEATURE_AVX512VL (1 << 8)
-#define X86_CPU_FEATURE_VPCLMULQDQ (1 << 9)
-#define X86_CPU_FEATURE_AVX512VNNI (1 << 10)
-#define X86_CPU_FEATURE_AVXVNNI (1 << 11)
+#define X86_CPU_FEATURE_AVX512BW (1 << 6)
+#define X86_CPU_FEATURE_AVX512VL (1 << 7)
+#define X86_CPU_FEATURE_VPCLMULQDQ (1 << 8)
+#define X86_CPU_FEATURE_AVX512VNNI (1 << 9)
+#define X86_CPU_FEATURE_AVXVNNI (1 << 10)
 
 #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
 /* Runtime x86 CPU feature detection is supported. */
@@ -135,12 +134,6 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
 # define HAVE_BMI2_NATIVE 0
 #endif
 
-#ifdef __AVX512F__
-# define HAVE_AVX512F(features) 1
-#else
-# define HAVE_AVX512F(features) ((features) & X86_CPU_FEATURE_AVX512F)
-#endif
-
 #ifdef __AVX512BW__
 # define HAVE_AVX512BW(features) 1
 #else
diff --git a/lib/x86/crc32_impl.h b/lib/x86/crc32_impl.h
index 8b23b904..462e6d88 100644
--- a/lib/x86/crc32_impl.h
+++ b/lib/x86/crc32_impl.h
@@ -30,6 +30,19 @@
 
 #include "cpu_features.h"
 
+/*
+ * pshufb(x, shift_tab[len..len+15]) left shifts x by 16-len bytes.
+ * pshufb(x, shift_tab[len+16..len+31]) right shifts x by len bytes.
+ */
+static const u8 MAYBE_UNUSED shift_tab[48] = {
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+};
+
 #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
 /* PCLMULQDQ implementation */
 # define crc32_x86_pclmulqdq crc32_x86_pclmulqdq
@@ -88,7 +101,7 @@
  */
 # define crc32_x86_vpclmulqdq_avx512_vl256 crc32_x86_vpclmulqdq_avx512_vl256
 # define SUFFIX _vpclmulqdq_avx512_vl256
-# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl")
+# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl")
 # define VL 32
 # define USE_SSE4_1 1
 # define USE_AVX512 1
@@ -101,7 +114,7 @@
  */
 # define crc32_x86_vpclmulqdq_avx512_vl512 crc32_x86_vpclmulqdq_avx512_vl512
 # define SUFFIX _vpclmulqdq_avx512_vl512
-# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl")
+# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl")
 # define VL 64
 # define USE_SSE4_1 1
 # define USE_AVX512 1
@@ -116,12 +129,12 @@ arch_select_crc32_func(void)
 #ifdef crc32_x86_vpclmulqdq_avx512_vl512
 	if ((features & X86_CPU_FEATURE_ZMM) &&
 	    HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) &&
-	    HAVE_AVX512F(features) && HAVE_AVX512VL(features))
+	    HAVE_AVX512BW(features) && HAVE_AVX512VL(features))
 		return crc32_x86_vpclmulqdq_avx512_vl512;
 #endif
 #ifdef crc32_x86_vpclmulqdq_avx512_vl256
 	if (HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) &&
-	    HAVE_AVX512F(features) && HAVE_AVX512VL(features))
+	    HAVE_AVX512BW(features) && HAVE_AVX512VL(features))
 		return crc32_x86_vpclmulqdq_avx512_vl256;
 #endif
 #ifdef crc32_x86_vpclmulqdq_avx2
diff --git a/lib/x86/crc32_pclmul_template.h b/lib/x86/crc32_pclmul_template.h
index bb892d82..0fe38f8a 100644
--- a/lib/x86/crc32_pclmul_template.h
+++ b/lib/x86/crc32_pclmul_template.h
@@ -37,8 +37,8 @@
  *	VL=16 && USE_SSE4_1=0 && USE_AVX512=0: at least pclmul
  *	VL=16 && USE_SSE4_1=1 && USE_AVX512=0: at least pclmul,sse4.1
  *	VL=32 && USE_SSE4_1=1 && USE_AVX512=0: at least vpclmulqdq,pclmul,avx2
- *	VL=32 && USE_SSE4_1=1 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512vl
- *	VL=64 && USE_SSE4_1=1 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512vl
+ *	VL=32 && USE_SSE4_1=1 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
+ *	VL=64 && USE_SSE4_1=1 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
 *	(Other combinations are not useful and have not been tested.)
 * VL:
 *	Vector length in bytes. Must be 16, 32, or 64.
@@ -162,18 +162,6 @@ static forceinline ATTRIBUTES __m128i
 ADD_SUFFIX(fold_lessthan16bytes)(__m128i x, const u8 *p, size_t len,
 				 __m128i /* __v2du */ mults_128b)
 {
-	/*
-	 * pshufb(x, shift_tab[len..len+15]) left shifts x by 16-len bytes.
-	 * pshufb(x, shift_tab[len+16..len+31]) right shifts x by len bytes.
-	 */
-	static const u8 shift_tab[48] = {
-		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
-		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-	};
 	__m128i lshift = _mm_loadu_si128((const void *)&shift_tab[len]);
 	__m128i rshift = _mm_loadu_si128((const void *)&shift_tab[len + 16]);
 	__m128i x0, x1;
@@ -216,70 +204,77 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
 	const __m128i barrett_reduction_constants =
 		_mm_set_epi64x(CRC32_BARRETT_CONSTANT_2, CRC32_BARRETT_CONSTANT_1);
 	vec_t v0, v1, v2, v3, v4, v5, v6, v7;
-	__m128i x0, x1;
-
-	/*
-	 * There are two overall code paths. The first path supports all
-	 * lengths, but is intended for short lengths; it uses unaligned loads
-	 * and does at most 4-way folds. The second path only supports longer
-	 * lengths, aligns the pointer in order to do aligned loads, and does up
-	 * to 8-way folds. The length check below decides which path to take.
-	 */
-	if (len < 64*VL) {
-		if (len < VL)
-			return crc32_slice1(crc, p, len);
-
-		v0 = VXOR(VLOADU(p), M128I_TO_VEC(_mm_cvtsi32_si128(crc)));
-		p += VL;
+	__m128i x0 = _mm_cvtsi32_si128(crc);
+	__m128i x1;
 
-		if (len >= 4*VL) {
-			v1 = VLOADU(p + 0*VL);
-			v2 = VLOADU(p + 1*VL);
-			v3 = VLOADU(p + 2*VL);
-			p += 3*VL;
-			while (len >= 8*VL) {
-				v0 = fold_vec(v0, VLOADU(p + 0*VL), mults_4v);
-				v1 = fold_vec(v1, VLOADU(p + 1*VL), mults_4v);
-				v2 = fold_vec(v2, VLOADU(p + 2*VL), mults_4v);
-				v3 = fold_vec(v3, VLOADU(p + 3*VL), mults_4v);
-				p += 4*VL;
-				len -= 4*VL;
+	if (len < 8*VL) {
+		if (len < VL) {
+			STATIC_ASSERT(VL == 16 || VL == 32 || VL == 64);
+			if (len < 16) {
+	#if USE_AVX512
+				if (len < 4)
+					return crc32_slice1(crc, p, len);
+				/*
+				 * Handle 4 <= len <= 15 bytes by doing a masked
+				 * load, XOR'ing the current CRC with the first
+				 * 4 bytes, left-shifting by '16 - len' bytes to
+				 * align the result to the end of x0 (so that it
+				 * becomes the low-order coefficients of a
+				 * 128-bit polynomial), and then doing the usual
+				 * reduction from 128 bits to 32 bits.
+				 */
+				x0 = _mm_xor_si128(
+					x0, _mm_maskz_loadu_epi8((1 << len) - 1, p));
+				x0 = _mm_shuffle_epi8(
+					x0, _mm_loadu_si128((const void *)&shift_tab[len]));
+				goto reduce_x0;
+	#else
+				return crc32_slice1(crc, p, len);
+	#endif
 			}
-			v0 = fold_vec(v0, v2, mults_2v);
-			v1 = fold_vec(v1, v3, mults_2v);
-			if (len & (2*VL)) {
-				v0 = fold_vec(v0, VLOADU(p + 0*VL), mults_2v);
-				v1 = fold_vec(v1, VLOADU(p + 1*VL), mults_2v);
-				p += 2*VL;
-			}
-			v0 = fold_vec(v0, v1, mults_1v);
-			if (len & VL) {
-				v0 = fold_vec(v0, VLOADU(p), mults_1v);
-				p += VL;
-			}
-		} else {
-			if (len >= 2*VL) {
-				v0 = fold_vec(v0, VLOADU(p), mults_1v);
-				p += VL;
-				if (len >= 3*VL) {
-					v0 = fold_vec(v0, VLOADU(p), mults_1v);
-					p += VL;
-				}
-			}
+			/*
+			 * Handle 16 <= len < VL bytes where VL is 32 or 64.
+			 * Use 128-bit instructions so that these lengths aren't
+			 * slower with VL > 16 than with VL=16.
+			 */
+			x0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), x0);
+			if (len >= 32) {
+				x0 = fold_vec128(x0, _mm_loadu_si128((const void *)(p + 16)),
+						 mults_128b);
+				if (len >= 48)
+					x0 = fold_vec128(x0, _mm_loadu_si128((const void *)(p + 32)),
+							 mults_128b);
+			}
+			p += len & ~15;
+			goto less_than_16_remaining;
 		}
+		v0 = VXOR(VLOADU(p), M128I_TO_VEC(x0));
+		if (len < 2*VL) {
+			p += VL;
+			goto less_than_vl_remaining;
+		}
+		v1 = VLOADU(p + 1*VL);
+		if (len < 4*VL) {
+			p += 2*VL;
+			goto less_than_2vl_remaining;
+		}
+		v2 = VLOADU(p + 2*VL);
+		v3 = VLOADU(p + 3*VL);
+		p += 4*VL;
 	} else {
-		size_t align = -(uintptr_t)p & (VL-1);
-		const vec_t *vp;
+		/*
+		 * If the length is large and the pointer is misaligned, align
+		 * it. For smaller lengths, just take the misaligned load
+		 * penalty. Note that on recent x86 CPUs, vmovdqu with an
+		 * aligned address is just as fast as vmovdqa, so there's no
+		 * need to use vmovdqa in the main loop.
+		 */
+		if (len > 65536 && ((uintptr_t)p & (VL-1))) {
+			size_t align = -(uintptr_t)p & (VL-1);
 
-		/* Align p to the next VL-byte boundary. */
-		if (align == 0) {
-			vp = (const vec_t *)p;
-			v0 = VXOR(*vp++, M128I_TO_VEC(_mm_cvtsi32_si128(crc)));
-		} else {
 			len -= align;
 #if USE_SSE4_1
-			x0 = _mm_xor_si128(_mm_loadu_si128((const void *)p),
-					   _mm_cvtsi32_si128(crc));
+			x0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), x0);
 			p += 16;
 			if (align & 15) {
 				x0 = fold_lessthan16bytes(x0, p, align & 15,
@@ -287,7 +282,7 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
 				p += align & 15;
 				align &= ~15;
 			}
-			while (align >= 16) {
+			while (align) {
 				x0 = fold_vec128(x0, *(const __m128i *)p,
 						 mults_128b);
 				p += 16;
@@ -296,66 +291,75 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
 			v0 = M128I_TO_VEC(x0);
 # if VL == 32
 			v0 = _mm256_inserti128_si256(v0, *(const __m128i *)p, 1);
-			p += 16;
 # elif VL == 64
 			v0 = _mm512_inserti32x4(v0, *(const __m128i *)p, 1);
-			p += 16;
-			v0 = _mm512_inserti64x4(v0, *(const __m256i *)p, 1);
-			p += 32;
+			v0 = _mm512_inserti64x4(v0, *(const __m256i *)(p + 16), 1);
 # endif
-			vp = (const vec_t *)p;
+			p -= 16;
 #else
 			crc = crc32_slice1(crc, p, align);
 			p += align;
-			vp = (const vec_t *)p;
-			v0 = VXOR(*vp++, M128I_TO_VEC(_mm_cvtsi32_si128(crc)));
+			v0 = VXOR(VLOADU(p), M128I_TO_VEC(_mm_cvtsi32_si128(crc)));
 #endif
+		} else {
+			v0 = VXOR(VLOADU(p), M128I_TO_VEC(x0));
 		}
-		v1 = *vp++;
-		v2 = *vp++;
-		v3 = *vp++;
-		v4 = *vp++;
-		v5 = *vp++;
-		v6 = *vp++;
-		v7 = *vp++;
-		do {
-			v0 = fold_vec(v0, *vp++, mults_8v);
-			v1 = fold_vec(v1, *vp++, mults_8v);
-			v2 = fold_vec(v2, *vp++, mults_8v);
-			v3 = fold_vec(v3, *vp++, mults_8v);
-			v4 = fold_vec(v4, *vp++, mults_8v);
-			v5 = fold_vec(v5, *vp++, mults_8v);
-			v6 = fold_vec(v6, *vp++, mults_8v);
-			v7 = fold_vec(v7, *vp++, mults_8v);
-			len -= 8*VL;
-		} while (len >= 16*VL);
+		v1 = VLOADU(p + 1*VL);
+		v2 = VLOADU(p + 2*VL);
+		v3 = VLOADU(p + 3*VL);
+		v4 = VLOADU(p + 4*VL);
+		v5 = VLOADU(p + 5*VL);
+		v6 = VLOADU(p + 6*VL);
+		v7 = VLOADU(p + 7*VL);
+		p += 8*VL;
 
 		/*
-		 * Reduce v0-v7 (length 8*VL bytes) to v0 (length VL bytes)
-		 * and fold in any VL-byte data segments that remain.
+		 * This is the main loop, processing 8*VL bytes per iteration.
+		 * 4*VL is usually enough and would result in smaller code, but
+		 * Skylake and Cascade Lake need 8*VL to get full performance.
 		 */
+		while (len >= 16*VL) {
+			v0 = fold_vec(v0, VLOADU(p + 0*VL), mults_8v);
+			v1 = fold_vec(v1, VLOADU(p + 1*VL), mults_8v);
+			v2 = fold_vec(v2, VLOADU(p + 2*VL), mults_8v);
+			v3 = fold_vec(v3, VLOADU(p + 3*VL), mults_8v);
+			v4 = fold_vec(v4, VLOADU(p + 4*VL), mults_8v);
+			v5 = fold_vec(v5, VLOADU(p + 5*VL), mults_8v);
+			v6 = fold_vec(v6, VLOADU(p + 6*VL), mults_8v);
+			v7 = fold_vec(v7, VLOADU(p + 7*VL), mults_8v);
+			p += 8*VL;
+			len -= 8*VL;
+		}
+
+		/* Fewer than 8*VL bytes remain. */
 		v0 = fold_vec(v0, v4, mults_4v);
 		v1 = fold_vec(v1, v5, mults_4v);
 		v2 = fold_vec(v2, v6, mults_4v);
 		v3 = fold_vec(v3, v7, mults_4v);
 		if (len & (4*VL)) {
-			v0 = fold_vec(v0, *vp++, mults_4v);
-			v1 = fold_vec(v1, *vp++, mults_4v);
-			v2 = fold_vec(v2, *vp++, mults_4v);
-			v3 = fold_vec(v3, *vp++, mults_4v);
-		}
-		v0 = fold_vec(v0, v2, mults_2v);
-		v1 = fold_vec(v1, v3, mults_2v);
-		if (len & (2*VL)) {
-			v0 = fold_vec(v0, *vp++, mults_2v);
-			v1 = fold_vec(v1, *vp++, mults_2v);
+			v0 = fold_vec(v0, VLOADU(p + 0*VL), mults_4v);
+			v1 = fold_vec(v1, VLOADU(p + 1*VL), mults_4v);
+			v2 = fold_vec(v2, VLOADU(p + 2*VL), mults_4v);
+			v3 = fold_vec(v3, VLOADU(p + 3*VL), mults_4v);
+			p += 4*VL;
 		}
-		v0 = fold_vec(v0, v1, mults_1v);
-		if (len & VL)
-			v0 = fold_vec(v0, *vp++, mults_1v);
-		p = (const u8 *)vp;
 	}
-
+	/* Fewer than 4*VL bytes remain. */
+	v0 = fold_vec(v0, v2, mults_2v);
+	v1 = fold_vec(v1, v3, mults_2v);
+	if (len & (2*VL)) {
+		v0 = fold_vec(v0, VLOADU(p + 0*VL), mults_2v);
+		v1 = fold_vec(v1, VLOADU(p + 1*VL), mults_2v);
+		p += 2*VL;
+	}
+less_than_2vl_remaining:
+	/* Fewer than 2*VL bytes remain. */
+	v0 = fold_vec(v0, v1, mults_1v);
+	if (len & VL) {
+		v0 = fold_vec(v0, VLOADU(p), mults_1v);
+		p += VL;
+	}
+less_than_vl_remaining:
 	/*
 	 * Fewer than VL bytes remain. Reduce v0 (length VL bytes) to x0
 	 * (length 16 bytes) and fold in any 16-byte data segments that remain.
@@ -388,6 +392,7 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
 		p += 16;
 	}
 #endif
+less_than_16_remaining:
 	len &= 15;
 
 	/*
@@ -397,6 +402,9 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
 #if USE_SSE4_1
 	if (len)
 		x0 = fold_lessthan16bytes(x0, p, len, mults_128b);
+#endif
+#if USE_AVX512
+reduce_x0:
 #endif
 
 	/*
diff --git a/scripts/checksum_benchmarks.sh b/scripts/checksum_benchmarks.sh
index 12fb405f..94f281aa 100755
--- a/scripts/checksum_benchmarks.sh
+++ b/scripts/checksum_benchmarks.sh
@@ -153,12 +153,12 @@ export LIBDEFLATE_DISABLE_CPU_FEATURES=""
 {
 case $ARCH in
 i386|x86_64)
-	if have_cpu_features vpclmulqdq pclmulqdq avx512f avx512vl; then
+	if have_cpu_features vpclmulqdq pclmulqdq avx512bw avx512vl; then
 		do_benchmark "VPCLMULQDQ/AVX512/VL512"
 		disable_cpu_feature zmm
 		do_benchmark "VPCLMULQDQ/AVX512/VL256"
 		disable_cpu_feature avx512vl "-mno-avx512vl"
-		disable_cpu_feature avx512f "-mno-avx512f"
+		disable_cpu_feature avx512bw "-mno-avx512bw"
 	fi
 	if have_cpu_features vpclmulqdq pclmulqdq avx2; then
 		do_benchmark "VPCLMULQDQ/AVX2"