Skip to content

Commit

Permalink
lib/x86: disambiguate 512-bit vector from AVX-512F
Browse files Browse the repository at this point in the history
crc32_x86_vpclmulqdq_avx512vl and crc32_x86_vpclmulqdq_avx512f_avx512vl
actually use the same CPU features, considering that vpternlog always
requires at least avx512f, and compilers consider avx512vl to imply
avx512f.  Rename them to *_avx512_vl256 and *_avx512_vl512 to reflect
that they differ only in vector length, and fix the CPU feature checking
to use a separate flag for whether 512-bit vectors are enabled.
  • Loading branch information
ebiggers committed Feb 24, 2024
1 parent 2929aea commit ea028f1
Show file tree
Hide file tree
Showing 6 changed files with 47 additions and 36 deletions.
5 changes: 4 additions & 1 deletion lib/x86/cpu_features.c
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ static const struct cpu_feature x86_cpu_feature_table[] = {
{X86_CPU_FEATURE_AVX, "avx"},
{X86_CPU_FEATURE_AVX2, "avx2"},
{X86_CPU_FEATURE_BMI2, "bmi2"},
{X86_CPU_FEATURE_ZMM, "zmm"},
{X86_CPU_FEATURE_AVX512F, "avx512f"},
{X86_CPU_FEATURE_AVX512VL, "avx512vl"},
{X86_CPU_FEATURE_VPCLMULQDQ, "vpclmulqdq"},
Expand Down Expand Up @@ -165,8 +166,10 @@ void libdeflate_init_x86_cpu_features(void)
features |= X86_CPU_FEATURE_AVX2;
if (b & (1 << 8))
features |= X86_CPU_FEATURE_BMI2;
if ((b & (1 << 16)) && ((xcr0 & 0xe6) == 0xe6) &&
if (((xcr0 & 0xe6) == 0xe6) &&
allow_512bit_vectors(manufacturer, family, model))
features |= X86_CPU_FEATURE_ZMM;
if ((b & (1 << 16)) && ((xcr0 & 0xe6) == 0xe6))
features |= X86_CPU_FEATURE_AVX512F;
if ((b & (1U << 31)) && ((xcr0 & 0xa6) == 0xa6))
features |= X86_CPU_FEATURE_AVX512VL;
Expand Down
13 changes: 10 additions & 3 deletions lib/x86/cpu_features.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,16 @@
#define X86_CPU_FEATURE_AVX 0x00000004
#define X86_CPU_FEATURE_AVX2 0x00000008
#define X86_CPU_FEATURE_BMI2 0x00000010
#define X86_CPU_FEATURE_AVX512F 0x00000020
#define X86_CPU_FEATURE_AVX512VL 0x00000040
#define X86_CPU_FEATURE_VPCLMULQDQ 0x00000080
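/*
* ZMM indicates whether 512-bit vectors (zmm registers) should be used. On
* some CPUs, to avoid downclocking issues we don't set ZMM even if the CPU
* supports it, i.e. even if AVX512F is set. On these CPUs, we may still use
* AVX-512 instructions, but only with ymm and xmm registers. Note that ZMM
* by itself does not indicate that any AVX-512 instructions are supported; it
* must be checked together with the relevant instruction set flags.
*/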
#define X86_CPU_FEATURE_ZMM 0x00000020
#define X86_CPU_FEATURE_AVX512F 0x00000040
#define X86_CPU_FEATURE_AVX512VL 0x00000080
#define X86_CPU_FEATURE_VPCLMULQDQ 0x00000100

#define HAVE_SSE2(features) (HAVE_SSE2_NATIVE || ((features) & X86_CPU_FEATURE_SSE2))
#define HAVE_PCLMULQDQ(features) (HAVE_PCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_PCLMULQDQ))
Expand Down
39 changes: 21 additions & 18 deletions lib/x86/crc32_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,14 +95,16 @@
#endif

/*
* VPCLMULQDQ/AVX512VL implementation. This takes advantage of some AVX-512
* instructions but uses 256-bit vectors rather than 512-bit. This can be
* useful on CPUs where 512-bit vectors cause downclocking.
* VPCLMULQDQ/AVX512 implementation with 256-bit vectors. This takes advantage
* of some AVX-512 instructions but uses 256-bit vectors rather than 512-bit.
* This can be useful on CPUs where 512-bit vectors cause downclocking.
*/
#if HAVE_VPCLMULQDQ_INTRIN && HAVE_PCLMULQDQ_INTRIN && HAVE_AVX512VL_INTRIN
# define crc32_x86_vpclmulqdq_avx512vl crc32_x86_vpclmulqdq_avx512vl
# define SUFFIX _vpclmulqdq_avx512vl
# if HAVE_VPCLMULQDQ_NATIVE && HAVE_PCLMULQDQ_NATIVE && HAVE_AVX512VL_NATIVE
#if HAVE_VPCLMULQDQ_INTRIN && HAVE_PCLMULQDQ_INTRIN && \
HAVE_AVX512F_INTRIN && HAVE_AVX512VL_INTRIN
# define crc32_x86_vpclmulqdq_avx512_vl256 crc32_x86_vpclmulqdq_avx512_vl256
# define SUFFIX _vpclmulqdq_avx512_vl256
# if HAVE_VPCLMULQDQ_NATIVE && HAVE_PCLMULQDQ_NATIVE && \
HAVE_AVX512F_NATIVE && HAVE_AVX512VL_NATIVE
# define ATTRIBUTES
# else
# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl")
Expand All @@ -113,16 +115,16 @@
# include "crc32_pclmul_template.h"
#endif

/* VPCLMULQDQ/AVX512F/AVX512VL implementation. Uses 512-bit vectors. */
/* VPCLMULQDQ/AVX512 implementation with 512-bit vectors */
#if HAVE_VPCLMULQDQ_INTRIN && HAVE_PCLMULQDQ_INTRIN && \
HAVE_AVX512F_INTRIN && HAVE_AVX512VL_INTRIN
# define crc32_x86_vpclmulqdq_avx512f_avx512vl crc32_x86_vpclmulqdq_avx512f_avx512vl
# define SUFFIX _vpclmulqdq_avx512f_avx512vl
#if HAVE_VPCLMULQDQ_NATIVE && HAVE_PCLMULQDQ_NATIVE && \
# define crc32_x86_vpclmulqdq_avx512_vl512 crc32_x86_vpclmulqdq_avx512_vl512
# define SUFFIX _vpclmulqdq_avx512_vl512
# if HAVE_VPCLMULQDQ_NATIVE && HAVE_PCLMULQDQ_NATIVE && \
HAVE_AVX512F_NATIVE && HAVE_AVX512VL_NATIVE
# define ATTRIBUTES
# else
# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512f,avx512vl")
# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl")
# endif
# define VL 64
# define FOLD_LESSTHAN16BYTES 1
Expand All @@ -136,15 +138,16 @@ arch_select_crc32_func(void)
{
const u32 features MAYBE_UNUSED = get_x86_cpu_features();

#ifdef crc32_x86_vpclmulqdq_avx512f_avx512vl
if (HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) &&
#ifdef crc32_x86_vpclmulqdq_avx512_vl512
if ((features & X86_CPU_FEATURE_ZMM) &&
HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) &&
HAVE_AVX512F(features) && HAVE_AVX512VL(features))
return crc32_x86_vpclmulqdq_avx512f_avx512vl;
return crc32_x86_vpclmulqdq_avx512_vl512;
#endif
#ifdef crc32_x86_vpclmulqdq_avx512vl
#ifdef crc32_x86_vpclmulqdq_avx512_vl256
if (HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) &&
HAVE_AVX512VL(features))
return crc32_x86_vpclmulqdq_avx512vl;
HAVE_AVX512F(features) && HAVE_AVX512VL(features))
return crc32_x86_vpclmulqdq_avx512_vl256;
#endif
#ifdef crc32_x86_vpclmulqdq_avx2
if (HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) &&
Expand Down
2 changes: 1 addition & 1 deletion lib/x86/crc32_pclmul_template.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
* VL=16 && FOLD_LESSTHAN16BYTES=1: at least pclmul,sse4.1
* VL=32 && USE_TERNARYLOGIC=0: at least vpclmulqdq,pclmul,avx2
* VL=32 && USE_TERNARYLOGIC=1: at least vpclmulqdq,pclmul,avx512vl
* VL=64: at least vpclmulqdq,pclmul,avx512f,avx512vl
* VL=64: at least vpclmulqdq,pclmul,avx512vl
* VL:
* Vector length in bytes. Supported values are 16, 32, and 64.
* FOLD_LESSTHAN16BYTES:
Expand Down
22 changes: 10 additions & 12 deletions scripts/checksum_benchmarks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -67,12 +67,11 @@ sort_by_speed() {
}

disable_cpu_feature() {
local name="$1"
LIBDEFLATE_DISABLE_CPU_FEATURES+=",$1"
shift
local extra_cflags=("$@")

LIBDEFLATE_DISABLE_CPU_FEATURES+=",$name"
EXTRA_CFLAGS+=("${extra_cflags[@]}")
if (( $# > 0 )); then
EXTRA_CFLAGS+=("$@")
fi
}

cleanup() {
Expand Down Expand Up @@ -114,15 +113,14 @@ export LIBDEFLATE_DISABLE_CPU_FEATURES=""
{
case $ARCH in
i386|x86_64)
if have_cpu_features vpclmulqdq avx512f avx512vl; then
do_benchmark "VPCLMULQDQ/AVX512F/AVX512VL"
disable_cpu_feature "avx512f" "-mno-avx512f"
fi
if have_cpu_features vpclmulqdq avx512vl; then
do_benchmark "VPCLMULQDQ/AVX512VL"
if have_cpu_features vpclmulqdq pclmulqdq avx512f avx512vl; then
do_benchmark "VPCLMULQDQ/AVX512/VL512"
disable_cpu_feature "zmm"
do_benchmark "VPCLMULQDQ/AVX512/VL256"
disable_cpu_feature "avx512vl" "-mno-avx512vl"
disable_cpu_feature "avx512f" "-mno-avx512f"
fi
if have_cpu_features vpclmulqdq avx2; then
if have_cpu_features vpclmulqdq pclmulqdq avx2; then
do_benchmark "VPCLMULQDQ/AVX2"
disable_cpu_feature "vpclmulqdq" "-mno-vpclmulqdq"
fi
Expand Down
2 changes: 1 addition & 1 deletion scripts/run_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ build_and_run_tests()
if ! [[ "$CFLAGS" =~ "-march=native" ]] && ! $quick; then
case "$ARCH" in
i386|x86_64)
features+=(vpclmulqdq avx512vl avx512f
features+=(zmm vpclmulqdq avx512vl avx512f
avx2 avx bmi2 pclmulqdq sse2)
;;
arm*|aarch*)
Expand Down

0 comments on commit ea028f1

Please sign in to comment.