Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Try to work around broken toolchains with too-old binutils #389

Merged
merged 1 commit into from
Sep 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,55 @@ if(LIBDEFLATE_FREESTANDING)
add_definitions(-DFREESTANDING)
endif()

# Check for cases where the compiler supports an instruction set extension but
# the assembler does not, and in those cases print a warning and add an
# appropriate -DLIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_* flag. libdeflate's C
# source files already check the compiler version before using the corresponding
# intrinsics, but in the rare case of gcc being paired with a binutils much
# older than itself those checks are insufficient. There is no way to check the
# assembler version from C. The proper fix for too-old binutils is for the user
# to upgrade binutils. Unfortunately, as libdeflate has started using newer
# instructions, binutils incompatibilities have started being seen more
# frequently. Hence these checks for assembler support here in CMakeLists.txt
# to provide a fallback for users who may be unable to fix their toolchain.
# These don't solve the problem for users not using CMake, though such users can
# add specific -DLIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_* flags they need.
function(check_assembler_support feature assembly_code)
execute_process(COMMAND echo "${assembly_code}"
COMMAND ${CMAKE_C_COMPILER} -c -x assembler -o /dev/null -
RESULT_VARIABLE result
ERROR_QUIET)
if(NOT ${result} EQUAL 0)
add_definitions(-DLIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_${feature})
message(STATUS "Your gcc supports ${feature} instructions but it is paired with an assembler that does not. Upgrading binutils is recommended.")
endif()
endfunction()
if(UNIX AND CMAKE_C_COMPILER_ID STREQUAL "GNU")
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine
OUTPUT_VARIABLE machine)
if(${machine} MATCHES "^(x86_64|i[3-6]86)")
if(${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 8.1)
# Set LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_AVX512VNNI if needed.
check_assembler_support(AVX512_VNNI "vpdpbusd %zmm0, %zmm0, %zmm0")
# Set LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_VPCLMULQDQ if needed.
check_assembler_support(VPCLMULQDQ "vpclmulqdq $0, %zmm0, %zmm0, %zmm0")
endif()
if(${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.1)
# Set LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_AVX_VNNI if needed.
check_assembler_support(AVX_VNNI "{vex} vpdpbusd %ymm0, %ymm0, %ymm0")
endif()
elseif(${machine} MATCHES "^aarch64")
if(${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 8.1)
# Set LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_DOTPROD if needed.
check_assembler_support(DOTPROD ".arch armv8.2-a+dotprod\nudot v0.4s, v0.16b, v0.16b")
endif()
if(${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 9.1)
# Set LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_SHA3 if needed.
check_assembler_support(SHA3 ".arch armv8.2-a+sha3\neor3 v0.16b, v0.16b, v0.16b, v0.16b")
endif()
endif()
endif()

# Determine the list of source files and the list of compiler options that will
# be used for both the static library and the shared library.

Expand Down
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,19 @@ for release builds. `-O3` is fine too, but often `-O2` actually gives better
results. It's unnecessary to add flags such as `-mavx2` or `/arch:AVX2`, though
you can do so if you want to. Most of the relevant optimized functions are
built regardless of such flags, and appropriate ones are selected at runtime.
For the same reason, flags like `-mno-avx2` do *not* cause all code using the
corresponding instruction set extension to be omitted from the binary; this is
working as intended due to the use of runtime CPU feature detection.

If using gcc, your gcc should always be paired with a binutils version that is
not much older than itself, to avoid problems where the compiler generates
instructions the assembler cannot assemble. Usually systems have their gcc and
binutils paired properly, but rarely a mismatch can arise in cases such as the
user installing a newer gcc version without a proper binutils alongside it.
Since libdeflate v1.22, the CMake-based build system will detect incompatible
binutils versions and disable some optimized code accordingly. In older
versions of libdeflate, or if CMake is not being used, a too-old binutils can
cause build errors like "no such instruction" from the assembler.

# API

Expand Down
3 changes: 2 additions & 1 deletion lib/arm/adler32_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,8 @@ adler32_arm_neon(u32 adler, const u8 *p, size_t len)
#endif /* Regular NEON implementation */

/* NEON+dotprod implementation */
#if HAVE_DOTPROD_INTRIN && CPU_IS_LITTLE_ENDIAN()
#if HAVE_DOTPROD_INTRIN && CPU_IS_LITTLE_ENDIAN() && \
!defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_DOTPROD)
# define adler32_arm_neon_dotprod adler32_arm_neon_dotprod
# ifdef __clang__
# define ATTRIBUTES _target_attribute("dotprod")
Expand Down
3 changes: 2 additions & 1 deletion lib/arm/crc32_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -545,7 +545,8 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
* This like crc32_arm_pmullx12_crc(), but it adds the eor3 instruction (from
* the sha3 extension) for even better performance.
*/
#if HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN && HAVE_SHA3_INTRIN
#if HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN && HAVE_SHA3_INTRIN && \
!defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_SHA3)
# define crc32_arm_pmullx12_crc_eor3 crc32_arm_pmullx12_crc_eor3
# define SUFFIX _pmullx12_crc_eor3
# ifdef __clang__
Expand Down
6 changes: 4 additions & 2 deletions lib/x86/adler32_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@
* instead of gcc 11. (libdeflate supports direct compilation without a
* configure step, so checking the binutils version is not always an option.)
*/
#if GCC_PREREQ(12, 1) || CLANG_PREREQ(12, 0, 13000000) || MSVC_PREREQ(1930)
#if (GCC_PREREQ(12, 1) || CLANG_PREREQ(12, 0, 13000000) || MSVC_PREREQ(1930)) && \
!defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_AVX_VNNI)
# define adler32_x86_avx2_vnni adler32_x86_avx2_vnni
# define SUFFIX _avx2_vnni
# define ATTRIBUTES _target_attribute("avx2,avxvnni")
Expand All @@ -70,7 +71,8 @@
# include "adler32_template.h"
#endif

#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)
#if (GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)) && \
!defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_AVX512VNNI)
/*
* AVX512VNNI implementation using 256-bit vectors. This is very similar to the
* AVX-VNNI implementation but takes advantage of masking and more registers.
Expand Down
6 changes: 4 additions & 2 deletions lib/x86/crc32_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
* gcc 8.1 and 8.2 had a similar bug where they assumed that
* _mm256_clmulepi64_epi128() always needed AVX512. It's fixed in gcc 8.3.
*/
#if GCC_PREREQ(8, 3) || CLANG_PREREQ(6, 0, 10000000)
#if (GCC_PREREQ(8, 3) || CLANG_PREREQ(6, 0, 10000000)) && \
!defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_VPCLMULQDQ)
# define crc32_x86_vpclmulqdq_avx2 crc32_x86_vpclmulqdq_avx2
# define SUFFIX _vpclmulqdq_avx2
# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx2")
Expand All @@ -94,7 +95,8 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
# include "crc32_pclmul_template.h"
#endif

#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)
#if (GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)) && \
!defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_VPCLMULQDQ)
/*
* VPCLMULQDQ/AVX512 implementation using 256-bit vectors. This is very similar
* to the VPCLMULQDQ/AVX2 implementation but takes advantage of the vpternlog
Expand Down
Loading