Skip to content

Commit

Permalink
AVX512 and VPCLMULQDQ based CRC-32 and CRC-32C
Browse files Browse the repository at this point in the history
This implementation is based on crc32_refl_by16_vclmul_avx512
in https://github.com/intel/intel-ipsec-mb/ with some optimizations.

Changes to CMakeLists.txt and source/intel/asm/crc32c_sse42_asm.c
are based on #72.

This also fixes a bug in aws_checksums_crc32c_hw() when 128-bit pclmul
is not available. crc_intrin_fn was being invoked on bytes instead
of 32-bit or 64-bit words. The aws-checksums-tests suite was extended to cover
all SIMD implementations.

Note: The availability of the Intel CRC-32C instructions is checked
as part of testing AWS_CPU_FEATURE_SSE_4_2. Both ISA extensions were
introduced in the Intel Nehalem microarchitecture.

For compiling this, https://github.com/awslabs/aws-c-common must be
installed and CMAKE_MODULE_PATH must point to it, e.g.:
cmake -DCMAKE_MODULE_PATH=/usr/local/lib/cmake.

AWS_CPU_FEATURE_AVX512 currently checks only for AVX512F and not the
other features that this implementation depends on:
AVX512VL, AVX512BW, AVX512DQ. According to
https://en.wikipedia.org/wiki/AVX-512#CPUs_with_AVX-512
there currently exist no CPUs that would support VPCLMULQDQ without
supporting all those AVX512 features.

The architecture target evex512 is something that was introduced as
mandatory in GCC 14 and clang 18 as part of introducing the AVX10.1-512
target, which basically is a new name for a number of AVX512 features.
Older compilers do not recognize this target, but they do emit EVEX
encoded instructions.
  • Loading branch information
dr-m committed Jul 8, 2024
1 parent 785e1b5 commit 8733d2f
Show file tree
Hide file tree
Showing 9 changed files with 609 additions and 51 deletions.
52 changes: 44 additions & 8 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -58,17 +58,48 @@ file(GLOB AWS_ARCH_SRC
)

if (USE_CPU_EXTENSIONS)
if(AWS_ARCH_INTEL)
# First, check if inline assembly is available. Inline assembly can also be supported by MSVC if the compiler in use is Clang.
if(AWS_HAVE_GCC_INLINE_ASM)
file(GLOB AWS_ARCH_SRC
"source/intel/asm/*.c"
if (AWS_ARCH_INTEL)
file (GLOB AWS_ARCH_INTEL_SRC
"source/intel/*.c"
)

if (AWS_HAVE_AVX512_INTRINSICS)
if (MSVC)
file(GLOB AWS_ARCH_INTRIN_SRC
"source/intel/intrin/*.c"
"source/intel/visualc/*.c"
)
elseif (MSVC)
file(GLOB AWS_ARCH_SRC
else()
file(GLOB AWS_ARCH_INTRIN_SRC
"source/intel/intrin/*.c"
)
endif()
else()
if (MSVC)
file(GLOB AWS_ARCH_INTRIN_SRC
"source/intel/visualc/*.c"
)
endif()
endif()

source_group("Source Files\\intel" FILES ${AWS_ARCH_INTEL_SRC})
source_group("Source Files\\intel\\intrin" FILES ${AWS_ARCH_INTRIN_SRC})

if (AWS_HAVE_GCC_INLINE_ASM)
file(GLOB AWS_ARCH_ASM_SRC
"source/intel/asm/*.c"
)

file(GLOB AWS_ARCH_SRC
${AWS_ARCH_INTEL_SRC}
${AWS_ARCH_INTRIN_SRC}
${AWS_ARCH_ASM_SRC}
)
else()
file(GLOB AWS_ARCH_SRC
${AWS_ARCH_INTEL_SRC}
${AWS_ARCH_INTRIN_SRC}
)
source_group("Source Files\\intel\\visualc" FILES ${AWS_ARCH_SRC})
endif()
endif()

Expand Down Expand Up @@ -114,6 +145,7 @@ file(GLOB CHECKSUMS_COMBINED_SRC


add_library(${PROJECT_NAME} ${CHECKSUMS_COMBINED_HEADERS} ${CHECKSUMS_COMBINED_SRC})

aws_set_common_properties(${PROJECT_NAME})
aws_prepare_symbol_visibility_args(${PROJECT_NAME} "AWS_CHECKSUMS")
aws_check_headers(${PROJECT_NAME} ${AWS_CHECKSUMS_HEADERS})
Expand All @@ -123,6 +155,10 @@ aws_add_sanitizers(${PROJECT_NAME})
# We are not ABI stable yet
set_target_properties(${PROJECT_NAME} PROPERTIES VERSION 1.0.0)

# source/intel/crc_hw.c uses the SSE4.2 CRC32 intrinsics, so GCC-compatible
# compilers need -msse4.2 for that one translation unit. The flag is kept
# per-file so the rest of the library is still built for the baseline ISA.
# The MSVC compiler proper (cl.exe) does not take GCC-style -m flags, so the
# flag is skipped there; clang-cl reports a Clang compiler id and still gets it.
if (USE_CPU_EXTENSIONS AND AWS_ARCH_INTEL AND NOT CMAKE_C_COMPILER_ID STREQUAL "MSVC")
    set_source_files_properties(source/intel/crc_hw.c PROPERTIES COMPILE_FLAGS -msse4.2)
endif()

target_include_directories(${PROJECT_NAME} PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
$<INSTALL_INTERFACE:include>)
Expand Down
13 changes: 11 additions & 2 deletions include/aws/checksums/private/crc_priv.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,20 @@ AWS_CHECKSUMS_API uint32_t aws_checksums_crc32_sw(const uint8_t *input, int leng
/* Computes the Castagnoli CRC32c (iSCSI) using a (slow) reference implementation. */
AWS_CHECKSUMS_API uint32_t aws_checksums_crc32c_sw(const uint8_t *input, int length, uint32_t previousCrc32c);

/* Computes CRC32 (Ethernet, gzip, et. al.) using crc instructions. */
AWS_CHECKSUMS_API uint32_t aws_checksums_crc32_hw(const uint8_t *data, int length, uint32_t previousCrc32);

/* Computes CRC32 (Ethernet, gzip, et. al.) using AVX512 and VPCLMULQDQ. */
AWS_CHECKSUMS_API uint32_t aws_checksums_crc32_avx512(const uint8_t *data, int length, uint32_t previousCrc32);

/* Computes the Castagnoli CRC32c (iSCSI). */
AWS_CHECKSUMS_API uint32_t aws_checksums_crc32c_hw(const uint8_t *data, int length, uint32_t previousCrc32);

/* Computes CRC32 (Ethernet, gzip, et. al.) using crc instructions. */
AWS_CHECKSUMS_API uint32_t aws_checksums_crc32_hw(const uint8_t *data, int length, uint32_t previousCrc32);
/* Computes the Castagnoli CRC32c (iSCSI) using 128-bit PCLMULQDQ. */
AWS_CHECKSUMS_API uint32_t aws_checksums_crc32c_clmul(const uint8_t *data, int length, uint32_t previousCrc32);

/* Computes the Castagnoli CRC32c (iSCSI) using AVX512 and VPCLMULQDQ. */
AWS_CHECKSUMS_API uint32_t aws_checksums_crc32c_avx512(const uint8_t *data, int length, uint32_t previousCrc32);

#ifdef __cplusplus
}
Expand Down
26 changes: 26 additions & 0 deletions include/aws/checksums/private/intel/crc32c_compiler_shims.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/**
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
* SPDX-License-Identifier: Apache-2.0.
*/

#ifndef AWS_CHECKSUMS_PRIVATE_INTEL_CRC32C_COMPILER_SHIMS_H
#define AWS_CHECKSUMS_PRIVATE_INTEL_CRC32C_COMPILER_SHIMS_H

/*
 * Shims shared by the Intel CRC32c implementations: selects the widest CRC32
 * intrinsic for the target word size and declares the implementation entry
 * points that live in separate translation units.
 *
 * The include guard makes repeated inclusion safe (the typedefs below would
 * otherwise be re-declared).
 */

#include <aws/checksums/private/crc_priv.h>

#include <aws/common/config.h>
#include <nmmintrin.h>
#include <stdint.h>

/*
 * slice_ptr_int_type is the integer word fed to crc_intrin_fn in one step,
 * slice_ptr_type is a pointer to such a word:
 *   - 64-bit targets (_WIN64 / __x86_64__): 8-byte words via _mm_crc32_u64
 *   - everything else:                      4-byte words via _mm_crc32_u32
 */
#if defined _WIN64 || defined __x86_64__
typedef uint64_t *slice_ptr_type;
typedef uint64_t slice_ptr_int_type;
# define crc_intrin_fn _mm_crc32_u64
#else
typedef uint32_t *slice_ptr_type;
typedef uint32_t slice_ptr_int_type;
# define crc_intrin_fn _mm_crc32_u32
#endif

/* Only declared when the build detected AVX512 intrinsic support. */
#ifdef AWS_HAVE_AVX512_INTRINSICS
uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t crc);
uint32_t aws_checksums_crc32_avx512(const uint8_t *input, int length, uint32_t crc);
#endif

uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t crc);

#endif /* AWS_CHECKSUMS_PRIVATE_INTEL_CRC32C_COMPILER_SHIMS_H */
35 changes: 29 additions & 6 deletions source/crc.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,45 @@ static uint32_t (*s_crc32_fn_ptr)(const uint8_t *input, int length, uint32_t pre

uint32_t aws_checksums_crc32(const uint8_t *input, int length, uint32_t previousCrc32) {
if (AWS_UNLIKELY(!s_crc32_fn_ptr)) {
if (aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC)) {
#ifdef AWS_HAVE_ARM32_CRC
if (aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC))
s_crc32_fn_ptr = aws_checksums_crc32_hw;
} else {
#elif defined AWS_HAVE_AVX512_INTRINSICS
if (aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512) &&
aws_cpu_has_feature(AWS_CPU_FEATURE_VPCLMULQDQ))
s_crc32_fn_ptr = aws_checksums_crc32_avx512;
#else
if (0) {}
#endif
else
s_crc32_fn_ptr = aws_checksums_crc32_sw;
}
}
return s_crc32_fn_ptr(input, length, previousCrc32);
}

uint32_t aws_checksums_crc32c(const uint8_t *input, int length, uint32_t previousCrc32) {
if (AWS_UNLIKELY(!s_crc32c_fn_ptr)) {
if (aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2) || aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC)) {
#ifdef AWS_HAVE_ARM32_CRC
if (aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC))
s_crc32c_fn_ptr = aws_checksums_crc32c_hw;
} else {
s_crc32c_fn_ptr = aws_checksums_crc32c_sw;
#else
# ifdef AWS_HAVE_AVX512_INTRINSICS
if (aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512) &&
aws_cpu_has_feature(AWS_CPU_FEATURE_VPCLMULQDQ))
s_crc32c_fn_ptr = aws_checksums_crc32c_avx512;
else
# endif
if (aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2)) {
# ifdef AWS_HAVE_CLMUL
if (aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL))
s_crc32c_fn_ptr = aws_checksums_crc32c_clmul;
else
# endif
s_crc32c_fn_ptr = aws_checksums_crc32c_hw;
}
#endif
else
s_crc32c_fn_ptr = aws_checksums_crc32c_sw;
}
return s_crc32c_fn_ptr(input, length, previousCrc32);
}
22 changes: 9 additions & 13 deletions source/intel/asm/crc32c_sse42_asm.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
* SPDX-License-Identifier: Apache-2.0.
*/

#include <aws/checksums/private/crc_priv.h>
#include <aws/checksums/private/intel/crc32c_compiler_shims.h>

#include <aws/common/cpuid.h>

Expand Down Expand Up @@ -283,7 +283,7 @@ static bool detected_clmul = false;
* Pass 0 in the previousCrc32 parameter as an initial value unless continuing to update a running CRC in a subsequent
* call.
*/
uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t previousCrc32) {

if (AWS_UNLIKELY(!detection_performed)) {
detected_clmul = aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL);
Expand All @@ -293,7 +293,8 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev
detection_performed = true;
}

uint32_t crc = ~previousCrc32;
/* this is called by a higher-level shim and previousCRC32 is already ~ */
uint32_t crc = previousCrc32;

/* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */
if (AWS_UNLIKELY(length < 8)) {
Expand Down Expand Up @@ -358,22 +359,17 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev

return ~crc;
}
uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
return aws_checksums_crc32_sw(input, length, previousCrc32);
}

# if defined(__clang__)
# pragma clang diagnostic pop
# endif

#else
uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
return aws_checksums_crc32_sw(input, length, previousCrc32);
}

uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
return aws_checksums_crc32c_sw(input, length, previousCrc32);
uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t previousCrc32) {
/* these are nested in a larger computation. As a result the crc doesn't need to be bit flipped.
However, the sw function is also used as a standalone implementation that does need to do the
bit flip. So go ahead and flip it here, so the sw implementation flips it back. */
return aws_checksums_crc32c_sw(input, length, ~previousCrc32);
}

#endif
/* clang-format on */
92 changes: 92 additions & 0 deletions source/intel/crc_hw.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
/**
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
* SPDX-License-Identifier: Apache-2.0.
*/
#include <aws/checksums/private/intel/crc32c_compiler_shims.h>
#include <aws/common/macros.h>

static uint32_t aws_checksums_crc32c_hw_small(const uint8_t *input, int length, uint32_t crc) {
while (length-- > 0) {
crc = (uint32_t)_mm_crc32_u8(crc, *input++);
}
return ~crc;
}

static uint32_t aws_checksums_crc32c_hw_unaligned(const uint8_t **input, int *length, uint32_t crc) {
/* Get the 8-byte memory alignment of our input buffer by looking at the least significant 3 bits */
int input_alignment = (uintptr_t)(*input)&0x7;

/* Compute the number of unaligned bytes before the first aligned 8-byte chunk (will be in the range 0-7) */
int leading = (8 - input_alignment) & 0x7;

/* reduce the length by the leading unaligned bytes we are about to process */
*length -= leading;

/* spin through the leading unaligned input bytes (if any) one-by-one */
while (leading-- > 0) {
crc = (uint32_t)_mm_crc32_u8(crc, *(*input)++);
}

return crc;
}

/*
* Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (64-bit quad word) instructions.
* Handles data that isn't 8-byte aligned as well as any trailing data with the CRC32B (byte) instruction.
* Pass 0 in the previousCrc32 parameter as an initial value unless continuing to update a running CRC in a subsequent
* call.
*/
uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) {

/* this is the entry point. We should only do the bit flip once. It should not be done for the subfunctions and
* branches.*/
uint32_t crc = ~previousCrc32;

/* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */
if (length < (int)sizeof(slice_ptr_int_type)) {
return aws_checksums_crc32c_hw_small(input, length, crc);
}

crc = aws_checksums_crc32c_hw_unaligned(&input, &length, crc);
/* Spin through remaining (aligned) 8-byte chunks using the CRC32Q quad word instruction */
while (length >= (int)sizeof(slice_ptr_int_type)) {
crc = (uint32_t)crc_intrin_fn(crc, *(const slice_ptr_int_type*) input);
input += sizeof(slice_ptr_int_type);
length -= (int)sizeof(slice_ptr_int_type);
}

/* Finish up with any trailing bytes using the CRC32B single byte instruction one-by-one */
while (length-- > 0) {
crc = (uint32_t)_mm_crc32_u8(crc, *input);
input++;
}

return ~crc;
}

/*
 * Castagnoli CRC32c (iSCSI) entry point for CPUs where the PCLMULQDQ-assisted path is selected.
 *
 * Short inputs are handled byte-by-byte. Otherwise the unaligned prefix is consumed here and the rest of the
 * buffer is handed to aws_checksums_crc32c_sse42, which receives the already-inverted running state.
 *
 * Pass 0 as previousCrc32 to start a new checksum, or the result of an earlier call to continue a running one.
 */
uint32_t aws_checksums_crc32c_clmul(const uint8_t *input, int length, uint32_t previousCrc32) {
    /* Entry point: invert once here, never in the helpers. */
    uint32_t running = ~previousCrc32;

    const int shorter_than_word = length < (int)sizeof(slice_ptr_int_type);
    if (!shorter_than_word) {
        /* Strip the unaligned prefix, then let the SSE4.2 routine finish the job. */
        running = aws_checksums_crc32c_hw_unaligned(&input, &length, running);
        return aws_checksums_crc32c_sse42(input, length, running);
    }

    /* Too short for alignment handling: one byte at a time. */
    return aws_checksums_crc32c_hw_small(input, length, running);
}

/*
 * CRC32 (Ethernet, gzip) "hardware" entry point for Intel builds.
 * This file has no accelerated implementation for this polynomial, so the call is forwarded unchanged to the
 * software implementation.
 */
uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
    const uint32_t checksum = aws_checksums_crc32_sw(input, length, previousCrc32);
    return checksum;
}
Loading

0 comments on commit 8733d2f

Please sign in to comment.