diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4e96dd7..0e09440 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -6,7 +6,7 @@ on: - 'main' env: - BUILDER_VERSION: v0.9.62 + BUILDER_VERSION: v0.9.63 BUILDER_SOURCE: releases BUILDER_HOST: https://d19elf31gohf1l.cloudfront.net PACKAGE_NAME: aws-checksums @@ -146,6 +146,18 @@ jobs: chmod a+x builder ./builder build -p ${{ env.PACKAGE_NAME }} + osx-m1: + runs-on: macos-14-xlarge # latest arm build + strategy: + matrix: + arch: [ macos-armv8 ] + steps: + - name: Build ${{ env.PACKAGE_NAME }} + consumers + run: | + python3 -c "from urllib.request import urlretrieve; urlretrieve('${{ env.BUILDER_HOST }}/${{ env.BUILDER_SOURCE }}/${{ env.BUILDER_VERSION }}/builder.pyz?run=${{ env.RUN }}', 'builder')" + chmod a+x builder + ./builder build -p ${{ env.PACKAGE_NAME }} --target=${{matrix.arch}} + macos-x64: runs-on: macos-14-large # latest steps: diff --git a/CMakeLists.txt b/CMakeLists.txt index a21bc36..c473544 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,6 +28,7 @@ string(REPLACE ";" "${AWS_MODULE_DIR};" AWS_MODULE_PATH "${CMAKE_PREFIX_PATH}${A # Append that generated list to the module search path list(APPEND CMAKE_MODULE_PATH ${AWS_MODULE_PATH}) +include(AwsSIMD) include(AwsCFlags) include(AwsCheckHeaders) include(AwsSharedLibSetup) @@ -53,54 +54,6 @@ if(MSVC) source_group("Source Files" FILES ${AWS_CHECKSUMS_SRC}) endif() -file(GLOB AWS_ARCH_SRC - "source/generic/*.c" - ) - -if (USE_CPU_EXTENSIONS) - if(AWS_ARCH_INTEL) - # First, check if inline assembly is available. Inline assembly can also be supported by MSVC if the compiler in use is Clang. - if(AWS_HAVE_GCC_INLINE_ASM) - file(GLOB AWS_ARCH_SRC - "source/intel/asm/*.c" - ) - elseif (MSVC) - file(GLOB AWS_ARCH_SRC - "source/intel/visualc/*.c" - ) - source_group("Source Files\\intel\\visualc" FILES ${AWS_ARCH_SRC}) - endif() - endif() - - if (MSVC AND AWS_ARCH_ARM64) - file(GLOB AWS_ARCH_SRC - "source/arm/*.c" - ) - source_group("Source Files\\arm" FILES ${AWS_ARCH_SRC}) - - elseif (AWS_ARCH_ARM64) - file(GLOB AWS_ARCH_SRC - "source/arm/*.c" - ) - SET_SOURCE_FILES_PROPERTIES(source/arm/crc32c_arm.c PROPERTIES COMPILE_FLAGS -march=armv8-a+crc ) - elseif ((NOT MSVC) AND AWS_ARCH_ARM32) - set(CMAKE_REQUIRED_FLAGS "-march=armv8-a+crc -Werror") - check_c_source_compiles(" - #include - int main() { - int crc = __crc32d(0, 1); - return 0; - }" AWS_ARM32_CRC) - unset(CMAKE_REQUIRED_FLAGS) - if (AWS_ARM32_CRC) - file(GLOB AWS_ARCH_SRC - "source/arm/*.c" - ) - SET_SOURCE_FILES_PROPERTIES(source/arm/crc32c_arm.c PROPERTIES COMPILE_FLAGS -march=armv8-a+crc ) - endif() - endif() -endif() - file(GLOB CHECKSUMS_COMBINED_HEADERS ${AWS_CHECKSUMS_HEADERS} ${AWS_CHECKSUMS_PRIV_HEADERS} @@ -109,11 +62,11 @@ file(GLOB CHECKSUMS_COMBINED_HEADERS file(GLOB CHECKSUMS_COMBINED_SRC ${AWS_CHECKSUMS_SRC} ${AWS_CHECKSUMS_PLATFORM_SOURCE} - ${AWS_ARCH_SRC} ) add_library(${PROJECT_NAME} ${CHECKSUMS_COMBINED_HEADERS} ${CHECKSUMS_COMBINED_SRC}) + aws_set_common_properties(${PROJECT_NAME}) aws_prepare_symbol_visibility_args(${PROJECT_NAME} "AWS_CHECKSUMS") aws_check_headers(${PROJECT_NAME} ${AWS_CHECKSUMS_HEADERS}) @@ -123,6 +76,63 @@ aws_add_sanitizers(${PROJECT_NAME}) # We are not ABI stable yet set_target_properties(${PROJECT_NAME} PROPERTIES VERSION 1.0.0) +if (USE_CPU_EXTENSIONS) + if (AWS_ARCH_INTEL) + file (GLOB AWS_ARCH_INTEL_SRC + "source/intel/*.c" + ) + + if (MSVC) + file(GLOB AWS_ARCH_INTRIN_SRC + "source/intel/intrin/*.c" + ) + + source_group("Source Files\\intel" FILES ${AWS_ARCH_INTEL_SRC}) + source_group("Source Files\\intel\\intrin" FILES ${AWS_ARCH_INTRIN_SRC}) + else() + if (AWS_HAVE_GCC_INLINE_ASM) + simd_append_source_and_features(${PROJECT_NAME} "source/intel/asm/crc32c_sse42_asm.c" ${AWS_SSE4_2_FLAG}) + endif() + endif() + + + set(UBER_FILE_FLAGS "") + if (AWS_HAVE_AVX512_INTRINSICS) + list(APPEND UBER_FILE_FLAGS ${AWS_AVX512_FLAG}) + list(APPEND UBER_FILE_FLAGS ${AWS_AVX512vL_FLAG}) + list(APPEND UBER_FILE_FLAGS ${AWS_AVX2_FLAG}) + simd_append_source_and_features(${PROJECT_NAME} "source/intel/intrin/crc64nvme_avx512.c" ${AWS_AVX512_FLAG} ${AWS_AVX512vL_FLAG} ${AWS_AVX2_FLAG} ${AWS_CLMUL_FLAG} ${AWS_SSE4_2_FLAG}) + + endif() + + if (AWS_HAVE_CLMUL) + list(APPEND UBER_FILE_FLAGS ${AWS_CLMUL_FLAG}) + endif() + + list(APPEND UBER_FILE_FLAGS "${AWS_SSE4_2_FLAG}") + + # this file routes all of the implementations together based on available cpu features. It gets built regardless + # of which flags exist. The c file sorts it out. + simd_append_source_and_features(${PROJECT_NAME} "source/intel/intrin/crc32c_sse42_avx512.c" ${UBER_FILE_FLAGS}) + + if (AWS_HAVE_CLMUL) + simd_append_source_and_features(${PROJECT_NAME} "source/intel/intrin/crc64nvme_clmul.c" ${AWS_AVX2_FLAG} ${AWS_CLMUL_FLAG} ${AWS_SSE4_2_FLAG}) + endif() + + + elseif(AWS_ARCH_ARM64 OR (AWS_ARCH_ARM32 AND AWS_HAVE_ARM32_CRC)) + simd_append_source_and_features(${PROJECT_NAME} "source/arm/crc32c_arm.c" ${AWS_ARMv8_1_FLAG}) + simd_append_source_and_features(${PROJECT_NAME} "source/arm/crc64_arm.c" ${AWS_ARMv8_1_FLAG}) + + if (MSVC) + file(GLOB AWS_ARCH_SRC + "source/arm/*.c" + ) + source_group("Source Files\\arm" FILES ${AWS_ARCH_SRC}) + endif() + endif() +endif() + target_include_directories(${PROJECT_NAME} PUBLIC $ $) @@ -156,4 +166,5 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config.cmake" include(CTest) if (BUILD_TESTING) add_subdirectory(tests) + add_subdirectory(bin/benchmark) endif () diff --git a/bin/benchmark/CMakeLists.txt b/bin/benchmark/CMakeLists.txt new file mode 100644 index 0000000..9f8f791 --- /dev/null +++ b/bin/benchmark/CMakeLists.txt @@ -0,0 +1,29 @@ +project(checksum-profile C) + +list(APPEND CMAKE_MODULE_PATH "${CMAKE_INSTALL_PREFIX}/lib/cmake") + +file(GLOB PROFILE_SRC + "*.c" + ) + +set(PROFILE_PROJECT_NAME checksum-profile) +add_executable(${PROFILE_PROJECT_NAME} ${PROFILE_SRC}) +aws_set_common_properties(${PROFILE_PROJECT_NAME}) + + +target_include_directories(${PROFILE_PROJECT_NAME} PUBLIC + $ + $) + +target_link_libraries(${PROFILE_PROJECT_NAME} PRIVATE aws-checksums) + +if (BUILD_SHARED_LIBS AND NOT WIN32) + message(INFO " checksum-profile will be built with shared libs, but you may need to set LD_LIBRARY_PATH=${CMAKE_INSTALL_PREFIX}/lib to run the application") +endif() + +install(TARGETS ${PROFILE_PROJECT_NAME} + EXPORT ${PROFILE_PROJECT_NAME}-targets + COMPONENT Runtime + RUNTIME + DESTINATION bin + COMPONENT Runtime) diff --git a/bin/benchmark/main.c b/bin/benchmark/main.c new file mode 100644 index 0000000..4dd0dfb --- /dev/null +++ b/bin/benchmark/main.c @@ -0,0 +1,127 @@ +/** + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +struct aws_allocator_types { + struct aws_allocator *allocator; + const char *name; +}; + +struct checksum_profile_run { + void (*profile_run)(struct aws_byte_cursor checksum_this); + const char *name; +}; + +static void s_runcrc32_sw(struct aws_byte_cursor checksum_this) { + uint32_t crc = aws_checksums_crc32_sw(checksum_this.ptr, (int)checksum_this.len, 0); + (void)crc; +} + +static void s_runcrc32(struct aws_byte_cursor checksum_this) { + uint32_t crc = aws_checksums_crc32(checksum_this.ptr, (int)checksum_this.len, 0); + (void)crc; +} + +static void s_runcrc32c_sw(struct aws_byte_cursor checksum_this) { + uint32_t crc = aws_checksums_crc32c_sw(checksum_this.ptr, (int)checksum_this.len, 0); + (void)crc; +} + +static void s_runcrc32c(struct aws_byte_cursor checksum_this) { + uint32_t crc = aws_checksums_crc32c(checksum_this.ptr, (int)checksum_this.len, 0); + (void)crc; +} + +static void s_runcrc64_sw(struct aws_byte_cursor checksum_this) { + uint64_t crc = aws_checksums_crc64nvme_sw(checksum_this.ptr, (int)checksum_this.len, 0); + (void)crc; +} + +static void s_runcrc64(struct aws_byte_cursor checksum_this) { + uint64_t crc = aws_checksums_crc64nvme(checksum_this.ptr, (int)checksum_this.len, 0); + (void)crc; +} + +int main(void) { + + fprintf(stdout, "hw features for this run:\n"); + fprintf(stdout, "clmul: %s\n", aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL) ? "true" : "false"); + fprintf(stdout, "sse4.1: %s\n", aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_1) ? "true" : "false"); + fprintf(stdout, "sse4.2: %s\n", aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2) ? "true" : "false"); + fprintf(stdout, "avx2: %s\n", aws_cpu_has_feature(AWS_CPU_FEATURE_AVX2) ? "true" : "false"); + fprintf(stdout, "avx512: %s\n", aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512) ? "true" : "false"); + fprintf(stdout, "arm crc: %s\n", aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC) ? "true" : "false"); + fprintf(stdout, "bmi2: %s\n", aws_cpu_has_feature(AWS_CPU_FEATURE_BMI2) ? "true" : "false"); + fprintf(stdout, "vpclmul: %s\n", aws_cpu_has_feature(AWS_CPU_FEATURE_VPCLMULQDQ) ? "true" : "false"); + fprintf(stdout, "arm pmull: %s\n", aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_PMULL) ? "true" : "false"); + fprintf(stdout, "arm crypto: %s\n\n", aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRYPTO) ? "true" : "false"); + + struct aws_allocator_types allocators[2]; + allocators[0].allocator = aws_default_allocator(); + allocators[0].name = "Default runtime allocator"; + allocators[1].allocator = aws_aligned_allocator(); + allocators[1].name = "Aligned allocator"; + + struct checksum_profile_run profile_runs[] = { + {.profile_run = s_runcrc32_sw, .name = "crc32 C only"}, + {.profile_run = s_runcrc32, .name = "crc32 with hw optimizations"}, + {.profile_run = s_runcrc32c_sw, .name = "crc32c C only"}, + {.profile_run = s_runcrc32c, .name = "crc32c with hw optimizations"}, + {.profile_run = s_runcrc64_sw, .name = "crc64nvme C only"}, + {.profile_run = s_runcrc64, .name = "crc64nvme with hw optimizations"}, + }; + + const size_t allocators_array_size = AWS_ARRAY_SIZE(allocators); + const size_t profile_runs_size = AWS_ARRAY_SIZE(profile_runs); + + for (size_t i = 0; i < profile_runs_size; ++i) { + fprintf(stdout, "--------Profile %s---------\n", profile_runs[i].name); + + for (size_t j = 0; j < allocators_array_size; ++j) { + fprintf(stdout, "%s\n\n", allocators[j].name); + + struct aws_allocator *allocator = allocators[j].allocator; + + // get buffer sizes large enough that all the simd code paths get hit hard, but + // also measure the smaller buffer paths since they often can't be optimized as thoroughly. + size_t buffer_sizes[] = {8, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536}; + size_t buffer_sizes_len = AWS_ARRAY_SIZE(buffer_sizes); + + // warm it up to factor out the cpuid checks: + struct aws_byte_cursor warmup_cur = aws_byte_cursor_from_array(buffer_sizes, buffer_sizes_len); + profile_runs[i].profile_run(warmup_cur); + + for (size_t k = 0; k < buffer_sizes_len; ++k) { + struct aws_byte_buf x_bytes; + aws_byte_buf_init(&x_bytes, allocator, buffer_sizes[k]); + aws_device_random_buffer(&x_bytes); + uint64_t start_time = 0; + aws_high_res_clock_get_ticks(&start_time); + profile_runs[i].profile_run(aws_byte_cursor_from_buf(&x_bytes)); + uint64_t end_time = 0; + aws_high_res_clock_get_ticks(&end_time); + fprintf( + stdout, + "buffer size %zu (bytes), latency: %" PRIu64 " ns\n", + buffer_sizes[k], + end_time - start_time); + aws_byte_buf_clean_up(&x_bytes); + } + fprintf(stdout, "\n"); + } + } + return 0; +} diff --git a/builder.json b/builder.json index 31e9973..b4d4180 100644 --- a/builder.json +++ b/builder.json @@ -6,5 +6,9 @@ "downstream": [ { "name": "aws-c-event-stream" }, { "name": "aws-c-s3" } + ], + "test_steps": [ + "test", + "{install_dir}/bin/checksum-profile{exe}" ] } diff --git a/include/aws/checksums/crc.h b/include/aws/checksums/crc.h index e53e7a6..33cd24c 100644 --- a/include/aws/checksums/crc.h +++ b/include/aws/checksums/crc.h @@ -18,7 +18,7 @@ AWS_EXTERN_C_BEGIN * Pass 0 in the previousCrc32 parameter as an initial value unless continuing * to update a running crc in a subsequent call. */ -AWS_CHECKSUMS_API uint32_t aws_checksums_crc32(const uint8_t *input, int length, uint32_t previousCrc32); +AWS_CHECKSUMS_API uint32_t aws_checksums_crc32(const uint8_t *input, int length, uint32_t previous_crc32); /** * The entry point function to perform a Castagnoli CRC32c (iSCSI) computation. @@ -26,7 +26,17 @@ AWS_CHECKSUMS_API uint32_t aws_checksums_crc32(const uint8_t *input, int length, * Pass 0 in the previousCrc32 parameter as an initial value unless continuing * to update a running crc in a subsequent call. */ -AWS_CHECKSUMS_API uint32_t aws_checksums_crc32c(const uint8_t *input, int length, uint32_t previousCrc32); +AWS_CHECKSUMS_API uint32_t aws_checksums_crc32c(const uint8_t *input, int length, uint32_t previous_crc32c); + +/** + * The entry point function to perform a CRC64-NVME (a.k.a. CRC64-Rocksoft) computation. + * Selects a suitable implementation based on hardware capabilities. + * Pass 0 in the previousCrc64 parameter as an initial value unless continuing + * to update a running crc in a subsequent call. + * There are many variants of CRC64 algorithms. This CRC64 variant is bit-reflected (based on + * the non bit-reflected polynomial 0xad93d23594c93659) and inverts the CRC input and output bits. + */ +AWS_CHECKSUMS_API uint64_t aws_checksums_crc64nvme(const uint8_t *input, int length, uint64_t previous_crc64); AWS_EXTERN_C_END AWS_POP_SANE_WARNING_LEVEL diff --git a/include/aws/checksums/private/crc64_priv.h b/include/aws/checksums/private/crc64_priv.h new file mode 100644 index 0000000..f070dd7 --- /dev/null +++ b/include/aws/checksums/private/crc64_priv.h @@ -0,0 +1,51 @@ +#ifndef AWS_CHECKSUMS_PRIVATE_CRC64_PRIV_H +#define AWS_CHECKSUMS_PRIVATE_CRC64_PRIV_H + +/** + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0. + */ + +#include +#include +#include +#include + +AWS_EXTERN_C_BEGIN + +AWS_CHECKSUMS_API uint64_t aws_checksums_crc64nvme_sw(const uint8_t *input, int length, uint64_t prev_crc64); + +#if defined(AWS_USE_CPU_EXTENSIONS) && defined(AWS_ARCH_INTEL_X64) && defined(AWS_HAVE_CLMUL) && \ + !(defined(_MSC_VER) && _MSC_VER < 1920) +uint64_t aws_checksums_crc64nvme_intel_clmul(const uint8_t *input, int length, uint64_t previous_crc_64); +#endif /* defined(AWS_ARCH_INTEL_X64) && defined(AWS_HAVE_CLMUL) && !(defined(_MSC_VER) && _MSC_VER < 1920) */ + +#if defined(AWS_USE_CPU_EXTENSIONS) && defined(AWS_ARCH_INTEL_X64) && defined(AWS_HAVE_AVX2_INTRINSICS) && \ + !(defined(_MSC_VER) && _MSC_VER < 1920) +uint64_t aws_checksums_crc64nvme_intel_avx512(const uint8_t *input, int length, uint64_t previous_crc_64); +#endif /* defined(AWS_ARCH_INTEL_X64) && defined(AWS_HAVE_AVX2_INTRINSICS) && !(defined(_MSC_VER) && _MSC_VER < 1920) \ + */ + +#if defined(AWS_USE_CPU_EXTENSIONS) && defined(AWS_ARCH_ARM64) +uint64_t aws_checksums_crc64nvme_arm_pmull(const uint8_t *input, int length, uint64_t previous_crc_64); +#endif /* INTPTR_MAX == INT64_MAX && defined(AWS_HAVE_ARMv8_1) */ + +/* Pre-computed constants for CRC64 */ +typedef struct { + uint64_t x2048[8]; /* x^2112 mod P(x) / x^2048 mod P(x) */ + uint64_t x1536[8]; /* x^1600 mod P(x) / x^1536 mod P(x) */ + uint64_t x1024[8]; /* x^1088 mod P(x) / x^1024 mod P(x) */ + uint64_t x512[8]; /* x^576 mod P(x) / x^512 mod P(x) */ + uint64_t x384[2]; /* x^448 mod P(x) / x^384 mod P(x) */ + uint64_t x256[2]; /* x^320 mod P(x) / x^256 mod P(x) */ + uint64_t x128[2]; /* x^192 mod P(x) / x^128 mod P(x) */ + uint64_t mu_poly[2]; /* Barrett mu / polynomial P(x) */ + uint64_t trailing[15][2]; /* Folding constants for 15 possible trailing input data lengths */ +} aws_checksums_crc64_constants_t; + +extern uint8_t aws_checksums_masks_shifts[6][16]; +extern aws_checksums_crc64_constants_t aws_checksums_crc64nvme_constants; + +AWS_EXTERN_C_END + +#endif /* AWS_CHECKSUMS_PRIVATE_CRC64_PRIV_H */ diff --git a/include/aws/checksums/private/crc_priv.h b/include/aws/checksums/private/crc_priv.h index 221c86f..678677d 100644 --- a/include/aws/checksums/private/crc_priv.h +++ b/include/aws/checksums/private/crc_priv.h @@ -6,13 +6,13 @@ */ #define AWS_CRC32_SIZE_BYTES 4 - #include +#include + +#include #include -#ifdef __cplusplus -extern "C" { -#endif +AWS_EXTERN_C_BEGIN /* Computes CRC32 (Ethernet, gzip, et. al.) using a (slow) reference implementation. */ AWS_CHECKSUMS_API uint32_t aws_checksums_crc32_sw(const uint8_t *input, int length, uint32_t previousCrc32); @@ -20,14 +20,31 @@ AWS_CHECKSUMS_API uint32_t aws_checksums_crc32_sw(const uint8_t *input, int leng /* Computes the Castagnoli CRC32c (iSCSI) using a (slow) reference implementation. */ AWS_CHECKSUMS_API uint32_t aws_checksums_crc32c_sw(const uint8_t *input, int length, uint32_t previousCrc32c); -/* Computes the Castagnoli CRC32c (iSCSI). */ -AWS_CHECKSUMS_API uint32_t aws_checksums_crc32c_hw(const uint8_t *data, int length, uint32_t previousCrc32); +#if defined(AWS_USE_CPU_EXTENSIONS) && defined(AWS_ARCH_ARM64) +uint32_t aws_checksums_crc32_armv8(const uint8_t *input, int length, uint32_t previous_crc32); +uint32_t aws_checksums_crc32c_armv8(const uint8_t *input, int length, uint32_t previous_crc32c); +#elif defined(AWS_USE_CPU_EXTENSIONS) && defined(AWS_ARCH_INTEL) +# if defined(AWS_ARCH_INTEL_X64) +typedef uint64_t *slice_ptr_type; +typedef uint64_t slice_ptr_int_type; +# define crc_intrin_fn _mm_crc32_u64 + +# if !defined(_MSC_VER) +uint32_t aws_checksums_crc32c_clmul_sse42(const uint8_t *data, int length, uint32_t previous_crc32c); +# endif + +# else +typedef uint32_t *slice_ptr_type; +typedef uint32_t slice_ptr_int_type; +# define crc_intrin_fn _mm_crc32_u32 +# endif +uint32_t aws_checksums_crc32c_intel_avx512_with_sse_fallback( + const uint8_t *input, + int length, + uint32_t previous_crc32c); -/* Computes CRC32 (Ethernet, gzip, et. al.) using crc instructions. */ -AWS_CHECKSUMS_API uint32_t aws_checksums_crc32_hw(const uint8_t *data, int length, uint32_t previousCrc32); - -#ifdef __cplusplus -} #endif +AWS_EXTERN_C_END + #endif /* AWS_CHECKSUMS_PRIVATE_CRC_PRIV_H */ diff --git a/source/arm/crc32c_arm.c b/source/arm/crc32c_arm.c index 465e672..bff0371 100644 --- a/source/arm/crc32c_arm.c +++ b/source/arm/crc32c_arm.c @@ -14,8 +14,8 @@ # define PREFETCH(p) __builtin_prefetch(p) # endif -uint32_t aws_checksums_crc32c_hw(const uint8_t *data, int length, uint32_t previousCrc32) { - uint32_t crc = ~previousCrc32; +uint32_t aws_checksums_crc32c_armv8(const uint8_t *data, int length, uint32_t previous_crc32c) { + uint32_t crc = ~previous_crc32c; // Align data if it's not aligned while (((uintptr_t)data & 7) && length > 0) { @@ -54,8 +54,8 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *data, int length, uint32_t previ return ~crc; } -uint32_t aws_checksums_crc32_hw(const uint8_t *data, int length, uint32_t previousCrc32) { - uint32_t crc = ~previousCrc32; +uint32_t aws_checksums_crc32_armv8(const uint8_t *data, int length, uint32_t previous_crc32) { + uint32_t crc = ~previous_crc32; // Align data if it's not aligned while (((uintptr_t)data & 7) && length > 0) { diff --git a/source/arm/crc64_arm.c b/source/arm/crc64_arm.c new file mode 100644 index 0000000..a33298e --- /dev/null +++ b/source/arm/crc64_arm.c @@ -0,0 +1,208 @@ +/** + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0. + */ + +#include +#include + +#if INTPTR_MAX == INT64_MAX && defined(AWS_HAVE_ARMv8_1) + +# include + +// Load a uint8x16_t neon register from uint8_t pointer +# define load_u8(uint8_t_ptr) vld1q_u8((uint8_t_ptr)) +// Load a poly64x2_t neon register from a uint8_t pointer +# define load_p64_u8(uint8_t_ptr) vreinterpretq_p64_u8(load_u8(uint8_t_ptr)) +// Load a poly64x2_t neon register from a uint64_t pointer +# define load_p64(uint64_t_ptr) vreinterpretq_p64_u64(vld1q_u64((uint64_t_ptr))) +// Mask the bytes in a neon uint8x16_t register and preserve 0 to 15 least significant bytes. +# define mask_low_u8(u8, count) vandq_u8(u8, load_u8(aws_checksums_masks_shifts[5] - (intptr_t)(count))) +// Mask the bytes in a neon uint8x16_t register and preserve 0 to 15 most significant bytes. +# define mask_high_u8(u8, count) vandq_u8(u8, load_u8(aws_checksums_masks_shifts[3] + (intptr_t)(count))) +// Mask the bytes in a neon poly64x2_t register and preserve 0 to 15 most significant bytes. +# define mask_high_p64(poly, count) vreinterpretq_p64_u8(mask_high_u8(vreinterpretq_u8_p64(poly), count)) +// Left shift bytes in a neon uint8x16_t register - shift count from 0 to 15. +# define left_shift_u8(u8, count) vqtbl1q_u8(u8, load_u8(aws_checksums_masks_shifts[1] - (intptr_t)(count))) +// Right shift bytes in a neon uint8x16_t register - shift count from 0 to 15. +# define right_shift_u8(u8, count) vqtbl1q_u8(u8, load_u8(aws_checksums_masks_shifts[1] + (intptr_t)(count))) +// Left shift bytes in a neon poly64x2_t register - shift count from 0 to 15. +# define left_shift_p64(poly, count) vreinterpretq_p64_u8(left_shift_u8(vreinterpretq_u8_p64(poly), count)) +// Right shift a neon poly64x2_t register 0 to 15 bytes - imm must be an immediate constant +# define right_shift_imm_p64(poly, imm) \ + vreinterpretq_p64_u8(vextq_u8(vreinterpretq_u8_p64(poly), vdupq_n_u8(0), imm)) +// Carryless multiply the lower 64-bit halves of two poly64x2_t neon registers +# define pmull_lo(a, b) \ + (vreinterpretq_p64_p128(vmull_p64((poly64_t)vreinterpretq_p128_p64(a), (poly64_t)vreinterpretq_p128_p64(b)))) +// Carryless multiply the upper 64-bit halves of two poly64x2_t neon registers +# define pmull_hi(a, b) (vreinterpretq_p64_p128(vmull_high_p64((a), (b)))) +// XOR two neon poly64x2_t registers +# define xor_p64(a, b) vreinterpretq_p64_u8(veorq_u8(vreinterpretq_u8_p64(a), vreinterpretq_u8_p64(b))) +# if defined(__ARM_FEATURE_SHA3) +// The presence of the ARM SHA3 feature also implies the three-way xor instruction +# define xor3_p64(a, b, c) \ + vreinterpretq_p64_u64( \ + veor3q_u64(vreinterpretq_u64_p64(a), vreinterpretq_u64_p64(b), vreinterpretq_u64_p64(c))) +# else +// Without SHA3, implement three-way xor with two normal xors +# define xor3_p64(a, b, c) xor_p64(xor_p64(a, b), c) +# endif // defined(__ARM_FEATURE_SHA3) + +/** Compute CRC64NVME using ARMv8 NEON +crypto/pmull64 instructions. */ +uint64_t aws_checksums_crc64nvme_arm_pmull(const uint8_t *input, int length, const uint64_t previous_crc64) { + if (!input || length <= 0) { + return previous_crc64; + } + + // the amount of complexity required to handle vector instructions on + // memory regions smaller than an xmm register does not justify the very negligible performance gains + // we would get for using it on an input this small. + if (length < 16) { + return aws_checksums_crc64nvme_sw(input, length, previous_crc64); + } + + // Invert the previous crc bits and load into the lower half of a neon register + poly64x2_t a1 = vreinterpretq_p64_u64(vcombine_u64(vcreate_u64(~previous_crc64), vcreate_u64(0))); + + // Load the x^128 and x^192 constants - they'll (very likely) be needed + const poly64x2_t x128 = load_p64(aws_checksums_crc64nvme_constants.x128); + + // Load the next 16 bytes of input and XOR with the previous crc + a1 = xor_p64(a1, load_p64_u8(input)); + input += 16; + length -= 16; + + if (length < 112) { + + const poly64x2_t x256 = load_p64(aws_checksums_crc64nvme_constants.x256); + + if (length & 64) { + // Fold the current crc register with 64 bytes of input by multiplying 64-bit chunks by x^576 through + // x^128 + const poly64x2_t x512 = load_p64(aws_checksums_crc64nvme_constants.x512); + const poly64x2_t x384 = load_p64(aws_checksums_crc64nvme_constants.x384); + poly64x2_t b1 = load_p64_u8(input + 0); + poly64x2_t c1 = load_p64_u8(input + 16); + poly64x2_t d1 = load_p64_u8(input + 32); + poly64x2_t e1 = load_p64_u8(input + 48); + a1 = xor3_p64(pmull_lo(x512, a1), pmull_hi(x512, a1), pmull_lo(x384, b1)); + b1 = xor3_p64(pmull_hi(x384, b1), pmull_lo(x256, c1), pmull_hi(x256, c1)); + c1 = xor3_p64(pmull_lo(x128, d1), pmull_hi(x128, d1), e1); + a1 = xor3_p64(a1, b1, c1); + input += 64; + } + + if (length & 32) { + // Fold the current running value with 32 bytes of input by multiplying 64-bit chunks by x^320 through + // x^128 + poly64x2_t b1 = load_p64_u8(input + 0); + poly64x2_t c1 = load_p64_u8(input + 16); + a1 = xor3_p64(c1, pmull_lo(x256, a1), pmull_hi(x256, a1)); + a1 = xor3_p64(a1, pmull_lo(x128, b1), pmull_hi(x128, b1)); + input += 32; + } + } else { // There are 112 or more bytes of input + + const poly64x2_t x1024 = load_p64(aws_checksums_crc64nvme_constants.x1024); + + // Load another 112 bytes of input + poly64x2_t b1 = load_p64_u8(input + 0); + poly64x2_t c1 = load_p64_u8(input + 16); + poly64x2_t d1 = load_p64_u8(input + 32); + poly64x2_t e1 = load_p64_u8(input + 48); + poly64x2_t f1 = load_p64_u8(input + 64); + poly64x2_t g1 = load_p64_u8(input + 80); + poly64x2_t h1 = load_p64_u8(input + 96); + input += 112; + length -= 112; + + // Spin through additional chunks of 128 bytes, if any + int loops = length / 128; + while (loops--) { + // Fold input values in parallel by multiplying by x^1088 and x^1024 constants + a1 = xor3_p64(pmull_lo(x1024, a1), pmull_hi(x1024, a1), load_p64_u8(input + 0)); + b1 = xor3_p64(pmull_lo(x1024, b1), pmull_hi(x1024, b1), load_p64_u8(input + 16)); + c1 = xor3_p64(pmull_lo(x1024, c1), pmull_hi(x1024, c1), load_p64_u8(input + 32)); + d1 = xor3_p64(pmull_lo(x1024, d1), pmull_hi(x1024, d1), load_p64_u8(input + 48)); + e1 = xor3_p64(pmull_lo(x1024, e1), pmull_hi(x1024, e1), load_p64_u8(input + 64)); + f1 = xor3_p64(pmull_lo(x1024, f1), pmull_hi(x1024, f1), load_p64_u8(input + 80)); + g1 = xor3_p64(pmull_lo(x1024, g1), pmull_hi(x1024, g1), load_p64_u8(input + 96)); + h1 = xor3_p64(pmull_lo(x1024, h1), pmull_hi(x1024, h1), load_p64_u8(input + 112)); + input += 128; + } + + // Fold 128 bytes down to 64 bytes by multiplying by the x^576 and x^512 constants + const poly64x2_t x512 = load_p64(aws_checksums_crc64nvme_constants.x512); + a1 = xor3_p64(e1, pmull_lo(x512, a1), pmull_hi(x512, a1)); + b1 = xor3_p64(f1, pmull_lo(x512, b1), pmull_hi(x512, b1)); + c1 = xor3_p64(g1, pmull_lo(x512, c1), pmull_hi(x512, c1)); + d1 = xor3_p64(h1, pmull_lo(x512, d1), pmull_hi(x512, d1)); + + if (length & 64) { + // Fold the current 64 bytes with 64 bytes of input by multiplying by x^576 and x^512 constants + a1 = xor3_p64(pmull_lo(x512, a1), pmull_hi(x512, a1), load_p64_u8(input + 0)); + b1 = xor3_p64(pmull_lo(x512, b1), pmull_hi(x512, b1), load_p64_u8(input + 16)); + c1 = xor3_p64(pmull_lo(x512, c1), pmull_hi(x512, c1), load_p64_u8(input + 32)); + d1 = xor3_p64(pmull_lo(x512, d1), pmull_hi(x512, d1), load_p64_u8(input + 48)); + input += 64; + } + + // Fold 64 bytes down to 32 bytes by multiplying by the x^320 and x^256 constants + const poly64x2_t x256 = load_p64(aws_checksums_crc64nvme_constants.x256); + a1 = xor3_p64(c1, pmull_lo(x256, a1), pmull_hi(x256, a1)); + b1 = xor3_p64(d1, pmull_lo(x256, b1), pmull_hi(x256, b1)); + + if (length & 32) { + // Fold the current running value with 32 bytes of input by multiplying by x^320 and x^256 constants + a1 = xor3_p64(pmull_lo(x256, a1), pmull_hi(x256, a1), load_p64_u8(input + 0)); + b1 = xor3_p64(pmull_lo(x256, b1), pmull_hi(x256, b1), load_p64_u8(input + 16)); + input += 32; + } + + // Fold 32 bytes down to 16 bytes by multiplying by x^192 and x^128 constants + a1 = xor3_p64(b1, pmull_lo(x128, a1), pmull_hi(x128, a1)); + } + + if (length & 16) { + // Fold the current 16 bytes with 16 bytes of input by multiplying by x^192 and x^128 constants + a1 = xor3_p64(pmull_lo(x128, a1), pmull_hi(x128, a1), load_p64_u8(input + 0)); + input += 16; + } + + // There must only be 0-15 bytes of input left + length &= 15; + + if (length == 0) { + // Multiply the lower half of the crc register by x^128 (swapping upper and lower halves) + poly64x2_t mul_by_x128 = pmull_lo(a1, vextq_p64(x128, x128, 1)); + // XOR the result with the right shifted upper half of the crc + a1 = xor_p64(right_shift_imm_p64(a1, 8), mul_by_x128); + } else { + // Handle any trailing input from 1-15 bytes + const poly64x2_t trailing_constants = load_p64(aws_checksums_crc64nvme_constants.trailing[length - 1]); + // Multiply the crc by a pair of trailing length constants in order to fold it into the trailing input + a1 = xor_p64(pmull_lo(a1, trailing_constants), pmull_hi(a1, trailing_constants)); + // Safely load ending at the last byte of trailing input and mask out any leading garbage + poly64x2_t trailing_input = mask_high_p64(load_p64_u8(input + length - 16), length); + // Multiply the lower half of the trailing input register by x^128 (swapping x^192 and x^128 halves) + poly64x2_t mul_by_x128 = pmull_lo(trailing_input, vextq_p64(x128, x128, 1)); + // XOR the results with the right shifted upper half of the trailing input + a1 = xor3_p64(a1, right_shift_imm_p64(trailing_input, 8), mul_by_x128); + } + + // Barrett modular reduction + + // Load the Barrett mu and (bit-reflected) polynomial + const poly64x2_t mu_poly = load_p64(aws_checksums_crc64nvme_constants.mu_poly); + // Multiply the lower half of the crc register by mu (mu is in the lower half of mu_poly) + poly64x2_t mul_by_mu = pmull_lo(a1, mu_poly); + // Multiply lower half of mul_by_mu result by poly (which is swapped into the lower half) + poly64x2_t mul_by_poly = pmull_lo(mul_by_mu, vextq_p64(mu_poly, mu_poly, 1)); + // Swap halves of mul_by_mu and add the upper halves of everything + poly64x2_t result = xor3_p64(a1, vextq_p64(mul_by_mu, mul_by_mu, 1), mul_by_poly); + + // Reduction result is the upper half - invert the bits before returning the crc + return ~vgetq_lane_u64(vreinterpretq_u64_p64(result), 1); +} + +#endif // INTPTR_MAX == INT64_MAX && defined(AWS_HAVE_ARMv8_1) diff --git a/source/crc.c b/source/crc.c index f5d3e80..f5b6a7d 100644 --- a/source/crc.c +++ b/source/crc.c @@ -7,27 +7,42 @@ #include -static uint32_t (*s_crc32c_fn_ptr)(const uint8_t *input, int length, uint32_t previousCrc32) = 0; -static uint32_t (*s_crc32_fn_ptr)(const uint8_t *input, int length, uint32_t previousCrc32) = 0; +static uint32_t (*s_crc32c_fn_ptr)(const uint8_t *input, int length, uint32_t previous_crc32c) = 0; +static uint32_t (*s_crc32_fn_ptr)(const uint8_t *input, int length, uint32_t previous_crc32) = 0; -uint32_t aws_checksums_crc32(const uint8_t *input, int length, uint32_t previousCrc32) { +uint32_t aws_checksums_crc32(const uint8_t *input, int length, uint32_t previous_crc32) { if (AWS_UNLIKELY(!s_crc32_fn_ptr)) { +#if defined(AWS_USE_CPU_EXTENSIONS) && defined(AWS_ARCH_ARM64) if (aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC)) { - s_crc32_fn_ptr = aws_checksums_crc32_hw; + s_crc32_fn_ptr = aws_checksums_crc32_armv8; } else { s_crc32_fn_ptr = aws_checksums_crc32_sw; } +#else + s_crc32_fn_ptr = aws_checksums_crc32_sw; +#endif } - return s_crc32_fn_ptr(input, length, previousCrc32); + return s_crc32_fn_ptr(input, length, previous_crc32); } -uint32_t aws_checksums_crc32c(const uint8_t *input, int length, uint32_t previousCrc32) { +uint32_t aws_checksums_crc32c(const uint8_t *input, int length, uint32_t previous_crc32c) { if (AWS_UNLIKELY(!s_crc32c_fn_ptr)) { - if (aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2) || aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC)) { - s_crc32c_fn_ptr = aws_checksums_crc32c_hw; +#if defined(AWS_USE_CPU_EXTENSIONS) && defined(AWS_ARCH_INTEL_X64) + if (aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2)) { + s_crc32c_fn_ptr = aws_checksums_crc32c_intel_avx512_with_sse_fallback; } else { s_crc32c_fn_ptr = aws_checksums_crc32c_sw; } +#elif defined(AWS_USE_CPU_EXTENSIONS) && defined(AWS_ARCH_ARM64) + if (aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC)) { + s_crc32c_fn_ptr = aws_checksums_crc32c_armv8; + } else { + s_crc32c_fn_ptr = aws_checksums_crc32c_sw; + } +#else + s_crc32c_fn_ptr = aws_checksums_crc32c_sw; +#endif } - return s_crc32c_fn_ptr(input, length, previousCrc32); + + return s_crc32c_fn_ptr(input, length, previous_crc32c); } diff --git a/source/crc64.c b/source/crc64.c new file mode 100644 index 0000000..e533d1e --- /dev/null +++ b/source/crc64.c @@ -0,0 +1,124 @@ +/** + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0. + */ + +#include +#include +#include + +AWS_ALIGNED_TYPEDEF(uint8_t, checksums_maxks_shifts_type[6][16], 16); +// Intel PSHUFB / ARM VTBL patterns for left/right shifts and masks +checksums_maxks_shifts_type aws_checksums_masks_shifts = { + {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, // + {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f}, // left/right + // shifts + {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, // + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, // + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, // byte masks + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, // +}; + +AWS_ALIGNED_TYPEDEF(aws_checksums_crc64_constants_t, checksums_constants, 16); + +/* clang-format off */ + +// Pre-computed bit-reflected constants for CRC64NVME +// The actual exponents are reduced by 1 to compensate for bit-reflection (e.g. x^1024 is actually x^1023) +checksums_constants aws_checksums_crc64nvme_constants = { + .x2048 = + {0x37ccd3e14069cabc, + 0xa043808c0f782663, // x^2112 mod P(x) / x^2048 mod P(x) + 0x37ccd3e14069cabc, + 0xa043808c0f782663, // duplicated 3 times to support 64 byte avx512 loads + 0x37ccd3e14069cabc, + 0xa043808c0f782663, + 0x37ccd3e14069cabc, + 0xa043808c0f782663}, + .x1536 = + {0x758ee09da263e275, + 0x6d2d13de8038b4ca, // x^1600 mod P(x) / x^1536 mod P(x) + 0x758ee09da263e275, + 0x6d2d13de8038b4ca, // duplicated 3 times to support 64 byte avx512 loads + 0x758ee09da263e275, + 0x6d2d13de8038b4ca, + 0x758ee09da263e275, + 0x6d2d13de8038b4ca}, + .x1024 = + {0xa1ca681e733f9c40, + 0x5f852fb61e8d92dc, // x^1088 mod P(x) / x^1024 mod P(x) + 0xa1ca681e733f9c40, + 0x5f852fb61e8d92dc, // duplicated 3 times to support 64 byte avx512 loads + 0xa1ca681e733f9c40, + 0x5f852fb61e8d92dc, + 0xa1ca681e733f9c40, + 0x5f852fb61e8d92dc}, + .x512 = + {0x0c32cdb31e18a84a, + 0x62242240ace5045a, // x^576 mod P(x) / x^512 mod P(x) + 0x0c32cdb31e18a84a, + 0x62242240ace5045a, // duplicated 3 times to support 64 byte avx512 loads + 0x0c32cdb31e18a84a, + 0x62242240ace5045a, + 0x0c32cdb31e18a84a, + 0x62242240ace5045a}, + .x384 = {0xbdd7ac0ee1a4a0f0, 0xa3ffdc1fe8e82a8b}, // x^448 mod P(x) / x^384 mod P(x) + .x256 = {0xb0bc2e589204f500, 0xe1e0bb9d45d7a44c}, // x^320 mod P(x) / x^256 mod P(x) + .x128 = {0xeadc41fd2ba3d420, 0x21e9761e252621ac}, // x^192 mod P(x) / x^128 mod P(x) + .mu_poly = {0x27ecfa329aef9f77, 0x34d926535897936b}, // Barrett mu / polynomial P(x) (bit-reflected) + .trailing = + { + // trailing input constants for data lengths of 1-15 bytes + {0x04f28def5347786c, 0x7f6ef0c830358979}, // 1 trailing bytes: x^72 mod P(x) / x^8 mod P(x) + {0x49e1df807414fdef, 0x8776a97d73bddf69}, // 2 trailing bytes: x^80 mod P(x) / x^15 mod P(x) + {0x52734ea3e726fc54, 0xff6e4e1f4e4038be}, // 3 trailing bytes: x^88 mod P(x) / x^24 mod P(x) + {0x668ab3bbc976d29d, 0x8211147cbaf96306}, // 4 trailing bytes: x^96 mod P(x) / x^32 mod P(x) + {0xf2fa1fae5f5c1165, 0x373d15f784905d1e}, // 5 trailing bytes: x^104 mod P(x) / x^40 mod P(x) + {0x9065cb6e6d39918a, 0xe9742a79ef04a5d4}, // 6 trailing bytes: x^110 mod P(x) / x^48 mod P(x) + {0xc23dfbc6ca591ca3, 0xfc5d27f6bf353971}, // 7 trailing bytes: x^110 mod P(x) / x^56 mod P(x) + {0xeadc41fd2ba3d420, 0x21e9761e252621ac}, // 8 trailing bytes: x^120 mod P(x) / x^64 mod P(x) + {0xf12b2236ec577cd6, 0x04f28def5347786c}, // 9 trailing bytes: x^128 mod P(x) / x^72 mod P(x) + {0x0298996e905d785a, 0x49e1df807414fdef}, // 10 trailing bytes: x^144 mod P(x) / x^80 mod P(x) + {0xf779b03b943ff311, 0x52734ea3e726fc54}, // 11 trailing bytes: x^152 mod P(x) / x^88 mod P(x) + {0x07797643831fd90b, 0x668ab3bbc976d29d}, // 12 trailing bytes: x^160 mod P(x) / x^96 mod P(x) + {0x27a8849a7bc97a27, 0xf2fa1fae5f5c1165}, // 13 trailing bytes: x^168 mod P(x) / x^104 mod P(x) + {0xb937a2d843183b7c, 0x9065cb6e6d39918a}, // 14 trailing bytes: x^176 mod P(x) / x^112 mod P(x) + {0x31bce594cbbacd2d, 0xc23dfbc6ca591ca3}, // 15 trailing bytes: x^184 mod P(x) / x^120 mod P(x) + }, +}; +/* clang-format on */ + +static uint64_t (*s_crc64nvme_fn_ptr)(const uint8_t *input, int length, uint64_t prev_crc64) = 0; + +uint64_t aws_checksums_crc64nvme(const uint8_t *input, int length, uint64_t prev_crc64) { + if (AWS_UNLIKELY(!s_crc64nvme_fn_ptr)) { +#if defined(AWS_USE_CPU_EXTENSIONS) && defined(AWS_ARCH_INTEL_X64) && !(defined(_MSC_VER) && _MSC_VER < 1920) +# if defined(AWS_HAVE_AVX512_INTRINSICS) + if (aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512) && aws_cpu_has_feature(AWS_CPU_FEATURE_VPCLMULQDQ)) { + s_crc64nvme_fn_ptr = aws_checksums_crc64nvme_intel_avx512; + } else +# endif +# if defined(AWS_HAVE_CLMUL) && defined(AWS_HAVE_AVX2_INTRINSICS) + if (aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL) && aws_cpu_has_feature(AWS_CPU_FEATURE_AVX2)) { + s_crc64nvme_fn_ptr = aws_checksums_crc64nvme_intel_clmul; + } else { + s_crc64nvme_fn_ptr = aws_checksums_crc64nvme_sw; + } +# endif +# if !(defined(AWS_HAVE_AVX512_INTRINSICS) || (defined(AWS_HAVE_CLMUL) && defined(AWS_HAVE_AVX2_INTRINSICS))) + s_crc64nvme_fn_ptr = aws_checksums_crc64nvme_sw; +# endif + +#elif defined(AWS_USE_CPU_EXTENSIONS) && defined(AWS_ARCH_ARM64) && defined(AWS_HAVE_ARMv8_1) + if (aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRYPTO) && aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_PMULL)) { + s_crc64nvme_fn_ptr = aws_checksums_crc64nvme_arm_pmull; + } else { + s_crc64nvme_fn_ptr = aws_checksums_crc64nvme_sw; + } +#else // this branch being taken means it's not arm64 and not intel with avx extensions + s_crc64nvme_fn_ptr = aws_checksums_crc64nvme_sw; +#endif + } + + return s_crc64nvme_fn_ptr(input, length, prev_crc64); +} diff --git a/source/crc64_sw.c b/source/crc64_sw.c new file mode 100644 index 0000000..59123b3 --- /dev/null +++ b/source/crc64_sw.c @@ -0,0 +1,579 @@ +/** + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0. + */ + +#include +#include + +// CRC64NVME slice-by-8 lookup table (bit-reflected poly 0x9a6c9329ac4bc9b5) +static uint64_t crc64nvme_table[8][256] = { + // + { + 0x0000000000000000, 0x7f6ef0c830358979, 0xfedde190606b12f2, 0x81b31158505e9b8b, // [0][0x00] + 0xc962e5739841b68f, 0xb60c15bba8743ff6, 0x37bf04e3f82aa47d, 0x48d1f42bc81f2d04, // [0][0x04] + 0xa61cecb46814fe75, 0xd9721c7c5821770c, 0x58c10d24087fec87, 0x27affdec384a65fe, // [0][0x08] + 0x6f7e09c7f05548fa, 0x1010f90fc060c183, 0x91a3e857903e5a08, 0xeecd189fa00bd371, // [0][0x0c] + 0x78e0ff3b88be6f81, 0x078e0ff3b88be6f8, 0x863d1eabe8d57d73, 0xf953ee63d8e0f40a, // [0][0x10] + 0xb1821a4810ffd90e, 0xceecea8020ca5077, 0x4f5ffbd87094cbfc, 0x30310b1040a14285, // [0][0x14] + 0xdefc138fe0aa91f4, 0xa192e347d09f188d, 0x2021f21f80c18306, 0x5f4f02d7b0f40a7f, // [0][0x18] + 0x179ef6fc78eb277b, 0x68f0063448deae02, 0xe943176c18803589, 0x962de7a428b5bcf0, // [0][0x1c] + 0xf1c1fe77117cdf02, 0x8eaf0ebf2149567b, 0x0f1c1fe77117cdf0, 0x7072ef2f41224489, // [0][0x20] + 0x38a31b04893d698d, 0x47cdebccb908e0f4, 0xc67efa94e9567b7f, 0xb9100a5cd963f206, // [0][0x24] + 0x57dd12c379682177, 0x28b3e20b495da80e, 0xa900f35319033385, 0xd66e039b2936bafc, // [0][0x28] + 0x9ebff7b0e12997f8, 0xe1d10778d11c1e81, 0x606216208142850a, 0x1f0ce6e8b1770c73, // [0][0x2c] + 0x8921014c99c2b083, 0xf64ff184a9f739fa, 0x77fce0dcf9a9a271, 0x08921014c99c2b08, // [0][0x30] + 0x4043e43f0183060c, 0x3f2d14f731b68f75, 0xbe9e05af61e814fe, 0xc1f0f56751dd9d87, // [0][0x34] + 0x2f3dedf8f1d64ef6, 0x50531d30c1e3c78f, 0xd1e00c6891bd5c04, 0xae8efca0a188d57d, // [0][0x38] + 0xe65f088b6997f879, 0x9931f84359a27100, 0x1882e91b09fcea8b, 0x67ec19d339c963f2, // [0][0x3c] + 0xd75adabd7a6e2d6f, 0xa8342a754a5ba416, 0x29873b2d1a053f9d, 0x56e9cbe52a30b6e4, // [0][0x40] + 0x1e383fcee22f9be0, 0x6156cf06d21a1299, 0xe0e5de5e82448912, 0x9f8b2e96b271006b, // [0][0x44] + 0x71463609127ad31a, 0x0e28c6c1224f5a63, 0x8f9bd7997211c1e8, 0xf0f5275142244891, // [0][0x48] + 0xb824d37a8a3b6595, 0xc74a23b2ba0eecec, 0x46f932eaea507767, 0x3997c222da65fe1e, // [0][0x4c] + 0xafba2586f2d042ee, 0xd0d4d54ec2e5cb97, 0x5167c41692bb501c, 0x2e0934dea28ed965, // [0][0x50] + 0x66d8c0f56a91f461, 0x19b6303d5aa47d18, 0x980521650afae693, 0xe76bd1ad3acf6fea, // [0][0x54] + 0x09a6c9329ac4bc9b, 0x76c839faaaf135e2, 0xf77b28a2faafae69, 0x8815d86aca9a2710, // [0][0x58] + 0xc0c42c4102850a14, 0xbfaadc8932b0836d, 0x3e19cdd162ee18e6, 0x41773d1952db919f, // [0][0x5c] + 0x269b24ca6b12f26d, 0x59f5d4025b277b14, 0xd846c55a0b79e09f, 0xa72835923b4c69e6, // [0][0x60] + 0xeff9c1b9f35344e2, 0x90973171c366cd9b, 0x1124202993385610, 0x6e4ad0e1a30ddf69, // [0][0x64] + 0x8087c87e03060c18, 0xffe938b633338561, 0x7e5a29ee636d1eea, 0x0134d92653589793, // [0][0x68] + 0x49e52d0d9b47ba97, 0x368bddc5ab7233ee, 0xb738cc9dfb2ca865, 0xc8563c55cb19211c, // [0][0x6c] + 0x5e7bdbf1e3ac9dec, 0x21152b39d3991495, 0xa0a63a6183c78f1e, 0xdfc8caa9b3f20667, // [0][0x70] + 0x97193e827bed2b63, 0xe877ce4a4bd8a21a, 0x69c4df121b863991, 0x16aa2fda2bb3b0e8, // [0][0x74] + 0xf86737458bb86399, 0x8709c78dbb8deae0, 0x06bad6d5ebd3716b, 0x79d4261ddbe6f812, // [0][0x78] + 0x3105d23613f9d516, 0x4e6b22fe23cc5c6f, 0xcfd833a67392c7e4, 0xb0b6c36e43a74e9d, // [0][0x7c] + 0x9a6c9329ac4bc9b5, 0xe50263e19c7e40cc, 0x64b172b9cc20db47, 0x1bdf8271fc15523e, // [0][0x80] + 0x530e765a340a7f3a, 0x2c608692043ff643, 0xadd397ca54616dc8, 0xd2bd67026454e4b1, // [0][0x84] + 0x3c707f9dc45f37c0, 0x431e8f55f46abeb9, 0xc2ad9e0da4342532, 0xbdc36ec59401ac4b, // [0][0x88] + 0xf5129aee5c1e814f, 0x8a7c6a266c2b0836, 0x0bcf7b7e3c7593bd, 0x74a18bb60c401ac4, // [0][0x8c] + 0xe28c6c1224f5a634, 0x9de29cda14c02f4d, 0x1c518d82449eb4c6, 0x633f7d4a74ab3dbf, // [0][0x90] + 0x2bee8961bcb410bb, 0x548079a98c8199c2, 0xd53368f1dcdf0249, 0xaa5d9839ecea8b30, // [0][0x94] + 0x449080a64ce15841, 0x3bfe706e7cd4d138, 0xba4d61362c8a4ab3, 0xc52391fe1cbfc3ca, // [0][0x98] + 0x8df265d5d4a0eece, 0xf29c951de49567b7, 0x732f8445b4cbfc3c, 0x0c41748d84fe7545, // [0][0x9c] + 0x6bad6d5ebd3716b7, 0x14c39d968d029fce, 0x95708ccedd5c0445, 0xea1e7c06ed698d3c, // [0][0xa0] + 0xa2cf882d2576a038, 0xdda178e515432941, 0x5c1269bd451db2ca, 0x237c997575283bb3, // [0][0xa4] + 0xcdb181ead523e8c2, 0xb2df7122e51661bb, 0x336c607ab548fa30, 0x4c0290b2857d7349, // [0][0xa8] + 0x04d364994d625e4d, 0x7bbd94517d57d734, 0xfa0e85092d094cbf, 0x856075c11d3cc5c6, // [0][0xac] + 0x134d926535897936, 0x6c2362ad05bcf04f, 0xed9073f555e26bc4, 0x92fe833d65d7e2bd, // [0][0xb0] + 0xda2f7716adc8cfb9, 0xa54187de9dfd46c0, 0x24f29686cda3dd4b, 0x5b9c664efd965432, // [0][0xb4] + 0xb5517ed15d9d8743, 0xca3f8e196da80e3a, 0x4b8c9f413df695b1, 0x34e26f890dc31cc8, // [0][0xb8] + 0x7c339ba2c5dc31cc, 0x035d6b6af5e9b8b5, 0x82ee7a32a5b7233e, 0xfd808afa9582aa47, // [0][0xbc] + 0x4d364994d625e4da, 0x3258b95ce6106da3, 0xb3eba804b64ef628, 0xcc8558cc867b7f51, // [0][0xc0] + 0x8454ace74e645255, 0xfb3a5c2f7e51db2c, 0x7a894d772e0f40a7, 0x05e7bdbf1e3ac9de, // [0][0xc4] + 0xeb2aa520be311aaf, 0x944455e88e0493d6, 0x15f744b0de5a085d, 0x6a99b478ee6f8124, // [0][0xc8] + 0x224840532670ac20, 0x5d26b09b16452559, 0xdc95a1c3461bbed2, 0xa3fb510b762e37ab, // [0][0xcc] + 0x35d6b6af5e9b8b5b, 0x4ab846676eae0222, 0xcb0b573f3ef099a9, 0xb465a7f70ec510d0, // [0][0xd0] + 0xfcb453dcc6da3dd4, 0x83daa314f6efb4ad, 0x0269b24ca6b12f26, 0x7d0742849684a65f, // [0][0xd4] + 0x93ca5a1b368f752e, 0xeca4aad306bafc57, 0x6d17bb8b56e467dc, 0x12794b4366d1eea5, // [0][0xd8] + 0x5aa8bf68aecec3a1, 0x25c64fa09efb4ad8, 0xa4755ef8cea5d153, 0xdb1bae30fe90582a, // [0][0xdc] + 0xbcf7b7e3c7593bd8, 0xc399472bf76cb2a1, 0x422a5673a732292a, 0x3d44a6bb9707a053, // [0][0xe0] + 0x759552905f188d57, 0x0afba2586f2d042e, 0x8b48b3003f739fa5, 0xf42643c80f4616dc, // [0][0xe4] + 0x1aeb5b57af4dc5ad, 0x6585ab9f9f784cd4, 0xe436bac7cf26d75f, 0x9b584a0fff135e26, // [0][0xe8] + 0xd389be24370c7322, 0xace74eec0739fa5b, 0x2d545fb4576761d0, 0x523aaf7c6752e8a9, // [0][0xec] + 0xc41748d84fe75459, 0xbb79b8107fd2dd20, 0x3acaa9482f8c46ab, 0x45a459801fb9cfd2, // [0][0xf0] + 0x0d75adabd7a6e2d6, 0x721b5d63e7936baf, 0xf3a84c3bb7cdf024, 0x8cc6bcf387f8795d, // [0][0xf4] + 0x620ba46c27f3aa2c, 0x1d6554a417c62355, 0x9cd645fc4798b8de, 0xe3b8b53477ad31a7, // [0][0xf8] + 0xab69411fbfb21ca3, 0xd407b1d78f8795da, 0x55b4a08fdfd90e51, 0x2ada5047efec8728 // [0][0xfc] + }, + { + 0x0000000000000000, 0x8776a97d73bddf69, 0x3a3474a9bfec2db9, 0xbd42ddd4cc51f2d0, // [1][0x00] + 0x7468e9537fd85b72, 0xf31e402e0c65841b, 0x4e5c9dfac03476cb, 0xc92a3487b389a9a2, // [1][0x04] + 0xe8d1d2a6ffb0b6e4, 0x6fa77bdb8c0d698d, 0xd2e5a60f405c9b5d, 0x55930f7233e14434, // [1][0x08] + 0x9cb93bf58068ed96, 0x1bcf9288f3d532ff, 0xa68d4f5c3f84c02f, 0x21fbe6214c391f46, // [1][0x0c] + 0xe57a831ea7f6fea3, 0x620c2a63d44b21ca, 0xdf4ef7b7181ad31a, 0x58385eca6ba70c73, // [1][0x10] + 0x91126a4dd82ea5d1, 0x1664c330ab937ab8, 0xab261ee467c28868, 0x2c50b799147f5701, // [1][0x14] + 0x0dab51b858464847, 0x8addf8c52bfb972e, 0x379f2511e7aa65fe, 0xb0e98c6c9417ba97, // [1][0x18] + 0x79c3b8eb279e1335, 0xfeb511965423cc5c, 0x43f7cc4298723e8c, 0xc481653febcfe1e5, // [1][0x1c] + 0xfe2c206e177a6e2d, 0x795a891364c7b144, 0xc41854c7a8964394, 0x436efdbadb2b9cfd, // [1][0x20] + 0x8a44c93d68a2355f, 0x0d3260401b1fea36, 0xb070bd94d74e18e6, 0x370614e9a4f3c78f, // [1][0x24] + 0x16fdf2c8e8cad8c9, 0x918b5bb59b7707a0, 0x2cc986615726f570, 0xabbf2f1c249b2a19, // [1][0x28] + 0x62951b9b971283bb, 0xe5e3b2e6e4af5cd2, 0x58a16f3228feae02, 0xdfd7c64f5b43716b, // [1][0x2c] + 0x1b56a370b08c908e, 0x9c200a0dc3314fe7, 0x2162d7d90f60bd37, 0xa6147ea47cdd625e, // [1][0x30] + 0x6f3e4a23cf54cbfc, 0xe848e35ebce91495, 0x550a3e8a70b8e645, 0xd27c97f70305392c, // [1][0x34] + 0xf38771d64f3c266a, 0x74f1d8ab3c81f903, 0xc9b3057ff0d00bd3, 0x4ec5ac02836dd4ba, // [1][0x38] + 0x87ef988530e47d18, 0x009931f84359a271, 0xbddbec2c8f0850a1, 0x3aad4551fcb58fc8, // [1][0x3c] + 0xc881668f76634f31, 0x4ff7cff205de9058, 0xf2b51226c98f6288, 0x75c3bb5bba32bde1, // [1][0x40] + 0xbce98fdc09bb1443, 0x3b9f26a17a06cb2a, 0x86ddfb75b65739fa, 0x01ab5208c5eae693, // [1][0x44] + 0x2050b42989d3f9d5, 0xa7261d54fa6e26bc, 0x1a64c080363fd46c, 0x9d1269fd45820b05, // [1][0x48] + 0x54385d7af60ba2a7, 0xd34ef40785b67dce, 0x6e0c29d349e78f1e, 0xe97a80ae3a5a5077, // [1][0x4c] + 0x2dfbe591d195b192, 0xaa8d4ceca2286efb, 0x17cf91386e799c2b, 0x90b938451dc44342, // [1][0x50] + 0x59930cc2ae4deae0, 0xdee5a5bfddf03589, 0x63a7786b11a1c759, 0xe4d1d116621c1830, // [1][0x54] + 0xc52a37372e250776, 0x425c9e4a5d98d81f, 0xff1e439e91c92acf, 0x7868eae3e274f5a6, // [1][0x58] + 0xb142de6451fd5c04, 0x363477192240836d, 0x8b76aacdee1171bd, 0x0c0003b09dacaed4, // [1][0x5c] + 0x36ad46e16119211c, 0xb1dbef9c12a4fe75, 0x0c993248def50ca5, 0x8bef9b35ad48d3cc, // [1][0x60] + 0x42c5afb21ec17a6e, 0xc5b306cf6d7ca507, 0x78f1db1ba12d57d7, 0xff877266d29088be, // [1][0x64] + 0xde7c94479ea997f8, 0x590a3d3aed144891, 0xe448e0ee2145ba41, 0x633e499352f86528, // [1][0x68] + 0xaa147d14e171cc8a, 0x2d62d46992cc13e3, 0x902009bd5e9de133, 0x1756a0c02d203e5a, // [1][0x6c] + 0xd3d7c5ffc6efdfbf, 0x54a16c82b55200d6, 0xe9e3b1567903f206, 0x6e95182b0abe2d6f, // [1][0x70] + 0xa7bf2cacb93784cd, 0x20c985d1ca8a5ba4, 0x9d8b580506dba974, 0x1afdf1787566761d, // [1][0x74] + 0x3b061759395f695b, 0xbc70be244ae2b632, 0x013263f086b344e2, 0x8644ca8df50e9b8b, // [1][0x78] + 0x4f6efe0a46873229, 0xc8185777353aed40, 0x755a8aa3f96b1f90, 0xf22c23de8ad6c0f9, // [1][0x7c] + 0xa5dbeb4db4510d09, 0x22ad4230c7ecd260, 0x9fef9fe40bbd20b0, 0x189936997800ffd9, // [1][0x80] + 0xd1b3021ecb89567b, 0x56c5ab63b8348912, 0xeb8776b774657bc2, 0x6cf1dfca07d8a4ab, // [1][0x84] + 0x4d0a39eb4be1bbed, 0xca7c9096385c6484, 0x773e4d42f40d9654, 0xf048e43f87b0493d, // [1][0x88] + 0x3962d0b83439e09f, 0xbe1479c547843ff6, 0x0356a4118bd5cd26, 0x84200d6cf868124f, // [1][0x8c] + 0x40a1685313a7f3aa, 0xc7d7c12e601a2cc3, 0x7a951cfaac4bde13, 0xfde3b587dff6017a, // [1][0x90] + 0x34c981006c7fa8d8, 0xb3bf287d1fc277b1, 0x0efdf5a9d3938561, 0x898b5cd4a02e5a08, // [1][0x94] + 0xa870baf5ec17454e, 0x2f0613889faa9a27, 0x9244ce5c53fb68f7, 0x153267212046b79e, // [1][0x98] + 0xdc1853a693cf1e3c, 0x5b6efadbe072c155, 0xe62c270f2c233385, 0x615a8e725f9eecec, // [1][0x9c] + 0x5bf7cb23a32b6324, 0xdc81625ed096bc4d, 0x61c3bf8a1cc74e9d, 0xe6b516f76f7a91f4, // [1][0xa0] + 0x2f9f2270dcf33856, 0xa8e98b0daf4ee73f, 0x15ab56d9631f15ef, 0x92ddffa410a2ca86, // [1][0xa4] + 0xb32619855c9bd5c0, 0x3450b0f82f260aa9, 0x89126d2ce377f879, 0x0e64c45190ca2710, // [1][0xa8] + 0xc74ef0d623438eb2, 0x403859ab50fe51db, 0xfd7a847f9cafa30b, 0x7a0c2d02ef127c62, // [1][0xac] + 0xbe8d483d04dd9d87, 0x39fbe140776042ee, 0x84b93c94bb31b03e, 0x03cf95e9c88c6f57, // [1][0xb0] + 0xcae5a16e7b05c6f5, 0x4d93081308b8199c, 0xf0d1d5c7c4e9eb4c, 0x77a77cbab7543425, // [1][0xb4] + 0x565c9a9bfb6d2b63, 0xd12a33e688d0f40a, 0x6c68ee32448106da, 0xeb1e474f373cd9b3, // [1][0xb8] + 0x223473c884b57011, 0xa542dab5f708af78, 0x180007613b595da8, 0x9f76ae1c48e482c1, // [1][0xbc] + 0x6d5a8dc2c2324238, 0xea2c24bfb18f9d51, 0x576ef96b7dde6f81, 0xd01850160e63b0e8, // [1][0xc0] + 0x19326491bdea194a, 0x9e44cdecce57c623, 0x23061038020634f3, 0xa470b94571bbeb9a, // [1][0xc4] + 0x858b5f643d82f4dc, 0x02fdf6194e3f2bb5, 0xbfbf2bcd826ed965, 0x38c982b0f1d3060c, // [1][0xc8] + 0xf1e3b637425aafae, 0x76951f4a31e770c7, 0xcbd7c29efdb68217, 0x4ca16be38e0b5d7e, // [1][0xcc] + 0x88200edc65c4bc9b, 0x0f56a7a1167963f2, 0xb2147a75da289122, 0x3562d308a9954e4b, // [1][0xd0] + 0xfc48e78f1a1ce7e9, 0x7b3e4ef269a13880, 0xc67c9326a5f0ca50, 0x410a3a5bd64d1539, // [1][0xd4] + 0x60f1dc7a9a740a7f, 0xe7877507e9c9d516, 0x5ac5a8d3259827c6, 0xddb301ae5625f8af, // [1][0xd8] + 0x14993529e5ac510d, 0x93ef9c5496118e64, 0x2ead41805a407cb4, 0xa9dbe8fd29fda3dd, // [1][0xdc] + 0x9376adacd5482c15, 0x140004d1a6f5f37c, 0xa942d9056aa401ac, 0x2e3470781919dec5, // [1][0xe0] + 0xe71e44ffaa907767, 0x6068ed82d92da80e, 0xdd2a3056157c5ade, 0x5a5c992b66c185b7, // [1][0xe4] + 0x7ba77f0a2af89af1, 0xfcd1d67759454598, 0x41930ba39514b748, 0xc6e5a2dee6a96821, // [1][0xe8] + 0x0fcf96595520c183, 0x88b93f24269d1eea, 0x35fbe2f0eaccec3a, 0xb28d4b8d99713353, // [1][0xec] + 0x760c2eb272bed2b6, 0xf17a87cf01030ddf, 0x4c385a1bcd52ff0f, 0xcb4ef366beef2066, // [1][0xf0] + 0x0264c7e10d6689c4, 0x85126e9c7edb56ad, 0x3850b348b28aa47d, 0xbf261a35c1377b14, // [1][0xf4] + 0x9eddfc148d0e6452, 0x19ab5569feb3bb3b, 0xa4e988bd32e249eb, 0x239f21c0415f9682, // [1][0xf8] + 0xeab51547f2d63f20, 0x6dc3bc3a816be049, 0xd08161ee4d3a1299, 0x57f7c8933e87cdf0 // [1][0xfc] + }, + { + 0x0000000000000000, 0xff6e4e1f4e4038be, 0xca05ba6dc417e217, 0x356bf4728a57daa9, // [2][0x00] + 0xa0d25288d0b85745, 0x5fbc1c979ef86ffb, 0x6ad7e8e514afb552, 0x95b9a6fa5aef8dec, // [2][0x04] + 0x757d8342f9e73de1, 0x8a13cd5db7a7055f, 0xbf78392f3df0dff6, 0x4016773073b0e748, // [2][0x08] + 0xd5afd1ca295f6aa4, 0x2ac19fd5671f521a, 0x1faa6ba7ed4888b3, 0xe0c425b8a308b00d, // [2][0x0c] + 0xeafb0685f3ce7bc2, 0x1595489abd8e437c, 0x20febce837d999d5, 0xdf90f2f77999a16b, // [2][0x10] + 0x4a29540d23762c87, 0xb5471a126d361439, 0x802cee60e761ce90, 0x7f42a07fa921f62e, // [2][0x14] + 0x9f8685c70a294623, 0x60e8cbd844697e9d, 0x55833faace3ea434, 0xaaed71b5807e9c8a, // [2][0x18] + 0x3f54d74fda911166, 0xc03a995094d129d8, 0xf5516d221e86f371, 0x0a3f233d50c6cbcf, // [2][0x1c] + 0xe12f2b58bf0b64ef, 0x1e416547f14b5c51, 0x2b2a91357b1c86f8, 0xd444df2a355cbe46, // [2][0x20] + 0x41fd79d06fb333aa, 0xbe9337cf21f30b14, 0x8bf8c3bdaba4d1bd, 0x74968da2e5e4e903, // [2][0x24] + 0x9452a81a46ec590e, 0x6b3ce60508ac61b0, 0x5e57127782fbbb19, 0xa1395c68ccbb83a7, // [2][0x28] + 0x3480fa9296540e4b, 0xcbeeb48dd81436f5, 0xfe8540ff5243ec5c, 0x01eb0ee01c03d4e2, // [2][0x2c] + 0x0bd42ddd4cc51f2d, 0xf4ba63c202852793, 0xc1d197b088d2fd3a, 0x3ebfd9afc692c584, // [2][0x30] + 0xab067f559c7d4868, 0x5468314ad23d70d6, 0x6103c538586aaa7f, 0x9e6d8b27162a92c1, // [2][0x34] + 0x7ea9ae9fb52222cc, 0x81c7e080fb621a72, 0xb4ac14f27135c0db, 0x4bc25aed3f75f865, // [2][0x38] + 0xde7bfc17659a7589, 0x2115b2082bda4d37, 0x147e467aa18d979e, 0xeb100865efcdaf20, // [2][0x3c] + 0xf68770e226815ab5, 0x09e93efd68c1620b, 0x3c82ca8fe296b8a2, 0xc3ec8490acd6801c, // [2][0x40] + 0x5655226af6390df0, 0xa93b6c75b879354e, 0x9c509807322eefe7, 0x633ed6187c6ed759, // [2][0x44] + 0x83faf3a0df666754, 0x7c94bdbf91265fea, 0x49ff49cd1b718543, 0xb69107d25531bdfd, // [2][0x48] + 0x2328a1280fde3011, 0xdc46ef37419e08af, 0xe92d1b45cbc9d206, 0x1643555a8589eab8, // [2][0x4c] + 0x1c7c7667d54f2177, 0xe31238789b0f19c9, 0xd679cc0a1158c360, 0x291782155f18fbde, // [2][0x50] + 0xbcae24ef05f77632, 0x43c06af04bb74e8c, 0x76ab9e82c1e09425, 0x89c5d09d8fa0ac9b, // [2][0x54] + 0x6901f5252ca81c96, 0x966fbb3a62e82428, 0xa3044f48e8bffe81, 0x5c6a0157a6ffc63f, // [2][0x58] + 0xc9d3a7adfc104bd3, 0x36bde9b2b250736d, 0x03d61dc03807a9c4, 0xfcb853df7647917a, // [2][0x5c] + 0x17a85bba998a3e5a, 0xe8c615a5d7ca06e4, 0xddade1d75d9ddc4d, 0x22c3afc813dde4f3, // [2][0x60] + 0xb77a09324932691f, 0x4814472d077251a1, 0x7d7fb35f8d258b08, 0x8211fd40c365b3b6, // [2][0x64] + 0x62d5d8f8606d03bb, 0x9dbb96e72e2d3b05, 0xa8d06295a47ae1ac, 0x57be2c8aea3ad912, // [2][0x68] + 0xc2078a70b0d554fe, 0x3d69c46ffe956c40, 0x0802301d74c2b6e9, 0xf76c7e023a828e57, // [2][0x6c] + 0xfd535d3f6a444598, 0x023d132024047d26, 0x3756e752ae53a78f, 0xc838a94de0139f31, // [2][0x70] + 0x5d810fb7bafc12dd, 0xa2ef41a8f4bc2a63, 0x9784b5da7eebf0ca, 0x68eafbc530abc874, // [2][0x74] + 0x882ede7d93a37879, 0x77409062dde340c7, 0x422b641057b49a6e, 0xbd452a0f19f4a2d0, // [2][0x78] + 0x28fc8cf5431b2f3c, 0xd792c2ea0d5b1782, 0xe2f93698870ccd2b, 0x1d977887c94cf595, // [2][0x7c] + 0xd9d7c79715952601, 0x26b989885bd51ebf, 0x13d27dfad182c416, 0xecbc33e59fc2fca8, // [2][0x80] + 0x7905951fc52d7144, 0x866bdb008b6d49fa, 0xb3002f72013a9353, 0x4c6e616d4f7aabed, // [2][0x84] + 0xacaa44d5ec721be0, 0x53c40acaa232235e, 0x66affeb82865f9f7, 0x99c1b0a76625c149, // [2][0x88] + 0x0c78165d3cca4ca5, 0xf3165842728a741b, 0xc67dac30f8ddaeb2, 0x3913e22fb69d960c, // [2][0x8c] + 0x332cc112e65b5dc3, 0xcc428f0da81b657d, 0xf9297b7f224cbfd4, 0x064735606c0c876a, // [2][0x90] + 0x93fe939a36e30a86, 0x6c90dd8578a33238, 0x59fb29f7f2f4e891, 0xa69567e8bcb4d02f, // [2][0x94] + 0x465142501fbc6022, 0xb93f0c4f51fc589c, 0x8c54f83ddbab8235, 0x733ab62295ebba8b, // [2][0x98] + 0xe68310d8cf043767, 0x19ed5ec781440fd9, 0x2c86aab50b13d570, 0xd3e8e4aa4553edce, // [2][0x9c] + 0x38f8eccfaa9e42ee, 0xc796a2d0e4de7a50, 0xf2fd56a26e89a0f9, 0x0d9318bd20c99847, // [2][0xa0] + 0x982abe477a2615ab, 0x6744f05834662d15, 0x522f042abe31f7bc, 0xad414a35f071cf02, // [2][0xa4] + 0x4d856f8d53797f0f, 0xb2eb21921d3947b1, 0x8780d5e0976e9d18, 0x78ee9bffd92ea5a6, // [2][0xa8] + 0xed573d0583c1284a, 0x1239731acd8110f4, 0x2752876847d6ca5d, 0xd83cc9770996f2e3, // [2][0xac] + 0xd203ea4a5950392c, 0x2d6da45517100192, 0x180650279d47db3b, 0xe7681e38d307e385, // [2][0xb0] + 0x72d1b8c289e86e69, 0x8dbff6ddc7a856d7, 0xb8d402af4dff8c7e, 0x47ba4cb003bfb4c0, // [2][0xb4] + 0xa77e6908a0b704cd, 0x58102717eef73c73, 0x6d7bd36564a0e6da, 0x92159d7a2ae0de64, // [2][0xb8] + 0x07ac3b80700f5388, 0xf8c2759f3e4f6b36, 0xcda981edb418b19f, 0x32c7cff2fa588921, // [2][0xbc] + 0x2f50b77533147cb4, 0xd03ef96a7d54440a, 0xe5550d18f7039ea3, 0x1a3b4307b943a61d, // [2][0xc0] + 0x8f82e5fde3ac2bf1, 0x70ecabe2adec134f, 0x45875f9027bbc9e6, 0xbae9118f69fbf158, // [2][0xc4] + 0x5a2d3437caf34155, 0xa5437a2884b379eb, 0x90288e5a0ee4a342, 0x6f46c04540a49bfc, // [2][0xc8] + 0xfaff66bf1a4b1610, 0x059128a0540b2eae, 0x30fadcd2de5cf407, 0xcf9492cd901cccb9, // [2][0xcc] + 0xc5abb1f0c0da0776, 0x3ac5ffef8e9a3fc8, 0x0fae0b9d04cde561, 0xf0c045824a8ddddf, // [2][0xd0] + 0x6579e37810625033, 0x9a17ad675e22688d, 0xaf7c5915d475b224, 0x5012170a9a358a9a, // [2][0xd4] + 0xb0d632b2393d3a97, 0x4fb87cad777d0229, 0x7ad388dffd2ad880, 0x85bdc6c0b36ae03e, // [2][0xd8] + 0x1004603ae9856dd2, 0xef6a2e25a7c5556c, 0xda01da572d928fc5, 0x256f944863d2b77b, // [2][0xdc] + 0xce7f9c2d8c1f185b, 0x3111d232c25f20e5, 0x047a26404808fa4c, 0xfb14685f0648c2f2, // [2][0xe0] + 0x6eadcea55ca74f1e, 0x91c380ba12e777a0, 0xa4a874c898b0ad09, 0x5bc63ad7d6f095b7, // [2][0xe4] + 0xbb021f6f75f825ba, 0x446c51703bb81d04, 0x7107a502b1efc7ad, 0x8e69eb1dffafff13, // [2][0xe8] + 0x1bd04de7a54072ff, 0xe4be03f8eb004a41, 0xd1d5f78a615790e8, 0x2ebbb9952f17a856, // [2][0xec] + 0x24849aa87fd16399, 0xdbead4b731915b27, 0xee8120c5bbc6818e, 0x11ef6edaf586b930, // [2][0xf0] + 0x8456c820af6934dc, 0x7b38863fe1290c62, 0x4e53724d6b7ed6cb, 0xb13d3c52253eee75, // [2][0xf4] + 0x51f919ea86365e78, 0xae9757f5c87666c6, 0x9bfca3874221bc6f, 0x6492ed980c6184d1, // [2][0xf8] + 0xf12b4b62568e093d, 0x0e45057d18ce3183, 0x3b2ef10f9299eb2a, 0xc440bf10dcd9d394 // [2][0xfc] + }, + { + 0x0000000000000000, 0x8211147cbaf96306, 0x30fb0eaa2d655567, 0xb2ea1ad6979c3661, // [3][0x00] + 0x61f61d545acaaace, 0xe3e70928e033c9c8, 0x510d13fe77afffa9, 0xd31c0782cd569caf, // [3][0x04] + 0xc3ec3aa8b595559c, 0x41fd2ed40f6c369a, 0xf317340298f000fb, 0x7106207e220963fd, // [3][0x08] + 0xa21a27fcef5fff52, 0x200b338055a69c54, 0x92e12956c23aaa35, 0x10f03d2a78c3c933, // [3][0x0c] + 0xb301530233bd3853, 0x3110477e89445b55, 0x83fa5da81ed86d34, 0x01eb49d4a4210e32, // [3][0x10] + 0xd2f74e566977929d, 0x50e65a2ad38ef19b, 0xe20c40fc4412c7fa, 0x601d5480feeba4fc, // [3][0x14] + 0x70ed69aa86286dcf, 0xf2fc7dd63cd10ec9, 0x40166700ab4d38a8, 0xc207737c11b45bae, // [3][0x18] + 0x111b74fedce2c701, 0x930a6082661ba407, 0x21e07a54f1879266, 0xa3f16e284b7ef160, // [3][0x1c] + 0x52db80573fede3cd, 0xd0ca942b851480cb, 0x62208efd1288b6aa, 0xe0319a81a871d5ac, // [3][0x20] + 0x332d9d0365274903, 0xb13c897fdfde2a05, 0x03d693a948421c64, 0x81c787d5f2bb7f62, // [3][0x24] + 0x9137baff8a78b651, 0x1326ae833081d557, 0xa1ccb455a71de336, 0x23dda0291de48030, // [3][0x28] + 0xf0c1a7abd0b21c9f, 0x72d0b3d76a4b7f99, 0xc03aa901fdd749f8, 0x422bbd7d472e2afe, // [3][0x2c] + 0xe1dad3550c50db9e, 0x63cbc729b6a9b898, 0xd121ddff21358ef9, 0x5330c9839bccedff, // [3][0x30] + 0x802cce01569a7150, 0x023dda7dec631256, 0xb0d7c0ab7bff2437, 0x32c6d4d7c1064731, // [3][0x34] + 0x2236e9fdb9c58e02, 0xa027fd81033ced04, 0x12cde75794a0db65, 0x90dcf32b2e59b863, // [3][0x38] + 0x43c0f4a9e30f24cc, 0xc1d1e0d559f647ca, 0x733bfa03ce6a71ab, 0xf12aee7f749312ad, // [3][0x3c] + 0xa5b700ae7fdbc79a, 0x27a614d2c522a49c, 0x954c0e0452be92fd, 0x175d1a78e847f1fb, // [3][0x40] + 0xc4411dfa25116d54, 0x465009869fe80e52, 0xf4ba135008743833, 0x76ab072cb28d5b35, // [3][0x44] + 0x665b3a06ca4e9206, 0xe44a2e7a70b7f100, 0x56a034ace72bc761, 0xd4b120d05dd2a467, // [3][0x48] + 0x07ad2752908438c8, 0x85bc332e2a7d5bce, 0x375629f8bde16daf, 0xb5473d8407180ea9, // [3][0x4c] + 0x16b653ac4c66ffc9, 0x94a747d0f69f9ccf, 0x264d5d066103aaae, 0xa45c497adbfac9a8, // [3][0x50] + 0x77404ef816ac5507, 0xf5515a84ac553601, 0x47bb40523bc90060, 0xc5aa542e81306366, // [3][0x54] + 0xd55a6904f9f3aa55, 0x574b7d78430ac953, 0xe5a167aed496ff32, 0x67b073d26e6f9c34, // [3][0x58] + 0xb4ac7450a339009b, 0x36bd602c19c0639d, 0x84577afa8e5c55fc, 0x06466e8634a536fa, // [3][0x5c] + 0xf76c80f940362457, 0x757d9485facf4751, 0xc7978e536d537130, 0x45869a2fd7aa1236, // [3][0x60] + 0x969a9dad1afc8e99, 0x148b89d1a005ed9f, 0xa66193073799dbfe, 0x2470877b8d60b8f8, // [3][0x64] + 0x3480ba51f5a371cb, 0xb691ae2d4f5a12cd, 0x047bb4fbd8c624ac, 0x866aa087623f47aa, // [3][0x68] + 0x5576a705af69db05, 0xd767b3791590b803, 0x658da9af820c8e62, 0xe79cbdd338f5ed64, // [3][0x6c] + 0x446dd3fb738b1c04, 0xc67cc787c9727f02, 0x7496dd515eee4963, 0xf687c92de4172a65, // [3][0x70] + 0x259bceaf2941b6ca, 0xa78adad393b8d5cc, 0x1560c0050424e3ad, 0x9771d479bedd80ab, // [3][0x74] + 0x8781e953c61e4998, 0x0590fd2f7ce72a9e, 0xb77ae7f9eb7b1cff, 0x356bf38551827ff9, // [3][0x78] + 0xe677f4079cd4e356, 0x6466e07b262d8050, 0xd68cfaadb1b1b631, 0x549deed10b48d537, // [3][0x7c] + 0x7fb7270fa7201c5f, 0xfda633731dd97f59, 0x4f4c29a58a454938, 0xcd5d3dd930bc2a3e, // [3][0x80] + 0x1e413a5bfdeab691, 0x9c502e274713d597, 0x2eba34f1d08fe3f6, 0xacab208d6a7680f0, // [3][0x84] + 0xbc5b1da712b549c3, 0x3e4a09dba84c2ac5, 0x8ca0130d3fd01ca4, 0x0eb1077185297fa2, // [3][0x88] + 0xddad00f3487fe30d, 0x5fbc148ff286800b, 0xed560e59651ab66a, 0x6f471a25dfe3d56c, // [3][0x8c] + 0xccb6740d949d240c, 0x4ea760712e64470a, 0xfc4d7aa7b9f8716b, 0x7e5c6edb0301126d, // [3][0x90] + 0xad406959ce578ec2, 0x2f517d2574aeedc4, 0x9dbb67f3e332dba5, 0x1faa738f59cbb8a3, // [3][0x94] + 0x0f5a4ea521087190, 0x8d4b5ad99bf11296, 0x3fa1400f0c6d24f7, 0xbdb05473b69447f1, // [3][0x98] + 0x6eac53f17bc2db5e, 0xecbd478dc13bb858, 0x5e575d5b56a78e39, 0xdc464927ec5eed3f, // [3][0x9c] + 0x2d6ca75898cdff92, 0xaf7db32422349c94, 0x1d97a9f2b5a8aaf5, 0x9f86bd8e0f51c9f3, // [3][0xa0] + 0x4c9aba0cc207555c, 0xce8bae7078fe365a, 0x7c61b4a6ef62003b, 0xfe70a0da559b633d, // [3][0xa4] + 0xee809df02d58aa0e, 0x6c91898c97a1c908, 0xde7b935a003dff69, 0x5c6a8726bac49c6f, // [3][0xa8] + 0x8f7680a4779200c0, 0x0d6794d8cd6b63c6, 0xbf8d8e0e5af755a7, 0x3d9c9a72e00e36a1, // [3][0xac] + 0x9e6df45aab70c7c1, 0x1c7ce0261189a4c7, 0xae96faf0861592a6, 0x2c87ee8c3cecf1a0, // [3][0xb0] + 0xff9be90ef1ba6d0f, 0x7d8afd724b430e09, 0xcf60e7a4dcdf3868, 0x4d71f3d866265b6e, // [3][0xb4] + 0x5d81cef21ee5925d, 0xdf90da8ea41cf15b, 0x6d7ac0583380c73a, 0xef6bd4248979a43c, // [3][0xb8] + 0x3c77d3a6442f3893, 0xbe66c7dafed65b95, 0x0c8cdd0c694a6df4, 0x8e9dc970d3b30ef2, // [3][0xbc] + 0xda0027a1d8fbdbc5, 0x581133dd6202b8c3, 0xeafb290bf59e8ea2, 0x68ea3d774f67eda4, // [3][0xc0] + 0xbbf63af58231710b, 0x39e72e8938c8120d, 0x8b0d345faf54246c, 0x091c202315ad476a, // [3][0xc4] + 0x19ec1d096d6e8e59, 0x9bfd0975d797ed5f, 0x291713a3400bdb3e, 0xab0607dffaf2b838, // [3][0xc8] + 0x781a005d37a42497, 0xfa0b14218d5d4791, 0x48e10ef71ac171f0, 0xcaf01a8ba03812f6, // [3][0xcc] + 0x690174a3eb46e396, 0xeb1060df51bf8090, 0x59fa7a09c623b6f1, 0xdbeb6e757cdad5f7, // [3][0xd0] + 0x08f769f7b18c4958, 0x8ae67d8b0b752a5e, 0x380c675d9ce91c3f, 0xba1d732126107f39, // [3][0xd4] + 0xaaed4e0b5ed3b60a, 0x28fc5a77e42ad50c, 0x9a1640a173b6e36d, 0x180754ddc94f806b, // [3][0xd8] + 0xcb1b535f04191cc4, 0x490a4723bee07fc2, 0xfbe05df5297c49a3, 0x79f1498993852aa5, // [3][0xdc] + 0x88dba7f6e7163808, 0x0acab38a5def5b0e, 0xb820a95cca736d6f, 0x3a31bd20708a0e69, // [3][0xe0] + 0xe92dbaa2bddc92c6, 0x6b3caede0725f1c0, 0xd9d6b40890b9c7a1, 0x5bc7a0742a40a4a7, // [3][0xe4] + 0x4b379d5e52836d94, 0xc9268922e87a0e92, 0x7bcc93f47fe638f3, 0xf9dd8788c51f5bf5, // [3][0xe8] + 0x2ac1800a0849c75a, 0xa8d09476b2b0a45c, 0x1a3a8ea0252c923d, 0x982b9adc9fd5f13b, // [3][0xec] + 0x3bdaf4f4d4ab005b, 0xb9cbe0886e52635d, 0x0b21fa5ef9ce553c, 0x8930ee224337363a, // [3][0xf0] + 0x5a2ce9a08e61aa95, 0xd83dfddc3498c993, 0x6ad7e70aa304fff2, 0xe8c6f37619fd9cf4, // [3][0xf4] + 0xf836ce5c613e55c7, 0x7a27da20dbc736c1, 0xc8cdc0f64c5b00a0, 0x4adcd48af6a263a6, // [3][0xf8] + 0x99c0d3083bf4ff09, 0x1bd1c774810d9c0f, 0xa93bdda21691aa6e, 0x2b2ac9deac68c968 // [3][0xfc] + }, + { + 0x0000000000000000, 0x373d15f784905d1e, 0x6e7a2bef0920ba3c, 0x59473e188db0e722, // [4][0x00] + 0xdcf457de12417478, 0xebc9422996d12966, 0xb28e7c311b61ce44, 0x85b369c69ff1935a, // [4][0x04] + 0x8d3189ef7c157b9b, 0xba0c9c18f8852685, 0xe34ba2007535c1a7, 0xd476b7f7f1a59cb9, // [4][0x08] + 0x51c5de316e540fe3, 0x66f8cbc6eac452fd, 0x3fbff5de6774b5df, 0x0882e029e3e4e8c1, // [4][0x0c] + 0x2eba358da0bd645d, 0x1987207a242d3943, 0x40c01e62a99dde61, 0x77fd0b952d0d837f, // [4][0x10] + 0xf24e6253b2fc1025, 0xc57377a4366c4d3b, 0x9c3449bcbbdcaa19, 0xab095c4b3f4cf707, // [4][0x14] + 0xa38bbc62dca81fc6, 0x94b6a995583842d8, 0xcdf1978dd588a5fa, 0xfacc827a5118f8e4, // [4][0x18] + 0x7f7febbccee96bbe, 0x4842fe4b4a7936a0, 0x1105c053c7c9d182, 0x2638d5a443598c9c, // [4][0x1c] + 0x5d746b1b417ac8ba, 0x6a497eecc5ea95a4, 0x330e40f4485a7286, 0x04335503ccca2f98, // [4][0x20] + 0x81803cc5533bbcc2, 0xb6bd2932d7abe1dc, 0xeffa172a5a1b06fe, 0xd8c702ddde8b5be0, // [4][0x24] + 0xd045e2f43d6fb321, 0xe778f703b9ffee3f, 0xbe3fc91b344f091d, 0x8902dcecb0df5403, // [4][0x28] + 0x0cb1b52a2f2ec759, 0x3b8ca0ddabbe9a47, 0x62cb9ec5260e7d65, 0x55f68b32a29e207b, // [4][0x2c] + 0x73ce5e96e1c7ace7, 0x44f34b616557f1f9, 0x1db47579e8e716db, 0x2a89608e6c774bc5, // [4][0x30] + 0xaf3a0948f386d89f, 0x98071cbf77168581, 0xc14022a7faa662a3, 0xf67d37507e363fbd, // [4][0x34] + 0xfeffd7799dd2d77c, 0xc9c2c28e19428a62, 0x9085fc9694f26d40, 0xa7b8e9611062305e, // [4][0x38] + 0x220b80a78f93a304, 0x153695500b03fe1a, 0x4c71ab4886b31938, 0x7b4cbebf02234426, // [4][0x3c] + 0xbae8d63682f59174, 0x8dd5c3c10665cc6a, 0xd492fdd98bd52b48, 0xe3afe82e0f457656, // [4][0x40] + 0x661c81e890b4e50c, 0x5121941f1424b812, 0x0866aa0799945f30, 0x3f5bbff01d04022e, // [4][0x44] + 0x37d95fd9fee0eaef, 0x00e44a2e7a70b7f1, 0x59a37436f7c050d3, 0x6e9e61c173500dcd, // [4][0x48] + 0xeb2d0807eca19e97, 0xdc101df06831c389, 0x855723e8e58124ab, 0xb26a361f611179b5, // [4][0x4c] + 0x9452e3bb2248f529, 0xa36ff64ca6d8a837, 0xfa28c8542b684f15, 0xcd15dda3aff8120b, // [4][0x50] + 0x48a6b46530098151, 0x7f9ba192b499dc4f, 0x26dc9f8a39293b6d, 0x11e18a7dbdb96673, // [4][0x54] + 0x19636a545e5d8eb2, 0x2e5e7fa3dacdd3ac, 0x771941bb577d348e, 0x4024544cd3ed6990, // [4][0x58] + 0xc5973d8a4c1cfaca, 0xf2aa287dc88ca7d4, 0xabed1665453c40f6, 0x9cd00392c1ac1de8, // [4][0x5c] + 0xe79cbd2dc38f59ce, 0xd0a1a8da471f04d0, 0x89e696c2caafe3f2, 0xbedb83354e3fbeec, // [4][0x60] + 0x3b68eaf3d1ce2db6, 0x0c55ff04555e70a8, 0x5512c11cd8ee978a, 0x622fd4eb5c7eca94, // [4][0x64] + 0x6aad34c2bf9a2255, 0x5d9021353b0a7f4b, 0x04d71f2db6ba9869, 0x33ea0ada322ac577, // [4][0x68] + 0xb659631caddb562d, 0x816476eb294b0b33, 0xd82348f3a4fbec11, 0xef1e5d04206bb10f, // [4][0x6c] + 0xc92688a063323d93, 0xfe1b9d57e7a2608d, 0xa75ca34f6a1287af, 0x9061b6b8ee82dab1, // [4][0x70] + 0x15d2df7e717349eb, 0x22efca89f5e314f5, 0x7ba8f4917853f3d7, 0x4c95e166fcc3aec9, // [4][0x74] + 0x4417014f1f274608, 0x732a14b89bb71b16, 0x2a6d2aa01607fc34, 0x1d503f579297a12a, // [4][0x78] + 0x98e356910d663270, 0xafde436689f66f6e, 0xf6997d7e0446884c, 0xc1a4688980d6d552, // [4][0x7c] + 0x41088a3e5d7cb183, 0x76359fc9d9ecec9d, 0x2f72a1d1545c0bbf, 0x184fb426d0cc56a1, // [4][0x80] + 0x9dfcdde04f3dc5fb, 0xaac1c817cbad98e5, 0xf386f60f461d7fc7, 0xc4bbe3f8c28d22d9, // [4][0x84] + 0xcc3903d12169ca18, 0xfb041626a5f99706, 0xa243283e28497024, 0x957e3dc9acd92d3a, // [4][0x88] + 0x10cd540f3328be60, 0x27f041f8b7b8e37e, 0x7eb77fe03a08045c, 0x498a6a17be985942, // [4][0x8c] + 0x6fb2bfb3fdc1d5de, 0x588faa44795188c0, 0x01c8945cf4e16fe2, 0x36f581ab707132fc, // [4][0x90] + 0xb346e86def80a1a6, 0x847bfd9a6b10fcb8, 0xdd3cc382e6a01b9a, 0xea01d67562304684, // [4][0x94] + 0xe283365c81d4ae45, 0xd5be23ab0544f35b, 0x8cf91db388f41479, 0xbbc408440c644967, // [4][0x98] + 0x3e7761829395da3d, 0x094a747517058723, 0x500d4a6d9ab56001, 0x67305f9a1e253d1f, // [4][0x9c] + 0x1c7ce1251c067939, 0x2b41f4d298962427, 0x7206caca1526c305, 0x453bdf3d91b69e1b, // [4][0xa0] + 0xc088b6fb0e470d41, 0xf7b5a30c8ad7505f, 0xaef29d140767b77d, 0x99cf88e383f7ea63, // [4][0xa4] + 0x914d68ca601302a2, 0xa6707d3de4835fbc, 0xff3743256933b89e, 0xc80a56d2eda3e580, // [4][0xa8] + 0x4db93f14725276da, 0x7a842ae3f6c22bc4, 0x23c314fb7b72cce6, 0x14fe010cffe291f8, // [4][0xac] + 0x32c6d4a8bcbb1d64, 0x05fbc15f382b407a, 0x5cbcff47b59ba758, 0x6b81eab0310bfa46, // [4][0xb0] + 0xee328376aefa691c, 0xd90f96812a6a3402, 0x8048a899a7dad320, 0xb775bd6e234a8e3e, // [4][0xb4] + 0xbff75d47c0ae66ff, 0x88ca48b0443e3be1, 0xd18d76a8c98edcc3, 0xe6b0635f4d1e81dd, // [4][0xb8] + 0x63030a99d2ef1287, 0x543e1f6e567f4f99, 0x0d792176dbcfa8bb, 0x3a4434815f5ff5a5, // [4][0xbc] + 0xfbe05c08df8920f7, 0xccdd49ff5b197de9, 0x959a77e7d6a99acb, 0xa2a762105239c7d5, // [4][0xc0] + 0x27140bd6cdc8548f, 0x10291e2149580991, 0x496e2039c4e8eeb3, 0x7e5335ce4078b3ad, // [4][0xc4] + 0x76d1d5e7a39c5b6c, 0x41ecc010270c0672, 0x18abfe08aabce150, 0x2f96ebff2e2cbc4e, // [4][0xc8] + 0xaa258239b1dd2f14, 0x9d1897ce354d720a, 0xc45fa9d6b8fd9528, 0xf362bc213c6dc836, // [4][0xcc] + 0xd55a69857f3444aa, 0xe2677c72fba419b4, 0xbb20426a7614fe96, 0x8c1d579df284a388, // [4][0xd0] + 0x09ae3e5b6d7530d2, 0x3e932bace9e56dcc, 0x67d415b464558aee, 0x50e90043e0c5d7f0, // [4][0xd4] + 0x586be06a03213f31, 0x6f56f59d87b1622f, 0x3611cb850a01850d, 0x012cde728e91d813, // [4][0xd8] + 0x849fb7b411604b49, 0xb3a2a24395f01657, 0xeae59c5b1840f175, 0xddd889ac9cd0ac6b, // [4][0xdc] + 0xa69437139ef3e84d, 0x91a922e41a63b553, 0xc8ee1cfc97d35271, 0xffd3090b13430f6f, // [4][0xe0] + 0x7a6060cd8cb29c35, 0x4d5d753a0822c12b, 0x141a4b2285922609, 0x23275ed501027b17, // [4][0xe4] + 0x2ba5befce2e693d6, 0x1c98ab0b6676cec8, 0x45df9513ebc629ea, 0x72e280e46f5674f4, // [4][0xe8] + 0xf751e922f0a7e7ae, 0xc06cfcd57437bab0, 0x992bc2cdf9875d92, 0xae16d73a7d17008c, // [4][0xec] + 0x882e029e3e4e8c10, 0xbf131769baded10e, 0xe6542971376e362c, 0xd1693c86b3fe6b32, // [4][0xf0] + 0x54da55402c0ff868, 0x63e740b7a89fa576, 0x3aa07eaf252f4254, 0x0d9d6b58a1bf1f4a, // [4][0xf4] + 0x051f8b71425bf78b, 0x32229e86c6cbaa95, 0x6b65a09e4b7b4db7, 0x5c58b569cfeb10a9, // [4][0xf8] + 0xd9ebdcaf501a83f3, 0xeed6c958d48adeed, 0xb791f740593a39cf, 0x80ace2b7ddaa64d1 // [4][0xfc] + }, + { + 0x0000000000000000, 0xe9742a79ef04a5d4, 0xe63172a0869ed8c3, 0x0f4558d9699a7d17, // [5][0x00] + 0xf8bbc31255aa22ed, 0x11cfe96bbaae8739, 0x1e8ab1b2d334fa2e, 0xf7fe9bcb3c305ffa, // [5][0x04] + 0xc5aea077f3c3d6b1, 0x2cda8a0e1cc77365, 0x239fd2d7755d0e72, 0xcaebf8ae9a59aba6, // [5][0x08] + 0x3d156365a669f45c, 0xd461491c496d5188, 0xdb2411c520f72c9f, 0x32503bbccff3894b, // [5][0x0c] + 0xbf8466bcbf103e09, 0x56f04cc550149bdd, 0x59b5141c398ee6ca, 0xb0c13e65d68a431e, // [5][0x10] + 0x473fa5aeeaba1ce4, 0xae4b8fd705beb930, 0xa10ed70e6c24c427, 0x487afd77832061f3, // [5][0x14] + 0x7a2ac6cb4cd3e8b8, 0x935eecb2a3d74d6c, 0x9c1bb46bca4d307b, 0x756f9e12254995af, // [5][0x18] + 0x829105d91979ca55, 0x6be52fa0f67d6f81, 0x64a077799fe71296, 0x8dd45d0070e3b742, // [5][0x1c] + 0x4bd1eb2a26b7ef79, 0xa2a5c153c9b34aad, 0xade0998aa02937ba, 0x4494b3f34f2d926e, // [5][0x20] + 0xb36a2838731dcd94, 0x5a1e02419c196840, 0x555b5a98f5831557, 0xbc2f70e11a87b083, // [5][0x24] + 0x8e7f4b5dd57439c8, 0x670b61243a709c1c, 0x684e39fd53eae10b, 0x813a1384bcee44df, // [5][0x28] + 0x76c4884f80de1b25, 0x9fb0a2366fdabef1, 0x90f5faef0640c3e6, 0x7981d096e9446632, // [5][0x2c] + 0xf4558d9699a7d170, 0x1d21a7ef76a374a4, 0x1264ff361f3909b3, 0xfb10d54ff03dac67, // [5][0x30] + 0x0cee4e84cc0df39d, 0xe59a64fd23095649, 0xeadf3c244a932b5e, 0x03ab165da5978e8a, // [5][0x34] + 0x31fb2de16a6407c1, 0xd88f07988560a215, 0xd7ca5f41ecfadf02, 0x3ebe753803fe7ad6, // [5][0x38] + 0xc940eef33fce252c, 0x2034c48ad0ca80f8, 0x2f719c53b950fdef, 0xc605b62a5654583b, // [5][0x3c] + 0x97a3d6544d6fdef2, 0x7ed7fc2da26b7b26, 0x7192a4f4cbf10631, 0x98e68e8d24f5a3e5, // [5][0x40] + 0x6f18154618c5fc1f, 0x866c3f3ff7c159cb, 0x892967e69e5b24dc, 0x605d4d9f715f8108, // [5][0x44] + 0x520d7623beac0843, 0xbb795c5a51a8ad97, 0xb43c04833832d080, 0x5d482efad7367554, // [5][0x48] + 0xaab6b531eb062aae, 0x43c29f4804028f7a, 0x4c87c7916d98f26d, 0xa5f3ede8829c57b9, // [5][0x4c] + 0x2827b0e8f27fe0fb, 0xc1539a911d7b452f, 0xce16c24874e13838, 0x2762e8319be59dec, // [5][0x50] + 0xd09c73faa7d5c216, 0x39e8598348d167c2, 0x36ad015a214b1ad5, 0xdfd92b23ce4fbf01, // [5][0x54] + 0xed89109f01bc364a, 0x04fd3ae6eeb8939e, 0x0bb8623f8722ee89, 0xe2cc484668264b5d, // [5][0x58] + 0x1532d38d541614a7, 0xfc46f9f4bb12b173, 0xf303a12dd288cc64, 0x1a778b543d8c69b0, // [5][0x5c] + 0xdc723d7e6bd8318b, 0x3506170784dc945f, 0x3a434fdeed46e948, 0xd33765a702424c9c, // [5][0x60] + 0x24c9fe6c3e721366, 0xcdbdd415d176b6b2, 0xc2f88cccb8eccba5, 0x2b8ca6b557e86e71, // [5][0x64] + 0x19dc9d09981be73a, 0xf0a8b770771f42ee, 0xffedefa91e853ff9, 0x1699c5d0f1819a2d, // [5][0x68] + 0xe1675e1bcdb1c5d7, 0x0813746222b56003, 0x07562cbb4b2f1d14, 0xee2206c2a42bb8c0, // [5][0x6c] + 0x63f65bc2d4c80f82, 0x8a8271bb3bccaa56, 0x85c729625256d741, 0x6cb3031bbd527295, // [5][0x70] + 0x9b4d98d081622d6f, 0x7239b2a96e6688bb, 0x7d7cea7007fcf5ac, 0x9408c009e8f85078, // [5][0x74] + 0xa658fbb5270bd933, 0x4f2cd1ccc80f7ce7, 0x40698915a19501f0, 0xa91da36c4e91a424, // [5][0x78] + 0x5ee338a772a1fbde, 0xb79712de9da55e0a, 0xb8d24a07f43f231d, 0x51a6607e1b3b86c9, // [5][0x7c] + 0x1b9e8afbc2482e8f, 0xf2eaa0822d4c8b5b, 0xfdaff85b44d6f64c, 0x14dbd222abd25398, // [5][0x80] + 0xe32549e997e20c62, 0x0a51639078e6a9b6, 0x05143b49117cd4a1, 0xec601130fe787175, // [5][0x84] + 0xde302a8c318bf83e, 0x374400f5de8f5dea, 0x3801582cb71520fd, 0xd175725558118529, // [5][0x88] + 0x268be99e6421dad3, 0xcfffc3e78b257f07, 0xc0ba9b3ee2bf0210, 0x29ceb1470dbba7c4, // [5][0x8c] + 0xa41aec477d581086, 0x4d6ec63e925cb552, 0x422b9ee7fbc6c845, 0xab5fb49e14c26d91, // [5][0x90] + 0x5ca12f5528f2326b, 0xb5d5052cc7f697bf, 0xba905df5ae6ceaa8, 0x53e4778c41684f7c, // [5][0x94] + 0x61b44c308e9bc637, 0x88c06649619f63e3, 0x87853e9008051ef4, 0x6ef114e9e701bb20, // [5][0x98] + 0x990f8f22db31e4da, 0x707ba55b3435410e, 0x7f3efd825daf3c19, 0x964ad7fbb2ab99cd, // [5][0x9c] + 0x504f61d1e4ffc1f6, 0xb93b4ba80bfb6422, 0xb67e137162611935, 0x5f0a39088d65bce1, // [5][0xa0] + 0xa8f4a2c3b155e31b, 0x418088ba5e5146cf, 0x4ec5d06337cb3bd8, 0xa7b1fa1ad8cf9e0c, // [5][0xa4] + 0x95e1c1a6173c1747, 0x7c95ebdff838b293, 0x73d0b30691a2cf84, 0x9aa4997f7ea66a50, // [5][0xa8] + 0x6d5a02b4429635aa, 0x842e28cdad92907e, 0x8b6b7014c408ed69, 0x621f5a6d2b0c48bd, // [5][0xac] + 0xefcb076d5befffff, 0x06bf2d14b4eb5a2b, 0x09fa75cddd71273c, 0xe08e5fb4327582e8, // [5][0xb0] + 0x1770c47f0e45dd12, 0xfe04ee06e14178c6, 0xf141b6df88db05d1, 0x18359ca667dfa005, // [5][0xb4] + 0x2a65a71aa82c294e, 0xc3118d6347288c9a, 0xcc54d5ba2eb2f18d, 0x2520ffc3c1b65459, // [5][0xb8] + 0xd2de6408fd860ba3, 0x3baa4e711282ae77, 0x34ef16a87b18d360, 0xdd9b3cd1941c76b4, // [5][0xbc] + 0x8c3d5caf8f27f07d, 0x654976d6602355a9, 0x6a0c2e0f09b928be, 0x83780476e6bd8d6a, // [5][0xc0] + 0x74869fbdda8dd290, 0x9df2b5c435897744, 0x92b7ed1d5c130a53, 0x7bc3c764b317af87, // [5][0xc4] + 0x4993fcd87ce426cc, 0xa0e7d6a193e08318, 0xafa28e78fa7afe0f, 0x46d6a401157e5bdb, // [5][0xc8] + 0xb1283fca294e0421, 0x585c15b3c64aa1f5, 0x57194d6aafd0dce2, 0xbe6d671340d47936, // [5][0xcc] + 0x33b93a133037ce74, 0xdacd106adf336ba0, 0xd58848b3b6a916b7, 0x3cfc62ca59adb363, // [5][0xd0] + 0xcb02f901659dec99, 0x2276d3788a99494d, 0x2d338ba1e303345a, 0xc447a1d80c07918e, // [5][0xd4] + 0xf6179a64c3f418c5, 0x1f63b01d2cf0bd11, 0x1026e8c4456ac006, 0xf952c2bdaa6e65d2, // [5][0xd8] + 0x0eac5976965e3a28, 0xe7d8730f795a9ffc, 0xe89d2bd610c0e2eb, 0x01e901afffc4473f, // [5][0xdc] + 0xc7ecb785a9901f04, 0x2e989dfc4694bad0, 0x21ddc5252f0ec7c7, 0xc8a9ef5cc00a6213, // [5][0xe0] + 0x3f577497fc3a3de9, 0xd6235eee133e983d, 0xd96606377aa4e52a, 0x30122c4e95a040fe, // [5][0xe4] + 0x024217f25a53c9b5, 0xeb363d8bb5576c61, 0xe4736552dccd1176, 0x0d074f2b33c9b4a2, // [5][0xe8] + 0xfaf9d4e00ff9eb58, 0x138dfe99e0fd4e8c, 0x1cc8a6408967339b, 0xf5bc8c396663964f, // [5][0xec] + 0x7868d1391680210d, 0x911cfb40f98484d9, 0x9e59a399901ef9ce, 0x772d89e07f1a5c1a, // [5][0xf0] + 0x80d3122b432a03e0, 0x69a73852ac2ea634, 0x66e2608bc5b4db23, 0x8f964af22ab07ef7, // [5][0xf4] + 0xbdc6714ee543f7bc, 0x54b25b370a475268, 0x5bf703ee63dd2f7f, 0xb28329978cd98aab, // [5][0xf8] + 0x457db25cb0e9d551, 0xac0998255fed7085, 0xa34cc0fc36770d92, 0x4a38ea85d973a846 // [5][0xfc] + }, + { + 0x0000000000000000, 0xfc5d27f6bf353971, 0xcc6369be26fde189, 0x303e4e4899c8d8f8, // [6][0x00] + 0xac1ff52f156c5079, 0x5042d2d9aa596908, 0x607c9c913391b1f0, 0x9c21bb678ca48881, // [6][0x04] + 0x6ce6cc0d724f3399, 0x90bbebfbcd7a0ae8, 0xa085a5b354b2d210, 0x5cd88245eb87eb61, // [6][0x08] + 0xc0f93922672363e0, 0x3ca41ed4d8165a91, 0x0c9a509c41de8269, 0xf0c7776afeebbb18, // [6][0x0c] + 0xd9cd981ae49e6732, 0x2590bfec5bab5e43, 0x15aef1a4c26386bb, 0xe9f3d6527d56bfca, // [6][0x10] + 0x75d26d35f1f2374b, 0x898f4ac34ec70e3a, 0xb9b1048bd70fd6c2, 0x45ec237d683aefb3, // [6][0x14] + 0xb52b541796d154ab, 0x497673e129e46dda, 0x79483da9b02cb522, 0x85151a5f0f198c53, // [6][0x18] + 0x1934a13883bd04d2, 0xe56986ce3c883da3, 0xd557c886a540e55b, 0x290aef701a75dc2a, // [6][0x1c] + 0x8742166691ab5d0f, 0x7b1f31902e9e647e, 0x4b217fd8b756bc86, 0xb77c582e086385f7, // [6][0x20] + 0x2b5de34984c70d76, 0xd700c4bf3bf23407, 0xe73e8af7a23aecff, 0x1b63ad011d0fd58e, // [6][0x24] + 0xeba4da6be3e46e96, 0x17f9fd9d5cd157e7, 0x27c7b3d5c5198f1f, 0xdb9a94237a2cb66e, // [6][0x28] + 0x47bb2f44f6883eef, 0xbbe608b249bd079e, 0x8bd846fad075df66, 0x7785610c6f40e617, // [6][0x2c] + 0x5e8f8e7c75353a3d, 0xa2d2a98aca00034c, 0x92ece7c253c8dbb4, 0x6eb1c034ecfde2c5, // [6][0x30] + 0xf2907b5360596a44, 0x0ecd5ca5df6c5335, 0x3ef312ed46a48bcd, 0xc2ae351bf991b2bc, // [6][0x34] + 0x32694271077a09a4, 0xce346587b84f30d5, 0xfe0a2bcf2187e82d, 0x02570c399eb2d15c, // [6][0x38] + 0x9e76b75e121659dd, 0x622b90a8ad2360ac, 0x5215dee034ebb854, 0xae48f9168bde8125, // [6][0x3c] + 0x3a5d0a9e7bc12975, 0xc6002d68c4f41004, 0xf63e63205d3cc8fc, 0x0a6344d6e209f18d, // [6][0x40] + 0x9642ffb16ead790c, 0x6a1fd847d198407d, 0x5a21960f48509885, 0xa67cb1f9f765a1f4, // [6][0x44] + 0x56bbc693098e1aec, 0xaae6e165b6bb239d, 0x9ad8af2d2f73fb65, 0x668588db9046c214, // [6][0x48] + 0xfaa433bc1ce24a95, 0x06f9144aa3d773e4, 0x36c75a023a1fab1c, 0xca9a7df4852a926d, // [6][0x4c] + 0xe39092849f5f4e47, 0x1fcdb572206a7736, 0x2ff3fb3ab9a2afce, 0xd3aedccc069796bf, // [6][0x50] + 0x4f8f67ab8a331e3e, 0xb3d2405d3506274f, 0x83ec0e15acceffb7, 0x7fb129e313fbc6c6, // [6][0x54] + 0x8f765e89ed107dde, 0x732b797f522544af, 0x43153737cbed9c57, 0xbf4810c174d8a526, // [6][0x58] + 0x2369aba6f87c2da7, 0xdf348c50474914d6, 0xef0ac218de81cc2e, 0x1357e5ee61b4f55f, // [6][0x5c] + 0xbd1f1cf8ea6a747a, 0x41423b0e555f4d0b, 0x717c7546cc9795f3, 0x8d2152b073a2ac82, // [6][0x60] + 0x1100e9d7ff062403, 0xed5dce2140331d72, 0xdd638069d9fbc58a, 0x213ea79f66cefcfb, // [6][0x64] + 0xd1f9d0f5982547e3, 0x2da4f70327107e92, 0x1d9ab94bbed8a66a, 0xe1c79ebd01ed9f1b, // [6][0x68] + 0x7de625da8d49179a, 0x81bb022c327c2eeb, 0xb1854c64abb4f613, 0x4dd86b921481cf62, // [6][0x6c] + 0x64d284e20ef41348, 0x988fa314b1c12a39, 0xa8b1ed5c2809f2c1, 0x54eccaaa973ccbb0, // [6][0x70] + 0xc8cd71cd1b984331, 0x3490563ba4ad7a40, 0x04ae18733d65a2b8, 0xf8f33f8582509bc9, // [6][0x74] + 0x083448ef7cbb20d1, 0xf4696f19c38e19a0, 0xc45721515a46c158, 0x380a06a7e573f829, // [6][0x78] + 0xa42bbdc069d770a8, 0x58769a36d6e249d9, 0x6848d47e4f2a9121, 0x9415f388f01fa850, // [6][0x7c] + 0x74ba153cf78252ea, 0x88e732ca48b76b9b, 0xb8d97c82d17fb363, 0x44845b746e4a8a12, // [6][0x80] + 0xd8a5e013e2ee0293, 0x24f8c7e55ddb3be2, 0x14c689adc413e31a, 0xe89bae5b7b26da6b, // [6][0x84] + 0x185cd93185cd6173, 0xe401fec73af85802, 0xd43fb08fa33080fa, 0x286297791c05b98b, // [6][0x88] + 0xb4432c1e90a1310a, 0x481e0be82f94087b, 0x782045a0b65cd083, 0x847d62560969e9f2, // [6][0x8c] + 0xad778d26131c35d8, 0x512aaad0ac290ca9, 0x6114e49835e1d451, 0x9d49c36e8ad4ed20, // [6][0x90] + 0x01687809067065a1, 0xfd355fffb9455cd0, 0xcd0b11b7208d8428, 0x315636419fb8bd59, // [6][0x94] + 0xc191412b61530641, 0x3dcc66ddde663f30, 0x0df2289547aee7c8, 0xf1af0f63f89bdeb9, // [6][0x98] + 0x6d8eb404743f5638, 0x91d393f2cb0a6f49, 0xa1edddba52c2b7b1, 0x5db0fa4cedf78ec0, // [6][0x9c] + 0xf3f8035a66290fe5, 0x0fa524acd91c3694, 0x3f9b6ae440d4ee6c, 0xc3c64d12ffe1d71d, // [6][0xa0] + 0x5fe7f67573455f9c, 0xa3bad183cc7066ed, 0x93849fcb55b8be15, 0x6fd9b83dea8d8764, // [6][0xa4] + 0x9f1ecf5714663c7c, 0x6343e8a1ab53050d, 0x537da6e9329bddf5, 0xaf20811f8daee484, // [6][0xa8] + 0x33013a78010a6c05, 0xcf5c1d8ebe3f5574, 0xff6253c627f78d8c, 0x033f743098c2b4fd, // [6][0xac] + 0x2a359b4082b768d7, 0xd668bcb63d8251a6, 0xe656f2fea44a895e, 0x1a0bd5081b7fb02f, // [6][0xb0] + 0x862a6e6f97db38ae, 0x7a77499928ee01df, 0x4a4907d1b126d927, 0xb61420270e13e056, // [6][0xb4] + 0x46d3574df0f85b4e, 0xba8e70bb4fcd623f, 0x8ab03ef3d605bac7, 0x76ed1905693083b6, // [6][0xb8] + 0xeacca262e5940b37, 0x169185945aa13246, 0x26afcbdcc369eabe, 0xdaf2ec2a7c5cd3cf, // [6][0xbc] + 0x4ee71fa28c437b9f, 0xb2ba3854337642ee, 0x8284761caabe9a16, 0x7ed951ea158ba367, // [6][0xc0] + 0xe2f8ea8d992f2be6, 0x1ea5cd7b261a1297, 0x2e9b8333bfd2ca6f, 0xd2c6a4c500e7f31e, // [6][0xc4] + 0x2201d3affe0c4806, 0xde5cf45941397177, 0xee62ba11d8f1a98f, 0x123f9de767c490fe, // [6][0xc8] + 0x8e1e2680eb60187f, 0x724301765455210e, 0x427d4f3ecd9df9f6, 0xbe2068c872a8c087, // [6][0xcc] + 0x972a87b868dd1cad, 0x6b77a04ed7e825dc, 0x5b49ee064e20fd24, 0xa714c9f0f115c455, // [6][0xd0] + 0x3b3572977db14cd4, 0xc7685561c28475a5, 0xf7561b295b4cad5d, 0x0b0b3cdfe479942c, // [6][0xd4] + 0xfbcc4bb51a922f34, 0x07916c43a5a71645, 0x37af220b3c6fcebd, 0xcbf205fd835af7cc, // [6][0xd8] + 0x57d3be9a0ffe7f4d, 0xab8e996cb0cb463c, 0x9bb0d72429039ec4, 0x67edf0d29636a7b5, // [6][0xdc] + 0xc9a509c41de82690, 0x35f82e32a2dd1fe1, 0x05c6607a3b15c719, 0xf99b478c8420fe68, // [6][0xe0] + 0x65bafceb088476e9, 0x99e7db1db7b14f98, 0xa9d995552e799760, 0x5584b2a3914cae11, // [6][0xe4] + 0xa543c5c96fa71509, 0x591ee23fd0922c78, 0x6920ac77495af480, 0x957d8b81f66fcdf1, // [6][0xe8] + 0x095c30e67acb4570, 0xf5011710c5fe7c01, 0xc53f59585c36a4f9, 0x39627eaee3039d88, // [6][0xec] + 0x106891def97641a2, 0xec35b628464378d3, 0xdc0bf860df8ba02b, 0x2056df9660be995a, // [6][0xf0] + 0xbc7764f1ec1a11db, 0x402a4307532f28aa, 0x70140d4fcae7f052, 0x8c492ab975d2c923, // [6][0xf4] + 0x7c8e5dd38b39723b, 0x80d37a25340c4b4a, 0xb0ed346dadc493b2, 0x4cb0139b12f1aac3, // [6][0xf8] + 0xd091a8fc9e552242, 0x2ccc8f0a21601b33, 0x1cf2c142b8a8c3cb, 0xe0afe6b4079dfaba // [6][0xfc] + }, + { + 0x0000000000000000, 0x21e9761e252621ac, 0x43d2ec3c4a4c4358, 0x623b9a226f6a62f4, // [7][0x00] + 0x87a5d878949886b0, 0xa64cae66b1bea71c, 0xc4773444ded4c5e8, 0xe59e425afbf2e444, // [7][0x04] + 0x3b9296a271a69e0b, 0x1a7be0bc5480bfa7, 0x78407a9e3beadd53, 0x59a90c801eccfcff, // [7][0x08] + 0xbc374edae53e18bb, 0x9dde38c4c0183917, 0xffe5a2e6af725be3, 0xde0cd4f88a547a4f, // [7][0x0c] + 0x77252d44e34d3c16, 0x56cc5b5ac66b1dba, 0x34f7c178a9017f4e, 0x151eb7668c275ee2, // [7][0x10] + 0xf080f53c77d5baa6, 0xd169832252f39b0a, 0xb35219003d99f9fe, 0x92bb6f1e18bfd852, // [7][0x14] + 0x4cb7bbe692eba21d, 0x6d5ecdf8b7cd83b1, 0x0f6557dad8a7e145, 0x2e8c21c4fd81c0e9, // [7][0x18] + 0xcb12639e067324ad, 0xeafb158023550501, 0x88c08fa24c3f67f5, 0xa929f9bc69194659, // [7][0x1c] + 0xee4a5a89c69a782c, 0xcfa32c97e3bc5980, 0xad98b6b58cd63b74, 0x8c71c0aba9f01ad8, // [7][0x20] + 0x69ef82f15202fe9c, 0x4806f4ef7724df30, 0x2a3d6ecd184ebdc4, 0x0bd418d33d689c68, // [7][0x24] + 0xd5d8cc2bb73ce627, 0xf431ba35921ac78b, 0x960a2017fd70a57f, 0xb7e35609d85684d3, // [7][0x28] + 0x527d145323a46097, 0x7394624d0682413b, 0x11aff86f69e823cf, 0x30468e714cce0263, // [7][0x2c] + 0x996f77cd25d7443a, 0xb88601d300f16596, 0xdabd9bf16f9b0762, 0xfb54edef4abd26ce, // [7][0x30] + 0x1ecaafb5b14fc28a, 0x3f23d9ab9469e326, 0x5d184389fb0381d2, 0x7cf13597de25a07e, // [7][0x34] + 0xa2fde16f5471da31, 0x831497717157fb9d, 0xe12f0d531e3d9969, 0xc0c67b4d3b1bb8c5, // [7][0x38] + 0x25583917c0e95c81, 0x04b14f09e5cf7d2d, 0x668ad52b8aa51fd9, 0x4763a335af833e75, // [7][0x3c] + 0xe84d9340d5a36333, 0xc9a4e55ef085429f, 0xab9f7f7c9fef206b, 0x8a760962bac901c7, // [7][0x40] + 0x6fe84b38413be583, 0x4e013d26641dc42f, 0x2c3aa7040b77a6db, 0x0dd3d11a2e518777, // [7][0x44] + 0xd3df05e2a405fd38, 0xf23673fc8123dc94, 0x900de9deee49be60, 0xb1e49fc0cb6f9fcc, // [7][0x48] + 0x547add9a309d7b88, 0x7593ab8415bb5a24, 0x17a831a67ad138d0, 0x364147b85ff7197c, // [7][0x4c] + 0x9f68be0436ee5f25, 0xbe81c81a13c87e89, 0xdcba52387ca21c7d, 0xfd53242659843dd1, // [7][0x50] + 0x18cd667ca276d995, 0x392410628750f839, 0x5b1f8a40e83a9acd, 0x7af6fc5ecd1cbb61, // [7][0x54] + 0xa4fa28a64748c12e, 0x85135eb8626ee082, 0xe728c49a0d048276, 0xc6c1b2842822a3da, // [7][0x58] + 0x235ff0ded3d0479e, 0x02b686c0f6f66632, 0x608d1ce2999c04c6, 0x41646afcbcba256a, // [7][0x5c] + 0x0607c9c913391b1f, 0x27eebfd7361f3ab3, 0x45d525f559755847, 0x643c53eb7c5379eb, // [7][0x60] + 0x81a211b187a19daf, 0xa04b67afa287bc03, 0xc270fd8dcdeddef7, 0xe3998b93e8cbff5b, // [7][0x64] + 0x3d955f6b629f8514, 0x1c7c297547b9a4b8, 0x7e47b35728d3c64c, 0x5faec5490df5e7e0, // [7][0x68] + 0xba308713f60703a4, 0x9bd9f10dd3212208, 0xf9e26b2fbc4b40fc, 0xd80b1d31996d6150, // [7][0x6c] + 0x7122e48df0742709, 0x50cb9293d55206a5, 0x32f008b1ba386451, 0x13197eaf9f1e45fd, // [7][0x70] + 0xf6873cf564eca1b9, 0xd76e4aeb41ca8015, 0xb555d0c92ea0e2e1, 0x94bca6d70b86c34d, // [7][0x74] + 0x4ab0722f81d2b902, 0x6b590431a4f498ae, 0x09629e13cb9efa5a, 0x288be80deeb8dbf6, // [7][0x78] + 0xcd15aa57154a3fb2, 0xecfcdc49306c1e1e, 0x8ec7466b5f067cea, 0xaf2e30757a205d46, // [7][0x7c] + 0xe44200d2f3d1550d, 0xc5ab76ccd6f774a1, 0xa790eceeb99d1655, 0x86799af09cbb37f9, // [7][0x80] + 0x63e7d8aa6749d3bd, 0x420eaeb4426ff211, 0x203534962d0590e5, 0x01dc42880823b149, // [7][0x84] + 0xdfd096708277cb06, 0xfe39e06ea751eaaa, 0x9c027a4cc83b885e, 0xbdeb0c52ed1da9f2, // [7][0x88] + 0x58754e0816ef4db6, 0x799c381633c96c1a, 0x1ba7a2345ca30eee, 0x3a4ed42a79852f42, // [7][0x8c] + 0x93672d96109c691b, 0xb28e5b8835ba48b7, 0xd0b5c1aa5ad02a43, 0xf15cb7b47ff60bef, // [7][0x90] + 0x14c2f5ee8404efab, 0x352b83f0a122ce07, 0x571019d2ce48acf3, 0x76f96fcceb6e8d5f, // [7][0x94] + 0xa8f5bb34613af710, 0x891ccd2a441cd6bc, 0xeb2757082b76b448, 0xcace21160e5095e4, // [7][0x98] + 0x2f50634cf5a271a0, 0x0eb91552d084500c, 0x6c828f70bfee32f8, 0x4d6bf96e9ac81354, // [7][0x9c] + 0x0a085a5b354b2d21, 0x2be12c45106d0c8d, 0x49dab6677f076e79, 0x6833c0795a214fd5, // [7][0xa0] + 0x8dad8223a1d3ab91, 0xac44f43d84f58a3d, 0xce7f6e1feb9fe8c9, 0xef961801ceb9c965, // [7][0xa4] + 0x319accf944edb32a, 0x1073bae761cb9286, 0x724820c50ea1f072, 0x53a156db2b87d1de, // [7][0xa8] + 0xb63f1481d075359a, 0x97d6629ff5531436, 0xf5edf8bd9a3976c2, 0xd4048ea3bf1f576e, // [7][0xac] + 0x7d2d771fd6061137, 0x5cc40101f320309b, 0x3eff9b239c4a526f, 0x1f16ed3db96c73c3, // [7][0xb0] + 0xfa88af67429e9787, 0xdb61d97967b8b62b, 0xb95a435b08d2d4df, 0x98b335452df4f573, // [7][0xb4] + 0x46bfe1bda7a08f3c, 0x675697a38286ae90, 0x056d0d81edeccc64, 0x24847b9fc8caedc8, // [7][0xb8] + 0xc11a39c53338098c, 0xe0f34fdb161e2820, 0x82c8d5f979744ad4, 0xa321a3e75c526b78, // [7][0xbc] + 0x0c0f93922672363e, 0x2de6e58c03541792, 0x4fdd7fae6c3e7566, 0x6e3409b0491854ca, // [7][0xc0] + 0x8baa4beab2eab08e, 0xaa433df497cc9122, 0xc878a7d6f8a6f3d6, 0xe991d1c8dd80d27a, // [7][0xc4] + 0x379d053057d4a835, 0x1674732e72f28999, 0x744fe90c1d98eb6d, 0x55a69f1238becac1, // [7][0xc8] + 0xb038dd48c34c2e85, 0x91d1ab56e66a0f29, 0xf3ea317489006ddd, 0xd203476aac264c71, // [7][0xcc] + 0x7b2abed6c53f0a28, 0x5ac3c8c8e0192b84, 0x38f852ea8f734970, 0x191124f4aa5568dc, // [7][0xd0] + 0xfc8f66ae51a78c98, 0xdd6610b07481ad34, 0xbf5d8a921bebcfc0, 0x9eb4fc8c3ecdee6c, // [7][0xd4] + 0x40b82874b4999423, 0x61515e6a91bfb58f, 0x036ac448fed5d77b, 0x2283b256dbf3f6d7, // [7][0xd8] + 0xc71df00c20011293, 0xe6f486120527333f, 0x84cf1c306a4d51cb, 0xa5266a2e4f6b7067, // [7][0xdc] + 0xe245c91be0e84e12, 0xc3acbf05c5ce6fbe, 0xa1972527aaa40d4a, 0x807e53398f822ce6, // [7][0xe0] + 0x65e011637470c8a2, 0x4409677d5156e90e, 0x2632fd5f3e3c8bfa, 0x07db8b411b1aaa56, // [7][0xe4] + 0xd9d75fb9914ed019, 0xf83e29a7b468f1b5, 0x9a05b385db029341, 0xbbecc59bfe24b2ed, // [7][0xe8] + 0x5e7287c105d656a9, 0x7f9bf1df20f07705, 0x1da06bfd4f9a15f1, 0x3c491de36abc345d, // [7][0xec] + 0x9560e45f03a57204, 0xb4899241268353a8, 0xd6b2086349e9315c, 0xf75b7e7d6ccf10f0, // [7][0xf0] + 0x12c53c27973df4b4, 0x332c4a39b21bd518, 0x5117d01bdd71b7ec, 0x70fea605f8579640, // [7][0xf4] + 0xaef272fd7203ec0f, 0x8f1b04e35725cda3, 0xed209ec1384faf57, 0xccc9e8df1d698efb, // [7][0xf8] + 0x2957aa85e69b6abf, 0x08bedc9bc3bd4b13, 0x6a8546b9acd729e7, 0x4b6c30a789f1084b // [7][0xfc] + }}; + +/** Slow slice-by-8 lookup table based fallback function to compute CRC64NVME. */ +uint64_t aws_checksums_crc64nvme_sw(const uint8_t *input, int length, uint64_t prev_crc64) { + + if (!input || length <= 0) { + return prev_crc64; + } + + uint64_t crc = ~prev_crc64; + + // Read byte by byte until we reach an 8 byte aligned address + while (length > 0 && ((intptr_t)input & 7)) { + crc = (crc >> 8) ^ crc64nvme_table[0][(crc ^ *input++) & 0xff]; + length--; + } + + int remaining = length; + // Once we are aligned, read 8 bytes at a time + const uint64_t *current = (const uint64_t *)(const void *)input; + while (remaining >= 8) { + uint64_t c1 = *current++ ^ crc; + crc = crc64nvme_table[7][c1 & 0xff]; + crc ^= crc64nvme_table[6][(c1 >> 8) & 0xff]; + crc ^= crc64nvme_table[5][(c1 >> 16) & 0xff]; + crc ^= crc64nvme_table[4][(c1 >> 24) & 0xff]; + crc ^= crc64nvme_table[3][(c1 >> 32) & 0xff]; + crc ^= crc64nvme_table[2][(c1 >> 40) & 0xff]; + crc ^= crc64nvme_table[1][(c1 >> 48) & 0xff]; + crc ^= crc64nvme_table[0][(c1 >> 56) & 0xff]; + remaining -= 8; + } + + // Read any remaining input byte by byte + while (remaining > 0) { + crc = (crc >> 8) ^ crc64nvme_table[0][(crc ^ input[length - remaining]) & 0xff]; + remaining--; + } + + return ~crc; +} diff --git a/source/generic/crc32c_null.c b/source/generic/crc32c_null.c deleted file mode 100644 index b9e06f0..0000000 --- a/source/generic/crc32c_null.c +++ /dev/null @@ -1,18 +0,0 @@ -/** - * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - * SPDX-License-Identifier: Apache-2.0. - */ -#include - -#include - -/* Fail gracefully. Even though the we might be able to detect the presence of the instruction - * we might not have a compiler that supports assembling those instructions. - */ -uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) { - return aws_checksums_crc32c_sw(input, length, previousCrc32); -} - -uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) { - return aws_checksums_crc32_sw(input, length, previousCrc32); -} diff --git a/source/intel/asm/crc32c_sse42_asm.c b/source/intel/asm/crc32c_sse42_asm.c index 35e1d09..21e1e76 100644 --- a/source/intel/asm/crc32c_sse42_asm.c +++ b/source/intel/asm/crc32c_sse42_asm.c @@ -283,7 +283,7 @@ static bool detected_clmul = false; * Pass 0 in the previousCrc32 parameter as an initial value unless continuing to update a running CRC in a subsequent * call. */ -uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) { +uint32_t aws_checksums_crc32c_clmul_sse42(const uint8_t *input, int length, uint32_t previousCrc32) { if (AWS_UNLIKELY(!detection_performed)) { detected_clmul = aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL); @@ -293,6 +293,7 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev detection_performed = true; } + /* this is called by a higher-level shim and previousCRC32 is already ~ */ uint32_t crc = ~previousCrc32; /* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */ @@ -358,22 +359,10 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev return ~crc; } -uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) { - return aws_checksums_crc32_sw(input, length, previousCrc32); -} # if defined(__clang__) # pragma clang diagnostic pop # endif -#else -uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) { - return aws_checksums_crc32_sw(input, length, previousCrc32); -} - -uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) { - return aws_checksums_crc32c_sw(input, length, previousCrc32); -} - -#endif +#endif /* x86_64 */ /* clang-format on */ diff --git a/source/intel/intrin/crc32c_sse42_avx512.c b/source/intel/intrin/crc32c_sse42_avx512.c new file mode 100644 index 0000000..02de641 --- /dev/null +++ b/source/intel/intrin/crc32c_sse42_avx512.c @@ -0,0 +1,246 @@ +/** + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0. + */ + +#include + +#include +#include +#include + +#include +#include +#include + +#if defined(AWS_HAVE_AVX512_INTRINSICS) && defined(AWS_ARCH_INTEL_X64) + +# include + +AWS_ALIGNED_TYPEDEF(const uint64_t, zalign_8, 64); +AWS_ALIGNED_TYPEDEF(const uint64_t, zalign_2, 16); + +// This macro uses casting to ensure the compiler actually uses the unaligned load instructions +# define load_zmm(ptr) _mm512_loadu_si512((const uint8_t *)(const void *)(ptr)) + +/* + * crc32c_avx512(): compute the crc32c of the buffer, where the buffer + * length must be at least 256, and a multiple of 64. Based on: + * + * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" + * V. Gopal, E. Ozturk, et al., 2009, http://download.intel.com/design/intarch/papers/323102.pdf + */ +static uint32_t s_checksums_crc32c_avx512_impl(const uint8_t *input, int length, uint32_t previous_crc) { + AWS_ASSERT( + length >= 256 && "invariant violated. length must be greater than 255 bytes to use avx512 to compute crc."); + + uint32_t crc = previous_crc; + + /* + * Definitions of the bit-reflected domain constants k1,k2,k3,k4,k5,k6 + * are similar to those given at the end of the paper + * + * k1 = ( x ^ ( 512 * 4 + 32 ) mod P(x) << 32 )' << 1 + * k2 = ( x ^ ( 512 * 4 - 32 ) mod P(x) << 32 )' << 1 + * k3 = ( x ^ ( 512 + 32 ) mod P(x) << 32 )' << 1 + * k4 = ( x ^ ( 512 - 32 ) mod P(x) << 32 )' << 1 + * k5 = ( x ^ ( 128 + 32 ) mod P(x) << 32 )' << 1 + * k6 = ( x ^ ( 128 - 32 ) mod P(x) << 32 )' << 1 + */ + + static zalign_8 k1k2[8] = { + 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86}; + static zalign_8 k3k4[8] = { + 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8}; + static zalign_8 k9k10[8] = { + 0x6992cea2, 0x0d3b6092, 0x6992cea2, 0x0d3b6092, 0x6992cea2, 0x0d3b6092, 0x6992cea2, 0x0d3b6092}; + static zalign_8 k1k4[8] = { + 0x1c291d04, 0xddc0152b, 0x3da6d0cb, 0xba4fc28e, 0xf20c0dfe, 0x493c7d27, 0x00000000, 0x00000000}; + + __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; + __m128i a1; + + /* + * There's at least one block of 256. + */ + x1 = load_zmm(input + 0x00); + x2 = load_zmm(input + 0x40); + x3 = load_zmm(input + 0x80); + x4 = load_zmm(input + 0xC0); + + // Load the crc into a zmm register and XOR with the first 64 bytes of input + x5 = _mm512_inserti32x4(_mm512_setzero_si512(), _mm_cvtsi32_si128((int)crc), 0); + x1 = _mm512_xor_si512(x1, x5); + + x0 = load_zmm(k1k2); + + input += 256; + length -= 256; + + /* + * Parallel fold blocks of 256, if any. + */ + while (length >= 256) { + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00); + x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00); + x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00); + + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11); + x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11); + x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11); + + y5 = load_zmm(input + 0x00); + y6 = load_zmm(input + 0x40); + y7 = load_zmm(input + 0x80); + y8 = load_zmm(input + 0xC0); + + x1 = _mm512_ternarylogic_epi64(x1, x5, y5, 0x96); + x2 = _mm512_ternarylogic_epi64(x2, x6, y6, 0x96); + x3 = _mm512_ternarylogic_epi64(x3, x7, y7, 0x96); + x4 = _mm512_ternarylogic_epi64(x4, x8, y8, 0x96); + + input += 256; + length -= 256; + } + + /* + * Fold 256 bytes into 64 bytes. + */ + x0 = load_zmm(k9k10); + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x6 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x3 = _mm512_ternarylogic_epi64(x3, x5, x6, 0x96); + + x7 = _mm512_clmulepi64_epi128(x2, x0, 0x00); + x8 = _mm512_clmulepi64_epi128(x2, x0, 0x11); + x4 = _mm512_ternarylogic_epi64(x4, x7, x8, 0x96); + + x0 = load_zmm(k3k4); + y5 = _mm512_clmulepi64_epi128(x3, x0, 0x00); + y6 = _mm512_clmulepi64_epi128(x3, x0, 0x11); + x1 = _mm512_ternarylogic_epi64(x4, y5, y6, 0x96); + + /* + * Single fold blocks of 64, if any. + */ + while (length >= 64) { + x2 = load_zmm(input); + + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x1 = _mm512_ternarylogic_epi64(x1, x2, x5, 0x96); + + input += 64; + length -= 64; + } + + /* + * Fold 512-bits to 128-bits. + */ + x0 = load_zmm(k1k4); + x4 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x3 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x2 = _mm512_xor_si512(x3, x4); + a1 = _mm_xor_si128(_mm512_extracti32x4_epi32(x1, 3), _mm512_extracti32x4_epi32(x2, 0)); + a1 = _mm_ternarylogic_epi64(a1, _mm512_extracti32x4_epi32(x2, 1), _mm512_extracti32x4_epi32(x2, 2), 0x96); + + /* + * Fold 128-bits to 32-bits. + */ + uint64_t val; + val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0)); + return (uint32_t)_mm_crc32_u64(val, _mm_extract_epi64(a1, 1)); +} +#endif /* #if defined(AWS_HAVE_AVX512_INTRINSICS) && (INTPTR_MAX == INT64_MAX) */ + +static bool detection_performed = false; +static bool detected_sse42 = false; +static bool detected_avx512 = false; +static bool detected_clmul = false; +static bool detected_vpclmulqdq = false; + +uint32_t aws_checksums_crc32c_intel_avx512_with_sse_fallback(const uint8_t *input, int length, uint32_t previous_crc) { + if (AWS_UNLIKELY(!detection_performed)) { + detected_sse42 = aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2); + detected_avx512 = aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512); + detected_clmul = aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL); + detected_vpclmulqdq = aws_cpu_has_feature(AWS_CPU_FEATURE_VPCLMULQDQ); + + /* Simply setting the flag true to skip HW detection next time + Not using memory barriers since the worst that can + happen is a fallback to the non HW accelerated code. */ + detection_performed = true; + } + + /* this is the entry point. We should only do the bit flip once. It should not be done for the subfunctions and + * branches.*/ + uint32_t crc = ~previous_crc; + + /* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */ + if (length < (int)sizeof(slice_ptr_int_type)) { + while (length-- > 0) { + crc = (uint32_t)_mm_crc32_u8(crc, *input++); + } + return ~crc; + } + + /* Get the 8-byte memory alignment of our input buffer by looking at the least significant 3 bits */ + int input_alignment = (uintptr_t)(input) & 0x7; + + /* Compute the number of unaligned bytes before the first aligned 8-byte chunk (will be in the range 0-7) */ + int leading = (8 - input_alignment) & 0x7; + + /* reduce the length by the leading unaligned bytes we are about to process */ + length -= leading; + + /* spin through the leading unaligned input bytes (if any) one-by-one */ + while (leading-- > 0) { + crc = (uint32_t)_mm_crc32_u8(crc, *input++); + } + +#if defined(AWS_HAVE_AVX512_INTRINSICS) && defined(AWS_ARCH_INTEL_X64) + int chunk_size = length & ~63; + + if (detected_avx512 && detected_vpclmulqdq && detected_clmul) { + if (length >= 256) { + crc = s_checksums_crc32c_avx512_impl(input, length, crc); + /* check remaining data */ + length -= chunk_size; + if (!length) { + return ~crc; + } + + /* Fall into the default crc32 for the remaining data. */ + input += chunk_size; + } + } +#endif + +#if defined(AWS_ARCH_INTEL_X64) && !defined(_MSC_VER) + if (detected_sse42 && detected_clmul) { + // this function is an entry point on its own. It inverts the crc passed to it + // does its thing and then inverts it upon return. In order to keep + // aws_checksums_crc32c_sse42 a standalone function (which it has to be due + // to the way its implemented) it's better that it doesn't need to know it's used + // in a larger computation fallback. + return aws_checksums_crc32c_clmul_sse42(input, length, ~crc); + } +#endif + + /* Spin through remaining (aligned) 8-byte chunks using the CRC32Q quad word instruction */ + while (length >= (int)sizeof(slice_ptr_int_type)) { + crc = (uint32_t)crc_intrin_fn(crc, *(slice_ptr_int_type *)(input)); + input += sizeof(slice_ptr_int_type); + length -= (int)sizeof(slice_ptr_int_type); + } + + /* Finish up with any trailing bytes using the CRC32B single byte instruction one-by-one */ + while (length-- > 0) { + crc = (uint32_t)_mm_crc32_u8(crc, *input); + input++; + } + + return ~crc; +} diff --git a/source/intel/intrin/crc64nvme_avx512.c b/source/intel/intrin/crc64nvme_avx512.c new file mode 100644 index 0000000..6afbe93 --- /dev/null +++ b/source/intel/intrin/crc64nvme_avx512.c @@ -0,0 +1,130 @@ +/** + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0. + */ + +#include + +#if defined(AWS_HAVE_AVX512_INTRINSICS) && defined(AWS_ARCH_INTEL_X64) + +# include +# include +# include +# include + +# define load_xmm(ptr) _mm_loadu_si128((const __m128i *)(const void *)(ptr)) +# define mask_high_bytes(xmm, count) \ + _mm_and_si128((xmm), load_xmm(aws_checksums_masks_shifts[3] + (intptr_t)(count))) +# define cmull_xmm_hi(xmm1, xmm2) _mm_clmulepi64_si128((xmm1), (xmm2), 0x11) +# define cmull_xmm_lo(xmm1, xmm2) _mm_clmulepi64_si128((xmm1), (xmm2), 0x00) +# define cmull_xmm_pair(xmm1, xmm2) _mm_xor_si128(cmull_xmm_hi((xmm1), (xmm2)), cmull_xmm_lo((xmm1), (xmm2))) +# define xor_xmm(xmm1, xmm2, xmm3) \ + _mm_ternarylogic_epi64((xmm1), (xmm2), (xmm3), 0x96) // The constant 0x96 produces a 3-way XOR + +# define load_zmm(ptr) _mm512_loadu_si512((const uint8_t *)(const void *)(ptr)) +# define cmull_zmm_hi(zmm1, zmm2) _mm512_clmulepi64_epi128((zmm1), (zmm2), 0x11) +# define cmull_zmm_lo(zmm1, zmm2) _mm512_clmulepi64_epi128((zmm1), (zmm2), 0x00) +# define cmull_zmm_pair(zmm1, zmm2) _mm512_xor_si512(cmull_zmm_hi((zmm1), (zmm2)), cmull_zmm_lo((zmm1), (zmm2))) +# define xor_zmm(zmm1, zmm2, zmm3) \ + _mm512_ternarylogic_epi64((zmm1), (zmm2), (zmm3), 0x96) // The constant 0x96 produces a 3-way XOR + +uint64_t aws_checksums_crc64nvme_intel_avx512(const uint8_t *input, int length, const uint64_t previous_crc64) { + + if (length < 512) { + return aws_checksums_crc64nvme_intel_clmul(input, length, previous_crc64); + } + + // The following code assumes a minimum of 256 bytes of input + + // Load the (inverted) CRC into a ZMM register + __m512i x1 = _mm512_inserti32x4(_mm512_setzero_si512(), _mm_cvtsi64_si128((int64_t)~previous_crc64), 0); + // Load the first 64 bytes into a zmm register and XOR with the (inverted) crc + x1 = _mm512_xor_si512(x1, load_zmm(input)); + // Load 192 more bytes of input + __m512i x2 = load_zmm(input + 0x40); + __m512i x3 = load_zmm(input + 0x80); + __m512i x4 = load_zmm(input + 0xc0); + input += 256; + length -= 256; + + const __m512i kp_2048 = load_zmm(aws_checksums_crc64nvme_constants.x2048); + const __m512i kp_512 = load_zmm(aws_checksums_crc64nvme_constants.x512); + + int loops = length / 256; + length &= 255; + + // Parallel fold blocks of 256 bytes, if any + while (loops--) { + x1 = xor_zmm(cmull_zmm_lo(kp_2048, x1), cmull_zmm_hi(kp_2048, x1), load_zmm(input + 0x00)); + x2 = xor_zmm(cmull_zmm_lo(kp_2048, x2), cmull_zmm_hi(kp_2048, x2), load_zmm(input + 0x40)); + x3 = xor_zmm(cmull_zmm_lo(kp_2048, x3), cmull_zmm_hi(kp_2048, x3), load_zmm(input + 0x80)); + x4 = xor_zmm(cmull_zmm_lo(kp_2048, x4), cmull_zmm_hi(kp_2048, x4), load_zmm(input + 0xc0)); + input += 256; + } + + // Fold 2048 bits into 512 bits + const __m512i kp_1536 = load_zmm(aws_checksums_crc64nvme_constants.x1536); + const __m512i kp_1024 = load_zmm(aws_checksums_crc64nvme_constants.x1024); + + x1 = xor_zmm(cmull_zmm_lo(kp_1536, x1), cmull_zmm_hi(kp_1536, x1), cmull_zmm_lo(kp_1024, x2)); + x2 = xor_zmm(cmull_zmm_hi(kp_1024, x2), cmull_zmm_lo(kp_512, x3), cmull_zmm_hi(kp_512, x3)); + x1 = xor_zmm(x1, x2, x4); + + // Fold blocks of 512 bits, if any + loops = length / 64; + length &= 63; + while (loops--) { + x1 = xor_zmm(cmull_zmm_lo(kp_512, x1), cmull_zmm_hi(kp_512, x1), load_zmm(input)); + input += 64; + } + + // Load 64 bytes of constants: x^448, x^384, x^320, x^256, x^192, x^128, N/A, N/A + const __m512i kp_384 = load_zmm(aws_checksums_crc64nvme_constants.x384); + + // Fold 512 bits to 128 bits + x2 = cmull_zmm_pair(kp_384, x1); + __m128i a1 = _mm_xor_si128(_mm512_extracti32x4_epi32(x1, 3), _mm512_extracti32x4_epi32(x2, 0)); + a1 = xor_xmm(a1, _mm512_extracti32x4_epi32(x2, 1), _mm512_extracti32x4_epi32(x2, 2)); + + // Single fold blocks of 128 bits, if any + loops = length / 16; + __m128i kp_128 = _mm512_extracti32x4_epi32(kp_384, 2); + while (loops--) { + a1 = xor_xmm(cmull_xmm_lo(kp_128, a1), cmull_xmm_hi(kp_128, a1), load_xmm(input)); + input += 16; + } + + // The remaining length can be only 0-15 bytes + length &= 15; + + // Load the x^128 constant (note that we don't need x^192). + const __m128i x128 = _mm_set_epi64x(0, aws_checksums_crc64nvme_constants.x128[1]); + if (length == 0) { + // Multiply the lower half of the crc register by x^128 and XOR the result with the upper half of the crc. + a1 = _mm_xor_si128(_mm_bsrli_si128(a1, 8), cmull_xmm_lo(a1, x128)); + } else { + // Handle any trailing input from 1-15 bytes. + __m128i trailing_constants = load_xmm(aws_checksums_crc64nvme_constants.trailing[length - 1]); + // Multiply the crc by a pair of trailing length constants in order to fold it into the trailing input. + a1 = cmull_xmm_pair(a1, trailing_constants); + // Safely load ending at the trailing input and mask out any leading garbage + __m128i trailing_input = mask_high_bytes(load_xmm(input + length - 16), length); + // Multiply the lower half of the trailing input register by x^128 + __m128i mul_by_x128 = cmull_xmm_lo(trailing_input, x128); + // XOR the results with the upper half of the trailing input + a1 = xor_xmm(a1, _mm_bsrli_si128(trailing_input, 8), mul_by_x128); + } + + // Barrett modular reduction + const __m128i mu_poly = load_xmm(&aws_checksums_crc64nvme_constants.mu_poly); + // Multiply the lower half of input by mu + __m128i mul_by_mu = _mm_clmulepi64_si128(mu_poly, a1, 0x00); + // Multiply the lower half of the mul_by_mu result by poly (it's in the upper half) + __m128i mul_by_poly = _mm_clmulepi64_si128(mu_poly, mul_by_mu, 0x01); + // Left shift mul_by_mu to get the low half into the upper half and XOR all the upper halves + __m128i reduced = xor_xmm(a1, _mm_bslli_si128(mul_by_mu, 8), mul_by_poly); + // After the XORs, the CRC falls in the upper half of the register - invert the bits before returning the crc + return ~(uint64_t)_mm_extract_epi64(reduced, 1); +} + +#endif /* defined(AWS_HAVE_AVX512_INTRINSICS) && defined(AWS_ARCH_INTEL_X64)*/ diff --git a/source/intel/intrin/crc64nvme_clmul.c b/source/intel/intrin/crc64nvme_clmul.c new file mode 100644 index 0000000..f141181 --- /dev/null +++ b/source/intel/intrin/crc64nvme_clmul.c @@ -0,0 +1,147 @@ +/** + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0. + */ + +#include +#include + +// msvc compilers older than 2019 are missing some intrinsics. Gate those off. +#if defined(AWS_ARCH_INTEL_X64) && defined(AWS_HAVE_CLMUL) && !(defined(_MSC_VER) && _MSC_VER < 1920) + +# include +# include +# include +# include + +# define load_xmm(ptr) _mm_loadu_si128((const __m128i *)(const void *)(ptr)) +# define mask_high_bytes(xmm, count) \ + _mm_and_si128((xmm), load_xmm(aws_checksums_masks_shifts[3] + (intptr_t)(count))) +# define cmull_xmm_hi(xmm1, xmm2) _mm_clmulepi64_si128((xmm1), (xmm2), 0x11) +# define cmull_xmm_lo(xmm1, xmm2) _mm_clmulepi64_si128((xmm1), (xmm2), 0x00) +# define cmull_xmm_pair(xmm1, xmm2) _mm_xor_si128(cmull_xmm_hi((xmm1), (xmm2)), cmull_xmm_lo((xmm1), (xmm2))) + +uint64_t aws_checksums_crc64nvme_intel_clmul(const uint8_t *input, int length, uint64_t previous_crc64) { + + // the amount of complexity required to handle vector instructions on + // memory regions smaller than an xmm register does not justify the very negligible performance gains + // we would get for using it on an input this small. + if (length < 16) { + return aws_checksums_crc64nvme_sw(input, length, previous_crc64); + } + + // Invert the previous crc bits and load into the lower half of an xmm register + __m128i a1 = _mm_cvtsi64_si128((int64_t)(~previous_crc64)); + + // There are 16 or more bytes of input - load the first 16 bytes and XOR with the previous crc + a1 = _mm_xor_si128(a1, load_xmm(input)); + input += 16; + length -= 16; + + // Load the folding constants x^128 and x^192 + const __m128i x128 = load_xmm(aws_checksums_crc64nvme_constants.x128); + + if (length >= 48) { + // Load the next 48 bytes + __m128i b1 = load_xmm(input + 0x00); + __m128i c1 = load_xmm(input + 0x10); + __m128i d1 = load_xmm(input + 0x20); + + input += 48; + length -= 48; + + // Load the folding constants x^512 and x^576 + const __m128i x512 = load_xmm(aws_checksums_crc64nvme_constants.x512); + + if (length >= 64) { + // Load the next 64 bytes + __m128i e1 = load_xmm(input + 0x00); + __m128i f1 = load_xmm(input + 0x10); + __m128i g1 = load_xmm(input + 0x20); + __m128i h1 = load_xmm(input + 0x30); + input += 64; + length -= 64; + + // Load the folding constants x^1024 and x^1088 + const __m128i x1024 = load_xmm(aws_checksums_crc64nvme_constants.x1024); + + // Spin through 128 bytes and fold in parallel + int loops = length / 128; + length &= 127; + while (loops--) { + a1 = _mm_xor_si128(cmull_xmm_pair(x1024, a1), load_xmm(input + 0x00)); + b1 = _mm_xor_si128(cmull_xmm_pair(x1024, b1), load_xmm(input + 0x10)); + c1 = _mm_xor_si128(cmull_xmm_pair(x1024, c1), load_xmm(input + 0x20)); + d1 = _mm_xor_si128(cmull_xmm_pair(x1024, d1), load_xmm(input + 0x30)); + e1 = _mm_xor_si128(cmull_xmm_pair(x1024, e1), load_xmm(input + 0x40)); + f1 = _mm_xor_si128(cmull_xmm_pair(x1024, f1), load_xmm(input + 0x50)); + g1 = _mm_xor_si128(cmull_xmm_pair(x1024, g1), load_xmm(input + 0x60)); + h1 = _mm_xor_si128(cmull_xmm_pair(x1024, h1), load_xmm(input + 0x70)); + input += 128; + } + + // Fold 128 to 64 bytes - e1 through h1 fold into a1 through d1 + a1 = _mm_xor_si128(cmull_xmm_pair(x512, a1), e1); + b1 = _mm_xor_si128(cmull_xmm_pair(x512, b1), f1); + c1 = _mm_xor_si128(cmull_xmm_pair(x512, c1), g1); + d1 = _mm_xor_si128(cmull_xmm_pair(x512, d1), h1); + } + + if (length & 64) { + a1 = _mm_xor_si128(cmull_xmm_pair(x512, a1), load_xmm(input + 0x00)); + b1 = _mm_xor_si128(cmull_xmm_pair(x512, b1), load_xmm(input + 0x10)); + c1 = _mm_xor_si128(cmull_xmm_pair(x512, c1), load_xmm(input + 0x20)); + d1 = _mm_xor_si128(cmull_xmm_pair(x512, d1), load_xmm(input + 0x30)); + input += 64; + } + length &= 63; + + // Load the x^256, x^320, x^384, and x^448 constants + const __m128i x384 = load_xmm(aws_checksums_crc64nvme_constants.x384); + const __m128i x256 = load_xmm(aws_checksums_crc64nvme_constants.x256); + + // Fold 64 bytes to 16 bytes + a1 = _mm_xor_si128(d1, cmull_xmm_pair(x384, a1)); + a1 = _mm_xor_si128(a1, cmull_xmm_pair(x256, b1)); + a1 = _mm_xor_si128(a1, cmull_xmm_pair(x128, c1)); + } + + // Process any remaining chunks of 16 bytes + int loops = length / 16; + while (loops--) { + a1 = _mm_xor_si128(cmull_xmm_pair(a1, x128), load_xmm(input)); + input += 16; + } + + // The remaining length can be only 0-15 bytes + length &= 15; + if (length == 0) { + // Multiply the lower half of the crc register by x^128 (it's in the upper half) + __m128i mul_by_x128 = _mm_clmulepi64_si128(a1, x128, 0x10); + // XOR the result with the upper half of the crc + a1 = _mm_xor_si128(_mm_bsrli_si128(a1, 8), mul_by_x128); + } else { // Handle any trailing input from 1-15 bytes + // Multiply the crc by a pair of trailing length constants in order to fold it into the trailing input + a1 = cmull_xmm_pair(a1, load_xmm(aws_checksums_crc64nvme_constants.trailing[length - 1])); + // Safely load (ending at the trailing input) and mask out any leading garbage + __m128i trailing_input = mask_high_bytes(load_xmm(input + length - 16), length); + // Multiply the lower half of the trailing input register by x^128 (it's in the upper half) + __m128i mul_by_x128 = _mm_clmulepi64_si128(trailing_input, x128, 0x10); + // XOR the results with the upper half of the trailing input + a1 = _mm_xor_si128(a1, _mm_bsrli_si128(trailing_input, 8)); + a1 = _mm_xor_si128(a1, mul_by_x128); + } + + // Barrett modular reduction + const __m128i mu_poly = load_xmm(aws_checksums_crc64nvme_constants.mu_poly); + // Multiply the lower half of input by mu + __m128i mul_by_mu = _mm_clmulepi64_si128(mu_poly, a1, 0x00); + // Multiply the lower half of the mul_by_mu result by poly (it's in the upper half) + __m128i mul_by_poly = _mm_clmulepi64_si128(mu_poly, mul_by_mu, 0x01); + // Left shift mul_by_mu to get the low half into the upper half and XOR all the upper halves + __m128i reduced = _mm_xor_si128(_mm_xor_si128(a1, _mm_bslli_si128(mul_by_mu, 8)), mul_by_poly); + // After the XORs, the CRC falls in the upper half of the register - invert the bits before returning the crc + return ~(uint64_t)_mm_extract_epi64(reduced, 1); +} + +#endif /* defined(AWS_ARCH_INTEL_X64) && defined(AWS_HAVE_CLMUL) && !(defined(_MSC_VER) && _MSC_VER < 1920) */ diff --git a/source/intel/visualc/visualc_crc32c_sse42.c b/source/intel/visualc/visualc_crc32c_sse42.c deleted file mode 100644 index ca1aca4..0000000 --- a/source/intel/visualc/visualc_crc32c_sse42.c +++ /dev/null @@ -1,77 +0,0 @@ -/** - * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - * SPDX-License-Identifier: Apache-2.0. - */ - -#include -#include - -#if defined(_M_X64) || defined(_M_IX86) - -# if defined(_M_X64) -typedef uint64_t *slice_ptr_type; -typedef uint64_t slice_ptr_int_type; -# else -typedef uint32_t *slice_ptr_type; -typedef uint32_t slice_ptr_int_type; -# endif - -/** - * This implements crc32c via the intel sse 4.2 instructions. - * This is separate from the straight asm version, because visual c does not allow - * inline assembly for x64. - */ -uint32_t aws_checksums_crc32c_hw(const uint8_t *data, int length, uint32_t previousCrc32) { - uint32_t crc = ~previousCrc32; - int length_to_process = length; - - slice_ptr_type temp = (slice_ptr_type)data; - - /*to eek good performance out of the intel implementation, we need to only hit the hardware - once we are aligned on the byte boundaries we are using. So, peel off a byte at a time until we are - 8 byte aligned (64 bit arch) or 4 byte aligned (32 bit arch) - - first calculate how many bytes we need to burn before we are aligned. - for a 64 bit arch this is: - (8 - ) mod 8 - 32 bit: - (4 - ) mod 4 */ - uint8_t alignment_offset = (sizeof(slice_ptr_int_type) - ((slice_ptr_int_type)temp % sizeof(slice_ptr_int_type))) % - sizeof(slice_ptr_int_type); - - /*for every byte we need to burn off, just do them a byte at a time. - increment the temp pointer by one byte at a time until we get it on an alignment boundary */ - while (alignment_offset != 0 && length_to_process) { - uint8_t *byte_pos = (uint8_t *)temp; - crc = (uint32_t)_mm_crc32_u8(crc, *byte_pos++); - temp = (slice_ptr_type)byte_pos; - --alignment_offset; - --length_to_process; - } - - /*now whatever is left is properly aligned on a boundary*/ - uint32_t slices = length_to_process / sizeof(temp); - uint32_t remainder = length_to_process % sizeof(temp); - - while (slices--) { -# if defined(_M_X64) - crc = (uint32_t)_mm_crc32_u64(crc, *temp++); -# else - crc = _mm_crc32_u32(crc, *temp++); -# endif - } - - /* process the remaining parts that can't be done on the slice size. */ - uint8_t *remainderPos = (uint8_t *)temp; - - while (remainder--) { - crc = (uint32_t)_mm_crc32_u8(crc, *remainderPos++); - } - - return ~crc; -} - -uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) { - return aws_checksums_crc32_sw(input, length, previousCrc32); -} -#endif /* x64 || x86 */ diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 26b56bf..0eff50e 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -9,5 +9,6 @@ file(GLOB TESTS ${TEST_HDRS} ${TEST_SRC}) add_test_case(test_crc32c) add_test_case(test_crc32) +add_test_case(test_crc64nvme) generate_test_driver(${PROJECT_NAME}-tests) diff --git a/tests/crc64_test.c b/tests/crc64_test.c new file mode 100644 index 0000000..d7b6b18 --- /dev/null +++ b/tests/crc64_test.c @@ -0,0 +1,128 @@ +/** + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0. + */ + +#include +#include +#include + +// The polynomial used for CRC64NVME (in bit-reflected form) +static const uint64_t POLY_CRC64NVME = 0x9a6c9329ac4bc9b5; + +// Any input with the CRC of that input appended should produce this CRC value. (Note: inverting the bits) +static const uint64_t RESIDUE_CRC64NVME = (uint64_t)~0xf310303b2b6f6e42; + +static const uint8_t DATA_32_ZEROS[32] = {0}; +static const uint64_t KNOWN_CRC64NVME_32_ZEROES = 0xCF3473434D4ECF3B; + +static const uint8_t DATA_32_VALUES[32] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; +static const uint64_t KNOWN_CRC64NVME_32_VALUES = 0xB9D9D4A8492CBD7F; + +static const uint8_t TEST_VECTOR[] = {'1', '2', '3', '4', '5', '6', '7', '8', '9'}; +static const uint64_t KNOWN_CRC64NVME_TEST_VECTOR = 0xAE8B14860A799888; + +typedef uint64_t(crc_fn)(const uint8_t *input, int length, uint64_t previousCrc64); +#define CRC_FUNC_NAME(crc_func) #crc_func, crc_func +#define DATA_NAME(dataset) #dataset, dataset, sizeof(dataset) +#define TEST_BUFFER_SIZE 2048 + 64 + +// Very, very slow reference implementation that computes CRC64NVME. +static uint64_t crc64nvme_reference(const uint8_t *input, int length, const uint64_t previousCrc64) { + uint64_t crc = ~previousCrc64; + while (length-- > 0) { + crc ^= *input++; + for (int j = 8; j > 0; --j) { + crc = (crc >> 1) ^ ((((crc & 1) ^ 1) - 1) & POLY_CRC64NVME); + } + } + return ~crc; +} + +/* Makes sure that the specified crc function produces the expected results for known input and output */ +static int s_test_known_crc( + const char *func_name, + crc_fn *func, + const char *data_name, + const uint8_t *input, + const size_t length, + const uint64_t expected_crc, + const uint64_t expected_residue) { + + uint64_t result = func(input, (int)length, 0); + ASSERT_HEX_EQUALS(expected_crc, result, "%s(%s)", func_name, data_name); + + // Compute the residue of the buffer (the CRC of the buffer plus its CRC) - will always be a constant value + uint64_t residue = func((const uint8_t *)&result, 8, result); // assuming little endian + ASSERT_HEX_EQUALS(expected_residue, residue, "len %d residue %s(%s)", length, func_name, data_name); + + // chain the crc computation so 2 calls each operate on about 1/2 of the buffer + uint64_t crc1 = func(input, (int)(length / 2), 0); + result = func(input + (length / 2), (int)(length - length / 2), crc1); + ASSERT_HEX_EQUALS(expected_crc, result, "chaining %s(%s)", func_name, data_name); + + crc1 = 0; + for (size_t i = 0; i < length; ++i) { + crc1 = func(input + i, 1, crc1); + } + ASSERT_HEX_EQUALS(expected_crc, crc1, "one byte at a time %s(%s)", func_name, data_name); + + return AWS_OP_SUCCESS; +} + +/* helper function that groups crc64nvme tests */ +static int s_test_known_crc64nvme(struct aws_allocator *allocator, const char *func_name, crc_fn *func) { + int res = 0; + + // Quick sanity check of some known CRC values for known input. + res |= s_test_known_crc(func_name, func, DATA_NAME(DATA_32_ZEROS), KNOWN_CRC64NVME_32_ZEROES, RESIDUE_CRC64NVME); + res |= s_test_known_crc(func_name, func, DATA_NAME(DATA_32_VALUES), KNOWN_CRC64NVME_32_VALUES, RESIDUE_CRC64NVME); + res |= s_test_known_crc(func_name, func, DATA_NAME(TEST_VECTOR), KNOWN_CRC64NVME_TEST_VECTOR, RESIDUE_CRC64NVME); + + if (func == crc64nvme_reference) { + // Don't proceed further since we'd just be testing the reference function against itself + return res; + } + + struct aws_byte_buf test_buf; + ASSERT_SUCCESS(aws_byte_buf_init(&test_buf, allocator, TEST_BUFFER_SIZE)); + + // Spin through buffer offsets + for (int off = 0; off < 16; off++) { + // Fill the test buffer with different values for each iteration + aws_byte_buf_write_u8_n(&test_buf, (uint8_t)off + 129, test_buf.capacity - test_buf.len); + uint64_t expected = 0; + int len = 1; + // Spin through input data lengths + for (int i = 0; i < (TEST_BUFFER_SIZE - off) && !res; i++, len++) { + test_buf.buffer[off + i] = (uint8_t)((i + 1) * 131); + // Compute the expected CRC one byte at a time using the reference function + expected = crc64nvme_reference(&test_buf.buffer[off + i], 1, expected); + // Recompute the full CRC of the buffer at each offset and length and compare against expected value + res |= s_test_known_crc( + func_name, func, "test_buffer", &test_buf.buffer[off], len, expected, RESIDUE_CRC64NVME); + } + aws_byte_buf_reset(&test_buf, false); + } + aws_byte_buf_clean_up(&test_buf); + + return res; +} + +/** + * The reference functions are included in these tests to verify that they aren't obviously broken. + */ +static int s_test_crc64nvme(struct aws_allocator *allocator, void *ctx) { + (void)ctx; + + int res = 0; + + res |= s_test_known_crc64nvme(allocator, CRC_FUNC_NAME(crc64nvme_reference)); + res |= s_test_known_crc64nvme(allocator, CRC_FUNC_NAME(aws_checksums_crc64nvme_sw)); + res |= s_test_known_crc64nvme(allocator, CRC_FUNC_NAME(aws_checksums_crc64nvme)); + + return res; +} + +AWS_TEST_CASE(test_crc64nvme, s_test_crc64nvme) diff --git a/tests/crc_test.c b/tests/crc_test.c index c975791..53c94a1 100644 --- a/tests/crc_test.c +++ b/tests/crc_test.c @@ -5,86 +5,152 @@ #include #include + +#include + #include static const uint8_t DATA_32_ZEROS[32] = {0}; -static const uint32_t KNOWN_CRC32_32_ZEROES = 0x190A55AD; -static const uint32_t KNOWN_CRC32C_32_ZEROES = 0x8A9136AA; - static const uint8_t DATA_32_VALUES[32] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; -static const uint32_t KNOWN_CRC32_32_VALUES = 0x91267E8A; -static const uint32_t KNOWN_CRC32C_32_VALUES = 0x46DD794E; static const uint8_t TEST_VECTOR[] = {'1', '2', '3', '4', '5', '6', '7', '8', '9'}; + +// The polynomial used for CRC32 (in bit-reflected form) +static const uint32_t POLY_CRC32 = 0xedb88320; +// Any input with the CRC32 of that input appended should produce this CRC32 value. (Note: inverting the bits) +static const uint32_t RESIDUE_CRC32 = ~0xdebb20e3; +static const uint32_t KNOWN_CRC32_32_ZEROES = 0x190A55AD; +static const uint32_t KNOWN_CRC32_32_VALUES = 0x91267E8A; static const uint32_t KNOWN_CRC32_TEST_VECTOR = 0xCBF43926; -static const uint32_t KNOWN_CRC32C_TEST_VECTOR = 0xE3069283; -static uint8_t *s_non_mem_aligned_vector; +// The polynomial used for CRC32C (in bit-reflected form) +static const uint32_t POLY_CRC32C = 0x82f63b78; +// Any input with the CRC32c of that input appended should produce this CRC32c value. (Note: inverting the bits) +static const uint32_t RESIDUE_CRC32C = ~0xb798b438; +static const uint32_t KNOWN_CRC32C_32_ZEROES = 0x8A9136AA; +static const uint32_t KNOWN_CRC32C_32_VALUES = 0x46DD794E; +static const uint32_t KNOWN_CRC32C_TEST_VECTOR = 0xE3069283; typedef uint32_t(crc_fn)(const uint8_t *input, int length, uint32_t previousCrc32); #define CRC_FUNC_NAME(crc_func) #crc_func, crc_func #define DATA_NAME(dataset) #dataset, dataset, sizeof(dataset) +#define TEST_BUFFER_SIZE 2048 + 64 + +// Slow reference implementation that computes a 32-bit bit-reflected/bit-inverted CRC using the provided polynomial. +static uint32_t s_crc_32_reference(const uint8_t *input, int length, const uint32_t previousCrc, uint32_t polynomial) { -/* Makes sure that the specified crc function produces the expected results for known input and output*/ -static int s_test_known_crc( + uint32_t crc = ~previousCrc; + while (length-- > 0) { + crc ^= *input++; + for (int j = 8; j > 0; --j) { + crc = (crc >> 1) ^ ((((crc & 1) ^ 1) - 1) & polynomial); + } + } + return ~crc; +} + +// Very, very slow reference implementation that computes a CRC32. +static uint32_t s_crc32_reference(const uint8_t *input, int length, const uint32_t previousCrc) { + return s_crc_32_reference(input, length, previousCrc, POLY_CRC32); +} + +// Very, very slow reference implementation that computes a CRC32c. +static uint32_t s_crc32c_reference(const uint8_t *input, int length, const uint32_t previousCrc) { + return s_crc_32_reference(input, length, previousCrc, POLY_CRC32C); +} + +/* Makes sure that the specified crc function produces the expected results for known input and output */ +static int s_test_known_crc_32( const char *func_name, crc_fn *func, const char *data_name, const uint8_t *input, - size_t length, - uint32_t expected) { + const size_t length, + const uint32_t expected_crc, + const uint32_t expected_residue) { uint32_t result = func(input, (int)length, 0); - ASSERT_HEX_EQUALS(expected, result, "%s(%s)", func_name, data_name); + ASSERT_HEX_EQUALS(expected_crc, result, "%s(%s)", func_name, data_name); - /* chain the crc computation so 2 calls each operate on about 1/2 of the buffer*/ + // Compute the residue of the buffer (the CRC of the buffer plus its CRC) - will always be a constant value + uint32_t residue = (uint32_t)func((const uint8_t *)&result, 4, result); // assuming little endian + ASSERT_HEX_EQUALS(expected_residue, residue, "len %d residue %s(%s)", length, func_name, data_name); + + // chain the crc computation so 2 calls each operate on about 1/2 of the buffer uint32_t crc1 = func(input, (int)(length / 2), 0); result = func(input + (length / 2), (int)(length - length / 2), crc1); - ASSERT_HEX_EQUALS(expected, result, "chaining %s(%s)", func_name, data_name); + ASSERT_HEX_EQUALS(expected_crc, result, "chaining %s(%s)", func_name, data_name); crc1 = 0; for (size_t i = 0; i < length; ++i) { crc1 = func(input + i, 1, crc1); } - - ASSERT_HEX_EQUALS(expected, crc1, "one byte at a time %s(%s)", func_name, data_name); + ASSERT_HEX_EQUALS(expected_crc, crc1, "one byte at a time %s(%s)", func_name, data_name); return AWS_OP_SUCCESS; } +/* helper function that tests increasing input data lengths vs the reference crc function */ +static int s_test_vs_reference_crc_32( + struct aws_allocator *allocator, + uint32_t polynomial, + uint32_t residue, + const char *func_name, + crc_fn *func) { + + int res = 0; + + struct aws_byte_buf test_buf; + ASSERT_SUCCESS(aws_byte_buf_init(&test_buf, allocator, TEST_BUFFER_SIZE)); + + // Spin through buffer offsets + for (int off = 0; off < 16; off++) { + // Fill the test buffer with different values for each iteration + aws_byte_buf_write_u8_n(&test_buf, (uint8_t)off + 129, test_buf.capacity - test_buf.len); + uint32_t expected = 0; + int len = 1; + // Spin through input data lengths + for (int i = 0; i < (TEST_BUFFER_SIZE - off) && !res; i++, len++) { + test_buf.buffer[off + i] = (uint8_t)((i + 1) * 131); + // Compute the expected CRC one byte at a time using the reference function + expected = s_crc_32_reference(&test_buf.buffer[off + i], 1, expected, polynomial); + // Recompute the full CRC of the buffer at each offset and length and compare against expected value + res |= s_test_known_crc_32(func_name, func, "test_buffer", &test_buf.buffer[off], len, expected, residue); + if (res != 0) { + continue; + } + } + aws_byte_buf_reset(&test_buf, false); + } + aws_byte_buf_clean_up(&test_buf); + + return res; +} + /* helper function that groups crc32 tests*/ -static int s_test_known_crc32(const char *func_name, crc_fn *func) { +static int s_test_known_crc32(struct aws_allocator *allocator, const char *func_name, crc_fn *func) { int res = 0; - res |= s_test_known_crc(func_name, func, DATA_NAME(DATA_32_ZEROS), KNOWN_CRC32_32_ZEROES); - res |= s_test_known_crc(func_name, func, DATA_NAME(DATA_32_VALUES), KNOWN_CRC32_32_VALUES); - res |= s_test_known_crc(func_name, func, DATA_NAME(TEST_VECTOR), KNOWN_CRC32_TEST_VECTOR); + res |= s_test_known_crc_32(func_name, func, DATA_NAME(DATA_32_ZEROS), KNOWN_CRC32_32_ZEROES, RESIDUE_CRC32); + res |= s_test_known_crc_32(func_name, func, DATA_NAME(DATA_32_VALUES), KNOWN_CRC32_32_VALUES, RESIDUE_CRC32); + res |= s_test_known_crc_32(func_name, func, DATA_NAME(TEST_VECTOR), KNOWN_CRC32_TEST_VECTOR, RESIDUE_CRC32); + if (func != s_crc32_reference) { + res |= s_test_vs_reference_crc_32(allocator, POLY_CRC32, RESIDUE_CRC32, func_name, func); + } return res; } /* helper function that groups crc32c tests*/ -static int s_test_known_crc32c(const char *func_name, crc_fn *func) { +static int s_test_known_crc32c(struct aws_allocator *allocator, const char *func_name, crc_fn *func) { int res = 0; - res |= s_test_known_crc(func_name, func, DATA_NAME(DATA_32_ZEROS), KNOWN_CRC32C_32_ZEROES); - res |= s_test_known_crc(func_name, func, DATA_NAME(DATA_32_VALUES), KNOWN_CRC32C_32_VALUES); - res |= s_test_known_crc(func_name, func, DATA_NAME(TEST_VECTOR), KNOWN_CRC32C_TEST_VECTOR); - - /*this tests three things, first it tests the case where we aren't 8-byte aligned*/ - /*seconde, it tests that reads aren't performed before start of buffer*/ - /*third, it tests that writes aren't performed after the end of the buffer.*/ - /*if any of those things happen, then the checksum will be wrong and the assertion will fail */ - s_non_mem_aligned_vector = malloc(sizeof(DATA_32_VALUES) + 6); - memset(s_non_mem_aligned_vector, 1, sizeof(DATA_32_VALUES) + 6); - memcpy(s_non_mem_aligned_vector + 3, DATA_32_VALUES, sizeof(DATA_32_VALUES)); - res |= s_test_known_crc( - func_name, - func, - "non_mem_aligned_vector", - s_non_mem_aligned_vector + 3, - sizeof(DATA_32_VALUES), - KNOWN_CRC32C_32_VALUES); - free(s_non_mem_aligned_vector); + res |= s_test_known_crc_32(func_name, func, DATA_NAME(DATA_32_ZEROS), KNOWN_CRC32C_32_ZEROES, RESIDUE_CRC32C); + res |= s_test_known_crc_32(func_name, func, DATA_NAME(DATA_32_VALUES), KNOWN_CRC32C_32_VALUES, RESIDUE_CRC32C); + res |= s_test_known_crc_32(func_name, func, DATA_NAME(TEST_VECTOR), KNOWN_CRC32C_TEST_VECTOR, RESIDUE_CRC32C); + if (func != s_crc32c_reference) { + res |= s_test_vs_reference_crc_32(allocator, POLY_CRC32C, RESIDUE_CRC32C, func_name, func); + } + return res; } @@ -93,24 +159,26 @@ static int s_test_known_crc32c(const char *func_name, crc_fn *func) { * The reference functions are included in these tests to verify that they aren't obviously broken. */ static int s_test_crc32c(struct aws_allocator *allocator, void *ctx) { - (void)allocator; (void)ctx; int res = 0; - res |= s_test_known_crc32c(CRC_FUNC_NAME(aws_checksums_crc32c)); - res |= s_test_known_crc32c(CRC_FUNC_NAME(aws_checksums_crc32c_sw)); + res |= s_test_known_crc32c(allocator, CRC_FUNC_NAME(s_crc32c_reference)); + res |= s_test_known_crc32c(allocator, CRC_FUNC_NAME(aws_checksums_crc32c_sw)); + res |= s_test_known_crc32c(allocator, CRC_FUNC_NAME(aws_checksums_crc32c)); return res; } AWS_TEST_CASE(test_crc32c, s_test_crc32c) static int s_test_crc32(struct aws_allocator *allocator, void *ctx) { - (void)allocator; (void)ctx; int res = 0; - res |= s_test_known_crc32(CRC_FUNC_NAME(aws_checksums_crc32)); + + res |= s_test_known_crc32(allocator, CRC_FUNC_NAME(s_crc32_reference)); + res |= s_test_known_crc32(allocator, CRC_FUNC_NAME(aws_checksums_crc32_sw)); + res |= s_test_known_crc32(allocator, CRC_FUNC_NAME(aws_checksums_crc32)); return res; }