From 04e96f708fb4ea8b7fc4f664229228ee9ee7868a Mon Sep 17 00:00:00 2001
From: Cuda-Chen
Date: Tue, 30 Jan 2024 20:51:12 +0800
Subject: [PATCH] Apply suggestion

---
 sse2neon.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sse2neon.h b/sse2neon.h
index 3e4bda1c..def9804b 100644
--- a/sse2neon.h
+++ b/sse2neon.h
@@ -8500,7 +8500,8 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
     crc = __crc32cb(crc, v);
 #else
     crc ^= v;
-#if defined(__ARM_FEATURE_CRYPTO)
+#if (defined(__ARM_FEATURE_CRYPTO) || \
+    (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64)))
     // Adapted from: https://mary.rs/lab/crc32/
     // Barrent reduction
     uint64x2_t orig =
@@ -8509,8 +8510,7 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
 
     // Polynomial P(x) of CRC32C
     uint64_t p = 0x105EC76F1;
-    // Barrett Reduction constant mu_{64} = \lfloor 2^{64} / P(x) \rfloor =
-    // 0x11f91caf6 in bit-reflected form
+    // Barrett Reduction (in bit-reflected form) constant mu_{64} = \lfloor 2^{64} / P(x) \rfloor = 0x11f91caf6
     uint64_t mu = 0x1dea713f1;
 
     // Multiply by mu_{64}
@@ -8525,7 +8525,7 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
 
     // Extract the 'lower' (in bit-reflected sense) 32 bits
     crc = vgetq_lane_u32(vreinterpretq_u32_u64(tmp), 1);
-#else
+#else // Fall back to the generic table lookup approach
     // Adapted from: https://create.stephan-brumme.com/crc32/
     // Apply half-byte comparision algorithm for the best ratio between
     // performance and lookup table.