diff --git a/sse2neon.h b/sse2neon.h
index 995c61ec..56254b5f 100644
--- a/sse2neon.h
+++ b/sse2neon.h
@@ -8500,12 +8500,47 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
     crc = __crc32cb(crc, v);
 #else
     crc ^= v;
-    for (int bit = 0; bit < 8; bit++) {
-        if (crc & 1)
-            crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
-        else
-            crc = (crc >> 1);
-    }
+#if defined(__ARM_FEATURE_CRYPTO)
+    // Adapted from: https://mary.rs/lab/crc32/
+    // Barrent reduction
+    uint64x2_t orig =
+        vcombine_u64(vcreate_u64((uint64_t) (crc) << 24), vcreate_u64(0x0));
+    uint64x2_t tmp = orig;
+
+    // Polynomial P(x) of CRC32C
+    uint64_t p = 0x105EC76F1;
+    // Barrett Reduction (in bit-reflected form) constant mu_{64} = \lfloor
+    // 2^{64} / P(x) \rfloor = 0x11f91caf6
+    uint64_t mu = 0x1dea713f1;
+
+    // Multiply by mu_{64}
+    tmp = _sse2neon_vmull_p64(vget_low_u64(tmp), vcreate_u64(mu));
+    // Divide by 2^{64} (mask away the unnecessary bits)
+    tmp =
+        vandq_u64(tmp, vcombine_u64(vcreate_u64(0xFFFFFFFF), vcreate_u64(0x0)));
+    // Multiply by P(x) (shifted left by 1 for alignment reasons)
+    tmp = _sse2neon_vmull_p64(vget_low_u64(tmp), vcreate_u64(p));
+    // Subtract original from result
+    tmp = veorq_u64(tmp, orig);
+
+    // Extract the 'lower' (in bit-reflected sense) 32 bits
+    crc = vgetq_lane_u32(vreinterpretq_u32_u64(tmp), 1);
+#else  // Fall back to the generic table lookup approach
+    // Adapted from: https://create.stephan-brumme.com/crc32/
+    // Apply half-byte comparision algorithm for the best ratio between
+    // performance and lookup table.
+
+    // The lookup table just needs to store every 16th entry
+    // of the standard look-up table.
+    static const uint32_t crc32_half_byte_tbl[] = {
+        0x00000000, 0x105ec76f, 0x20bd8ede, 0x30e349b1, 0x417b1dbc, 0x5125dad3,
+        0x61c69362, 0x7198540d, 0x82f63b78, 0x92a8fc17, 0xa24bb5a6, 0xb21572c9,
+        0xc38d26c4, 0xd3d3e1ab, 0xe330a81a, 0xf36e6f75,
+    };
+
+    crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0F];
+    crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0F];
+#endif
 #endif
     return crc;
 }
diff --git a/tests/impl.cpp b/tests/impl.cpp
index 22170608..7414bdcc 100644
--- a/tests/impl.cpp
+++ b/tests/impl.cpp
@@ -530,7 +530,7 @@ uint32_t canonical_crc32_u8(uint32_t crc, uint8_t v)
     crc ^= v;
     for (int bit = 0; bit < 8; bit++) {
         if (crc & 1)
-            crc = (crc >> 1) ^ uint32_t(0x82f63b78);
+            crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
         else
             crc = (crc >> 1);
     }