From 4b9e1095fbb2a63804680c88597ecd1dc6ce58f8 Mon Sep 17 00:00:00 2001 From: Valaphee <32491319+valaphee@users.noreply.github.com> Date: Sat, 11 Nov 2023 21:07:09 +0100 Subject: [PATCH] Adding support for crc8, crc16 and crc32 bit-reflected simd, x86 targets --- benches/bench.rs | 12 +++++++- src/crc16.rs | 13 ++++++++- src/crc16/simd.rs | 72 +++++++++++++++++++++++++++++++++++++++++++++ src/crc32.rs | 66 ++++++++++++++++++++++++++++++++++++------ src/crc32/simd.rs | 69 +++++++++++++++++++++++++++++++++++++++++++ src/crc64.rs | 2 +- src/crc8.rs | 13 ++++++++- src/crc8/simd.rs | 72 +++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 12 +++++++- src/simd.rs | 17 +++++++++++ src/simd/x86.rs | 74 +++++++++++++++++++++++++++++++++++++++++++++++ src/table.rs | 48 ++++++++++++++++++++++++++++++ 12 files changed, 456 insertions(+), 14 deletions(-) create mode 100644 src/crc16/simd.rs create mode 100644 src/crc32/simd.rs create mode 100644 src/crc8/simd.rs create mode 100644 src/simd.rs create mode 100644 src/simd/x86.rs diff --git a/benches/bench.rs b/benches/bench.rs index b3fa134..6a58904 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -5,14 +5,17 @@ pub const BLUETOOTH: Crc = Crc::::new(&CRC_8_BLUETOOTH); pub const BLUETOOTH_SLICE16: Crc> = Crc::>::new(&CRC_8_BLUETOOTH); pub const BLUETOOTH_BYTEWISE: Crc> = Crc::>::new(&CRC_8_BLUETOOTH); pub const BLUETOOTH_NOLOOKUP: Crc> = Crc::>::new(&CRC_8_BLUETOOTH); +pub const BLUETOOTH_SIMD: Crc> = Crc::>::new(&CRC_8_BLUETOOTH); pub const X25: Crc = Crc::::new(&CRC_16_IBM_SDLC); pub const X25_SLICE16: Crc> = Crc::>::new(&CRC_16_IBM_SDLC); pub const X25_BYTEWISE: Crc> = Crc::>::new(&CRC_16_IBM_SDLC); pub const X25_NOLOOKUP: Crc> = Crc::>::new(&CRC_16_IBM_SDLC); +pub const X25_SIMD: Crc> = Crc::>::new(&CRC_16_IBM_SDLC); pub const ISCSI: Crc = Crc::::new(&CRC_32_ISCSI); pub const ISCSI_SLICE16: Crc> = Crc::>::new(&CRC_32_ISCSI); pub const ISCSI_BYTEWISE: Crc> = Crc::>::new(&CRC_32_ISCSI); pub const ISCSI_NOLOOKUP: Crc> 
= Crc::>::new(&CRC_32_ISCSI); +pub const ISCSI_SIMD: Crc> = Crc::>::new(&CRC_32_ISCSI); pub const GSM_40: Crc = Crc::::new(&CRC_40_GSM); pub const ECMA: Crc = Crc::::new(&CRC_64_ECMA_182); pub const ECMA_SLICE16: Crc> = Crc::>::new(&CRC_64_ECMA_182); @@ -51,6 +54,9 @@ fn checksum(c: &mut Criterion) { }) .bench_function("slice16", |b| { b.iter(|| BLUETOOTH_SLICE16.checksum(black_box(&bytes))) + }) + .bench_function("simd", |b| { + b.iter(|| BLUETOOTH_SIMD.checksum(black_box(&bytes))) }); c.benchmark_group("crc16") @@ -64,7 +70,8 @@ fn checksum(c: &mut Criterion) { }) .bench_function("slice16", |b| { b.iter(|| X25_SLICE16.checksum(black_box(&bytes))) - }); + }) + .bench_function("simd", |b| b.iter(|| X25_SIMD.checksum(black_box(&bytes)))); c.benchmark_group("crc32") .throughput(Throughput::Bytes(size as u64)) @@ -77,6 +84,9 @@ fn checksum(c: &mut Criterion) { }) .bench_function("slice16", |b| { b.iter(|| ISCSI_SLICE16.checksum(black_box(&bytes))) + }) + .bench_function("simd", |b| { + b.iter(|| ISCSI_SIMD.checksum(black_box(&bytes))) }); c.benchmark_group("crc64") diff --git a/src/crc16.rs b/src/crc16.rs index 9001f2f..427906c 100644 --- a/src/crc16.rs +++ b/src/crc16.rs @@ -4,6 +4,7 @@ use crc_catalog::Algorithm; mod bytewise; mod default; mod nolookup; +mod simd; mod slice16; const fn init(algorithm: &Algorithm, initial: u16) -> u16 { @@ -141,7 +142,7 @@ const fn update_slice16( #[cfg(test)] mod test { - use crate::{Bytewise, Crc, Implementation, NoTable, Slice16}; + use crate::{Bytewise, Crc, Implementation, NoTable, Simd, Slice16}; use crc_catalog::{Algorithm, CRC_16_IBM_SDLC}; #[test] @@ -265,11 +266,13 @@ mod test { for data in data { let crc_slice16 = Crc::>::new(alg); let crc_nolookup = Crc::>::new(alg); + let crc_simd = Crc::>::new(alg); let expected = Crc::>::new(alg).checksum(data.as_bytes()); // Check that doing all at once works as expected assert_eq!(crc_slice16.checksum(data.as_bytes()), expected); assert_eq!(crc_nolookup.checksum(data.as_bytes()), 
expected); + assert_eq!(crc_simd.checksum(data.as_bytes()), expected); let mut digest = crc_slice16.digest(); digest.update(data.as_bytes()); @@ -279,6 +282,10 @@ mod test { digest.update(data.as_bytes()); assert_eq!(digest.finalize(), expected); + let mut digest = crc_simd.digest(); + digest.update(data.as_bytes()); + assert_eq!(digest.finalize(), expected); + // Check that we didn't break updating from multiple sources if data.len() > 2 { let data = data.as_bytes(); @@ -292,6 +299,10 @@ mod test { digest.update(data1); digest.update(data2); assert_eq!(digest.finalize(), expected); + let mut digest = crc_simd.digest(); + digest.update(data1); + digest.update(data2); + assert_eq!(digest.finalize(), expected); } } } diff --git a/src/crc16/simd.rs b/src/crc16/simd.rs new file mode 100644 index 0000000..1387714 --- /dev/null +++ b/src/crc16/simd.rs @@ -0,0 +1,72 @@ +use crate::crc32::update_simd; +use crate::simd::{SimdValue, SimdValueOps}; +use crate::table::{crc16_table_slice_16, crc32_simd_coefficients}; +use crate::{Algorithm, Crc, Digest, Simd}; + +use super::{finalize, init, update_slice16}; + +impl Crc> { + pub const fn new(algorithm: &'static Algorithm) -> Self { + Self { + algorithm, + table: ( + crc16_table_slice_16(algorithm.width, algorithm.poly, algorithm.refin), + unsafe { + // SAFETY: SimdValue is the same as u64x2 and this only changes the representation of 8*u64 to 4*u64x2. 
+ core::mem::transmute(crc32_simd_coefficients( + algorithm.width, + algorithm.poly as u32, + )) + }, + ), + } + } + + pub fn checksum(&self, bytes: &[u8]) -> u16 { + let mut crc = init(self.algorithm, self.algorithm.init); + crc = self.update(crc, bytes); + finalize(self.algorithm, crc) + } + + fn update(&self, mut crc: u16, bytes: &[u8]) -> u16 { + if !SimdValue::is_supported() || !self.algorithm.refin { + return update_slice16(crc, self.algorithm.refin, &self.table.0, bytes); + } + + let (bytes_before, chunks, bytes_after) = unsafe { bytes.align_to::<[SimdValue; 4]>() }; + crc = update_slice16(crc, self.algorithm.refin, &self.table.0, bytes_before); + if let Some(first_chunk) = chunks.first() { + // SAFETY: All required features are supported, by checking SimdValue::is_supported. + crc = unsafe { update_simd(crc as u32, &self.table.1, first_chunk, chunks) } as u16; + } + update_slice16(crc, self.algorithm.refin, &self.table.0, bytes_after) + } + + pub fn digest(&self) -> Digest> { + self.digest_with_initial(self.algorithm.init) + } + + /// Construct a `Digest` with a given initial value. + /// + /// This overrides the initial value specified by the algorithm. + /// The effects of the algorithm's properties `refin` and `width` + /// are applied to the custom initial value. 
+ pub fn digest_with_initial(&self, initial: u16) -> Digest> { + let value = init(self.algorithm, initial); + Digest::new(self, value) + } +} + +impl<'a> Digest<'a, Simd> { + const fn new(crc: &'a Crc>, value: u16) -> Self { + Digest { crc, value } + } + + pub fn update(&mut self, bytes: &[u8]) { + self.value = self.crc.update(self.value, bytes); + } + + pub const fn finalize(self) -> u16 { + finalize(self.crc.algorithm, self.value) + } +} diff --git a/src/crc32.rs b/src/crc32.rs index 0467533..fd4ec28 100644 --- a/src/crc32.rs +++ b/src/crc32.rs @@ -1,9 +1,11 @@ +use crate::simd::{SimdValue, SimdValueOps}; use crate::util::crc32; use crc_catalog::Algorithm; mod bytewise; mod default; mod nolookup; +mod simd; mod slice16; // init is shared between all impls @@ -150,9 +152,45 @@ const fn update_slice16( crc } +#[target_feature(enable = "sse2", enable = "sse4.1", enable = "pclmulqdq")] +pub(crate) unsafe fn update_simd( + crc: u32, + coefficients: &[SimdValue; 4], + first_chunk: &[SimdValue; 4], + chunks: &[[SimdValue; 4]], +) -> u32 { + let mut x4 = *first_chunk; + + // Apply initial crc value + x4[0] = x4[0].xor(crc as u64); + + // Iteratively Fold by 4: + let k1_k2 = coefficients[0]; + for chunk in chunks { + for (x, value) in x4.iter_mut().zip(chunk.iter()) { + *x = x.fold_16(k1_k2, *value) + } + } + + // Iteratively Fold by 1: + let k3_k4 = coefficients[1]; + let mut x = x4[0].fold_16(k3_k4, x4[1]); + x = x.fold_16(k3_k4, x4[2]); + x = x.fold_16(k3_k4, x4[3]); + + // Final Reduction of 128-bits + let k5_k6 = coefficients[2]; + x = x.fold_8(k3_k4); + x = x.fold_4(k5_k6); + + // Barret Reduction + let px_u = coefficients[3]; + x.barret_reduction_32(px_u) +} + #[cfg(test)] mod test { - use crate::{Bytewise, Crc, Implementation, NoTable, Slice16}; + use crate::{Bytewise, Crc, Implementation, NoTable, Simd, Slice16}; use crc_catalog::{Algorithm, CRC_32_ISCSI}; #[test] @@ -250,14 +288,14 @@ mod test { #[test] fn correctness() { let data: &[&str] = &[ - "", - "1", - 
"1234", - "123456789", - "0123456789ABCDE", - "01234567890ABCDEFGHIJK", - "01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK", - ]; + "", + "1", + "1234", + "123456789", + "0123456789A", + "01234567890ABCDEFGHIJK", + "01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK", + ]; pub const CRC_32_ISCSI_NONREFLEX: Algorithm = Algorithm { width: 32, @@ -277,11 +315,13 @@ mod test { for data in data { let crc_slice16 = Crc::>::new(alg); let crc_nolookup = Crc::>::new(alg); + let crc_simd = Crc::>::new(alg); let expected = Crc::>::new(alg).checksum(data.as_bytes()); // Check that doing all at once works as expected assert_eq!(crc_slice16.checksum(data.as_bytes()), expected); assert_eq!(crc_nolookup.checksum(data.as_bytes()), expected); + assert_eq!(crc_simd.checksum(data.as_bytes()), expected); let mut digest = crc_slice16.digest(); digest.update(data.as_bytes()); @@ -291,6 +331,10 @@ mod test { digest.update(data.as_bytes()); assert_eq!(digest.finalize(), expected); + let mut digest = crc_simd.digest(); + digest.update(data.as_bytes()); + assert_eq!(digest.finalize(), expected); + // Check that we didn't break updating from multiple sources if data.len() > 2 { let data = data.as_bytes(); @@ -304,6 +348,10 @@ mod test { digest.update(data1); digest.update(data2); assert_eq!(digest.finalize(), expected); + let mut digest = crc_simd.digest(); + digest.update(data1); + digest.update(data2); + assert_eq!(digest.finalize(), expected); } } } diff --git a/src/crc32/simd.rs b/src/crc32/simd.rs new file mode 100644 index 0000000..21f18b1 --- /dev/null +++ 
b/src/crc32/simd.rs @@ -0,0 +1,69 @@ +use crate::simd::{SimdValue, SimdValueOps}; +use crate::table::{crc32_simd_coefficients, crc32_table_slice_16}; +use crate::{Algorithm, Crc, Digest, Simd}; + +use super::{finalize, init, update_simd, update_slice16}; + +impl Crc> { + pub const fn new(algorithm: &'static Algorithm) -> Self { + Self { + algorithm, + table: ( + crc32_table_slice_16(algorithm.width, algorithm.poly, algorithm.refin), + unsafe { + // SAFETY: Both represent numbers + core::mem::transmute(crc32_simd_coefficients(algorithm.width, algorithm.poly)) + }, + ), + } + } + + pub fn checksum(&self, bytes: &[u8]) -> u32 { + let mut crc = init(self.algorithm, self.algorithm.init); + crc = self.update(crc, bytes); + finalize(self.algorithm, crc) + } + + fn update(&self, mut crc: u32, bytes: &[u8]) -> u32 { + if !SimdValue::is_supported() || !self.algorithm.refin { + return update_slice16(crc, self.algorithm.refin, &self.table.0, bytes); + } + + // SAFETY: Both represent numbers + let (bytes_before, chunks, bytes_after) = unsafe { bytes.align_to::<[SimdValue; 4]>() }; + crc = update_slice16(crc, self.algorithm.refin, &self.table.0, bytes_before); + if let Some(first_chunk) = chunks.first() { + // SAFETY: All required features are supported, by checking SimdValue::is_supported. + crc = unsafe { update_simd(crc, &self.table.1, first_chunk, &chunks[1..]) }; + } + update_slice16(crc, self.algorithm.refin, &self.table.0, bytes_after) + } + + pub fn digest(&self) -> Digest> { + self.digest_with_initial(self.algorithm.init) + } + + /// Construct a `Digest` with a given initial value. + /// + /// This overrides the initial value specified by the algorithm. + /// The effects of the algorithm's properties `refin` and `width` + /// are applied to the custom initial value. 
+ pub fn digest_with_initial(&self, initial: u32) -> Digest> { + let value = init(self.algorithm, initial); + Digest::new(self, value) + } +} + +impl<'a> Digest<'a, Simd> { + const fn new(crc: &'a Crc>, value: u32) -> Self { + Digest { crc, value } + } + + pub fn update(&mut self, bytes: &[u8]) { + self.value = self.crc.update(self.value, bytes); + } + + pub const fn finalize(self) -> u32 { + finalize(self.crc.algorithm, self.value) + } +} diff --git a/src/crc64.rs b/src/crc64.rs index 9abafd2..186531e 100644 --- a/src/crc64.rs +++ b/src/crc64.rs @@ -154,7 +154,7 @@ const fn update_slice16( #[cfg(test)] mod test { - use crate::{Bytewise, Crc, Implementation, NoTable, Slice16}; + use crate::{Bytewise, Crc, Implementation, NoTable, Simd, Slice16}; use crc_catalog::{Algorithm, CRC_64_ECMA_182}; #[test] diff --git a/src/crc8.rs b/src/crc8.rs index d33d906..73aaea8 100644 --- a/src/crc8.rs +++ b/src/crc8.rs @@ -4,6 +4,7 @@ use crc_catalog::Algorithm; mod bytewise; mod default; mod nolookup; +mod simd; mod slice16; const fn init(algorithm: &Algorithm, initial: u8) -> u8 { @@ -88,7 +89,7 @@ const fn update_slice16(mut crc: u8, table: &[[u8; 256]; 16], bytes: &[u8]) -> u #[cfg(test)] mod test { - use crate::{Bytewise, Crc, Implementation, NoTable, Slice16}; + use crate::{Bytewise, Crc, Implementation, NoTable, Simd, Slice16}; use crc_catalog::{Algorithm, CRC_8_BLUETOOTH}; #[test] @@ -212,11 +213,13 @@ mod test { for data in data { let crc_slice16 = Crc::>::new(alg); let crc_nolookup = Crc::>::new(alg); + let crc_simd: Crc> = Crc::>::new(alg); let expected = Crc::>::new(alg).checksum(data.as_bytes()); // Check that doing all at once works as expected assert_eq!(crc_slice16.checksum(data.as_bytes()), expected); assert_eq!(crc_nolookup.checksum(data.as_bytes()), expected); + assert_eq!(crc_simd.checksum(data.as_bytes()), expected); let mut digest = crc_slice16.digest(); digest.update(data.as_bytes()); @@ -226,6 +229,10 @@ mod test { digest.update(data.as_bytes()); 
assert_eq!(digest.finalize(), expected); + let mut digest = crc_simd.digest(); + digest.update(data.as_bytes()); + assert_eq!(digest.finalize(), expected); + // Check that we didn't break updating from multiple sources if data.len() > 2 { let data = data.as_bytes(); @@ -239,6 +246,10 @@ mod test { digest.update(data1); digest.update(data2); assert_eq!(digest.finalize(), expected); + let mut digest = crc_simd.digest(); + digest.update(data1); + digest.update(data2); + assert_eq!(digest.finalize(), expected); } } } diff --git a/src/crc8/simd.rs b/src/crc8/simd.rs new file mode 100644 index 0000000..cda6dc0 --- /dev/null +++ b/src/crc8/simd.rs @@ -0,0 +1,72 @@ +use crate::crc32::update_simd; +use crate::simd::{SimdValue, SimdValueOps}; +use crate::table::{crc32_simd_coefficients, crc8_table_slice_16}; +use crate::{Algorithm, Crc, Digest, Simd}; + +use super::{finalize, init, update_slice16}; + +impl Crc> { + pub const fn new(algorithm: &'static Algorithm) -> Self { + Self { + algorithm, + table: ( + crc8_table_slice_16(algorithm.width, algorithm.poly, algorithm.refin), + unsafe { + // SAFETY: SimdValue is the same as u64x2 and this only changes the representation of 8*u64 to 4*u64x2. + core::mem::transmute(crc32_simd_coefficients( + algorithm.width, + algorithm.poly as u32, + )) + }, + ), + } + } + + pub fn checksum(&self, bytes: &[u8]) -> u8 { + let mut crc = init(self.algorithm, self.algorithm.init); + crc = self.update(crc, bytes); + finalize(self.algorithm, crc) + } + + fn update(&self, mut crc: u8, bytes: &[u8]) -> u8 { + if !SimdValue::is_supported() { + return update_slice16(crc, &self.table.0, bytes); + } + + let (bytes_before, chunks, bytes_after) = unsafe { bytes.align_to::<[SimdValue; 4]>() }; + crc = update_slice16(crc, &self.table.0, bytes_before); + if let Some(first_chunk) = chunks.first() { + // SAFETY: All required features are supported, by checking SimdValue::is_supported. 
+ crc = unsafe { update_simd(crc as u32, &self.table.1, first_chunk, chunks) } as u8; + } + update_slice16(crc, &self.table.0, bytes_after) + } + + pub fn digest(&self) -> Digest> { + self.digest_with_initial(self.algorithm.init) + } + + /// Construct a `Digest` with a given initial value. + /// + /// This overrides the initial value specified by the algorithm. + /// The effects of the algorithm's properties `refin` and `width` + /// are applied to the custom initial value. + pub fn digest_with_initial(&self, initial: u8) -> Digest> { + let value = init(self.algorithm, initial); + Digest::new(self, value) + } +} + +impl<'a> Digest<'a, Simd> { + const fn new(crc: &'a Crc>, value: u8) -> Self { + Digest { crc, value } + } + + pub fn update(&mut self, bytes: &[u8]) { + self.value = self.crc.update(self.value, bytes); + } + + pub const fn finalize(self) -> u8 { + finalize(self.crc.algorithm, self.value) + } +} diff --git a/src/lib.rs b/src/lib.rs index 024e799..5e11f4f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -26,7 +26,7 @@ //! assert_eq!(digest.finalize(), 0xaee7); //! ``` #![no_std] -#![forbid(unsafe_code)] +//#![forbid(unsafe_code)] pub use crc_catalog::algorithm::*; pub use crc_catalog::{Algorithm, Width}; @@ -36,6 +36,7 @@ mod crc16; mod crc32; mod crc64; mod crc8; +mod simd; mod table; mod util; @@ -48,9 +49,13 @@ pub struct Bytewise(core::marker::PhantomData); /// Implementation using no lookup table. Use it with `Crc>` pub struct NoTable(core::marker::PhantomData); +/// Implementation using platform-specific simd instructions. 
/// Reports whether the `pclmulqdq`, `sse2` and `sse4.1` target features were all
/// enabled for this build.
///
/// The check is evaluated at compile time via `cfg!`; no runtime CPU detection
/// is performed.
fn is_supported() -> bool {
    cfg!(all(
        target_feature = "pclmulqdq",
        target_feature = "sse2",
        target_feature = "sse4.1"
    ))
}
/// Computes the eight 64-bit constants used by the bit-reflected SIMD folding
/// routine for a CRC of the given `width` (at most 32) and polynomial `poly`.
///
/// The polynomial is first left-aligned to 32 bits. The result is laid out as
/// four pairs:
///
/// | index | value                                   |
/// |-------|-----------------------------------------|
/// | 0, 1  | x^(4*128+32) mod P, x^(4*128-32) mod P  |
/// | 2, 3  | x^(128+32) mod P,   x^(128-32) mod P    |
/// | 4, 5  | x^64 mod P,         x^32 mod P          |
/// | 6, 7  | P itself,           quotient from `u`   |
///
/// Every entry is bit-reversed. The remainders are shifted left by one after
/// reversal; the last two entries are shifted right by 31 instead.
pub(crate) const fn crc32_simd_coefficients(width: u8, poly: u32) -> [u64; 8] {
    /// Bit 32, i.e. the position that overflows out of a 32-bit remainder.
    const OVERFLOW_BIT: u64 = 1 << 32;

    /// Returns `x^t mod px`, moved into the upper 32 bits; `0` when `t < 32`.
    const fn xt_mod_px(t: u32, px: u64) -> u64 {
        if t < 32 {
            return 0;
        }

        // Start from x^31 and multiply by x once per remaining exponent step.
        let mut remainder: u64 = 0x8000_0000;
        let mut steps_left = t - 31;
        while steps_left > 0 {
            remainder <<= 1;
            if remainder & OVERFLOW_BIT != 0 {
                remainder ^= px;
            }
            steps_left -= 1;
        }
        remainder << 32
    }

    /// Long division of x^32 (shifted once per round) by `px`, collecting the
    /// 33 quotient bits produced along the way.
    const fn u(px: u64) -> u64 {
        let mut quotient: u64 = 0;
        let mut dividend: u64 = OVERFLOW_BIT;
        let mut rounds_left = 33;
        while rounds_left > 0 {
            let subtract = dividend & OVERFLOW_BIT != 0;
            quotient = (quotient << 1) | subtract as u64;
            if subtract {
                dividend ^= px;
            }
            dividend <<= 1;
            rounds_left -= 1;
        }
        quotient
    }

    /// Bit-reversed remainder of `x^t`, shifted left by one.
    const fn folded(t: u32, px: u64) -> u64 {
        xt_mod_px(t, px).reverse_bits() << 1
    }

    /// Bit-reversed value moved down into the low 33 bits.
    const fn reflected_33(value: u64) -> u64 {
        value.reverse_bits() >> 31
    }

    // Left-align the polynomial so narrower CRCs reuse the 32-bit machinery.
    let align_shift = u32::BITS as u8 - width;
    let px = (poly as u64) << align_shift;

    [
        folded(4 * 128 + 32, px),
        folded(4 * 128 - 32, px),
        folded(128 + 32, px),
        folded(128 - 32, px),
        folded(64, px),
        folded(32, px),
        reflected_33(px),
        reflected_33(u(px)),
    ]
}