Skip to content

Commit

Permalink
Implement clmul for crc8, 16, 32, test for all cases in the correctness tests (as the all test doesn't work for simd because of the amount of data it needs)
Browse files Browse the repository at this point in the history
  • Loading branch information
valaphee committed Apr 3, 2024
1 parent f336f1e commit 79e1c40
Show file tree
Hide file tree
Showing 12 changed files with 671 additions and 80 deletions.
12 changes: 12 additions & 0 deletions benches/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,17 @@ pub const BLUETOOTH: Crc<u8> = Crc::<u8>::new(&CRC_8_BLUETOOTH);
// CRC-8 (`CRC_8_BLUETOOTH`) instances, one per explicitly selected implementation parameter:
// `Table<16>`, `Table<1>`, `NoTable` and `Clmul`.
pub const BLUETOOTH_SLICE16: Crc<u8, Table<16>> = Crc::<u8, Table<16>>::new(&CRC_8_BLUETOOTH);
pub const BLUETOOTH_BYTEWISE: Crc<u8, Table<1>> = Crc::<u8, Table<1>>::new(&CRC_8_BLUETOOTH);
pub const BLUETOOTH_NOLOOKUP: Crc<u8, NoTable> = Crc::<u8, NoTable>::new(&CRC_8_BLUETOOTH);
pub const BLUETOOTH_CLMUL: Crc<u8, Clmul> = Crc::<u8, Clmul>::new(&CRC_8_BLUETOOTH);
// CRC-16 (`CRC_16_IBM_SDLC`) instances: the default implementation first, then the same
// explicit variants as above.
pub const X25: Crc<u16> = Crc::<u16>::new(&CRC_16_IBM_SDLC);
pub const X25_SLICE16: Crc<u16, Table<16>> = Crc::<u16, Table<16>>::new(&CRC_16_IBM_SDLC);
pub const X25_BYTEWISE: Crc<u16, Table<1>> = Crc::<u16, Table<1>>::new(&CRC_16_IBM_SDLC);
pub const X25_NOLOOKUP: Crc<u16, NoTable> = Crc::<u16, NoTable>::new(&CRC_16_IBM_SDLC);
pub const X25_CLMUL: Crc<u16, Clmul> = Crc::<u16, Clmul>::new(&CRC_16_IBM_SDLC);
// CRC-32 (`CRC_32_ISCSI`) instances: the default implementation first, then the same
// explicit variants as above.
pub const ISCSI: Crc<u32> = Crc::<u32>::new(&CRC_32_ISCSI);
pub const ISCSI_SLICE16: Crc<u32, Table<16>> = Crc::<u32, Table<16>>::new(&CRC_32_ISCSI);
pub const ISCSI_BYTEWISE: Crc<u32, Table<1>> = Crc::<u32, Table<1>>::new(&CRC_32_ISCSI);
pub const ISCSI_NOLOOKUP: Crc<u32, NoTable> = Crc::<u32, NoTable>::new(&CRC_32_ISCSI);
pub const ISCSI_CLMUL: Crc<u32, Clmul> = Crc::<u32, Clmul>::new(&CRC_32_ISCSI);
// Instances stored in `u64` (`CRC_40_GSM` and `CRC_64_ECMA_182`); no `Clmul` variant is
// declared for them in this part of the file.
pub const GSM_40: Crc<u64> = Crc::<u64>::new(&CRC_40_GSM);
pub const ECMA: Crc<u64> = Crc::<u64>::new(&CRC_64_ECMA_182);
pub const ECMA_SLICE16: Crc<u64, Table<16>> = Crc::<u64, Table<16>>::new(&CRC_64_ECMA_182);
Expand Down Expand Up @@ -51,6 +54,9 @@ fn checksum(c: &mut Criterion) {
})
.bench_function("slice16", |b| {
b.iter(|| BLUETOOTH_SLICE16.checksum(black_box(&bytes)))
})
.bench_function("clmul", |b| {
b.iter(|| BLUETOOTH_CLMUL.checksum(black_box(&bytes)))
});

c.benchmark_group("crc16")
Expand All @@ -64,6 +70,9 @@ fn checksum(c: &mut Criterion) {
})
.bench_function("slice16", |b| {
b.iter(|| X25_SLICE16.checksum(black_box(&bytes)))
})
.bench_function("clmul", |b| {
b.iter(|| X25_CLMUL.checksum(black_box(&bytes)))
});

c.benchmark_group("crc32")
Expand All @@ -77,6 +86,9 @@ fn checksum(c: &mut Criterion) {
})
.bench_function("slice16", |b| {
b.iter(|| ISCSI_SLICE16.checksum(black_box(&bytes)))
})
.bench_function("clmul", |b| {
b.iter(|| ISCSI_CLMUL.checksum(black_box(&bytes)))
});

c.benchmark_group("crc64")
Expand Down
112 changes: 112 additions & 0 deletions src/clmul.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
#[cfg(all(
target_feature = "sse2",
target_feature = "sse4.1",
target_feature = "pclmulqdq",
))]
mod x86;

#[cfg(all(
target_feature = "sse2",
target_feature = "sse4.1",
target_feature = "pclmulqdq",
))]
pub(crate) use x86::Value;

/// Primitive operations that `crc32_update_refin` needs from a 128-bit SIMD value.
///
/// The per-method notes describe what the x86 implementation does; other back-ends are
/// presumably expected to behave the same way.
trait ValueOps {
/// XORs `value` into `self`. The x86 implementation places `value` in the low 64 bits and
/// leaves the high 64 bits untouched.
fn xor(self, value: u64) -> Self;

/// Folds `self` into the next 16 bytes of data (`value`) using the constant pair `x_mod_p`.
///
/// On x86: `clmul(self.low, x_mod_p.low) ^ clmul(self.high, x_mod_p.high) ^ value`.
fn fold_16(self, x_mod_p: Self, value: Self) -> Self;

/// Folds the low 8 bytes of `self` onto its high 8 bytes.
///
/// On x86: `clmul(self.low, x_mod_p.high) ^ (self >> 8 bytes)`.
fn fold_8(self, x_mod_p: Self) -> Self;

/// Folds the low 4 bytes of `self` onto the remaining bytes.
///
/// On x86: `clmul(self & 0xFFFF_FFFF, x_mod_p.low) ^ (self >> 4 bytes)`.
fn fold_4(self, x_mod_p: Self) -> Self;

/// Barrett reduction of `self` down to a 32-bit result, using the constant pair `px_u`.
///
/// On x86: `t1 = clmul(self & 0xFFFF_FFFF, px_u.high)`,
/// `t2 = clmul(t1 & 0xFFFF_FFFF, px_u.low)`, and the result is 32-bit lane 1 of `self ^ t2`.
fn barret_reduction_32(self, px_u: Self) -> u32;
}

pub(crate) const fn crc32_clmul_coeff(width: u8, poly: u32) -> [Value; 4] {
const fn xt_mod_px(mut t: u32, px: u64) -> u64 {
if t < 32 {
return 0;
}
t -= 31;

let mut n = 0x80000000;
let mut i = 0;
while i < t {
n <<= 1;
if n & 0x100000000 != 0 {
n ^= px;
}
i += 1;
}
n << 32
}

const fn u(px: u64) -> u64 {
let mut q = 0;
let mut n = 0x100000000;
let mut i = 0;
while i < 33 {
q <<= 1;
if n & 0x100000000 != 0 {
q |= 1;
n ^= px;
}
n <<= 1;
i += 1;
}
q
}

let px = (poly as u64) << (u32::BITS as u8 - width);
unsafe {
// SAFETY: This will be evaluated during compile-time and therefore the alignment
// doesn't matter, the type is transmuted from 2*u64 to u64x2 simd type.
core::mem::transmute([
xt_mod_px(4 * 128 + 32, px).reverse_bits() << 1,
xt_mod_px(4 * 128 - 32, px).reverse_bits() << 1,
xt_mod_px(128 + 32, px).reverse_bits() << 1,
xt_mod_px(128 - 32, px).reverse_bits() << 1,
xt_mod_px(64, px).reverse_bits() << 1,
xt_mod_px(32, px).reverse_bits() << 1,
px.reverse_bits() >> 31,
u(px).reverse_bits() >> 31,
])
}
}

/// Folds a sequence of four-value chunks into a 32-bit CRC state using carry-less
/// multiplication.
///
/// * `crc` - the incoming CRC state; it is XORed into the first value of `first_chunk`.
/// * `coefficients` - the constant pairs `[k1_k2, k3_k4, k5_k6, px_u]`, as produced by
///   `crc32_clmul_coeff`.
/// * `first_chunk` - the first four values of the input.
/// * `chunks` - all following groups of four values; may be empty.
///
/// Returns the result of the final Barrett reduction.
pub(crate) fn crc32_update_refin(
    crc: u32,
    coefficients: &[Value; 4],
    first_chunk: &[Value; 4],
    chunks: &[[Value; 4]],
) -> u32 {
    let [k1_k2, k3_k4, k5_k6, px_u] = *coefficients;

    // Seed the four accumulators with the first chunk, mixing in the initial CRC value.
    let [head, second, third, fourth] = *first_chunk;
    let seeded = [head.xor(crc as u64), second, third, fourth];

    // Fold by 4: every accumulator absorbs the matching value of each following chunk.
    let lanes = chunks.iter().fold(seeded, |acc, chunk| {
        [
            acc[0].fold_16(k1_k2, chunk[0]),
            acc[1].fold_16(k1_k2, chunk[1]),
            acc[2].fold_16(k1_k2, chunk[2]),
            acc[3].fold_16(k1_k2, chunk[3]),
        ]
    });

    // Fold by 1: collapse the four accumulators into a single 128-bit value.
    let [first, rest @ ..] = lanes;
    let folded = rest
        .iter()
        .fold(first, |acc, &next| acc.fold_16(k3_k4, next));

    // Reduce the remaining 128 bits step by step, then finish with the Barrett reduction.
    folded
        .fold_8(k3_k4)
        .fold_4(k5_k6)
        .barret_reduction_32(px_u)
}
78 changes: 78 additions & 0 deletions src/clmul/x86.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
use crate::clmul::ValueOps;

#[cfg(target_arch = "x86")]
use core::arch::x86 as arch;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64 as arch;
use core::mem;

/// A 128-bit SIMD value: a thin wrapper around `__m128i` on which the carry-less
/// multiplication CRC operations (`ValueOps`) are implemented.
#[derive(Copy, Clone)]
pub struct Value(arch::__m128i);

/// x86/x86_64 implementation of the folding primitives on top of SSE and PCLMULQDQ intrinsics.
impl ValueOps for Value {
    /// XORs `value` into the low 64 bits of `self`; the high 64 bits are unchanged.
    #[inline]
    fn xor(self, value: u64) -> Self {
        // SAFETY: This is only implemented if the target supports sse2, sse4.1, and pclmulqdq
        unsafe {
            let low_only = arch::_mm_set_epi64x(0, value as i64);
            Self(arch::_mm_xor_si128(self.0, low_only))
        }
    }

    /// Returns `clmul(self.low, x_mod_p.low) ^ clmul(self.high, x_mod_p.high) ^ value`.
    #[inline]
    fn fold_16(self, x_mod_p: Self, value: Self) -> Self {
        // SAFETY: This is only implemented if the target supports sse2, sse4.1, and pclmulqdq
        unsafe {
            let low_product = arch::_mm_clmulepi64_si128(self.0, x_mod_p.0, 0x00);
            let high_product = arch::_mm_clmulepi64_si128(self.0, x_mod_p.0, 0x11);
            let high_with_data = arch::_mm_xor_si128(high_product, value.0);
            Self(arch::_mm_xor_si128(low_product, high_with_data))
        }
    }

    /// Returns `clmul(self.low, x_mod_p.high) ^ (self >> 8 bytes)`.
    #[inline]
    fn fold_8(self, x_mod_p: Self) -> Self {
        // SAFETY: This is only implemented if the target supports sse2, sse4.1, and pclmulqdq
        unsafe {
            let product = arch::_mm_clmulepi64_si128(self.0, x_mod_p.0, 0x10);
            let upper_half = arch::_mm_srli_si128(self.0, 8);
            Self(arch::_mm_xor_si128(product, upper_half))
        }
    }

    /// Returns `clmul(self & 0xFFFF_FFFF, x_mod_p.low) ^ (self >> 4 bytes)`.
    #[inline]
    fn fold_4(self, x_mod_p: Self) -> Self {
        // SAFETY: This is only implemented if the target supports sse2, sse4.1, and pclmulqdq
        unsafe {
            // Mask that keeps only the lowest 32 bits of the 128-bit value.
            let low_32: arch::__m128i = mem::transmute((1u128 << 32) - 1);
            let lowest_word = arch::_mm_and_si128(self.0, low_32);
            let product = arch::_mm_clmulepi64_si128(lowest_word, x_mod_p.0, 0x00);
            let remaining = arch::_mm_srli_si128(self.0, 4);
            Self(arch::_mm_xor_si128(product, remaining))
        }
    }

    /// Barrett reduction: multiplies the low 32 bits of `self` by the high half of `px_u`,
    /// multiplies the low 32 bits of that product by the low half of `px_u`, XORs the result
    /// into `self`, and returns 32-bit lane 1.
    #[inline]
    fn barret_reduction_32(self, px_u: Self) -> u32 {
        // SAFETY: This is only implemented if the target supports sse2, sse4.1, and pclmulqdq
        unsafe {
            // Mask that keeps only the lowest 32 bits of the 128-bit value.
            let low_32: arch::__m128i = mem::transmute((1u128 << 32) - 1);

            let t1 = arch::_mm_clmulepi64_si128(arch::_mm_and_si128(self.0, low_32), px_u.0, 0x10);
            let t2 = arch::_mm_clmulepi64_si128(arch::_mm_and_si128(t1, low_32), px_u.0, 0x00);

            let reduced = arch::_mm_xor_si128(self.0, t2);
            arch::_mm_extract_epi32(reduced, 1) as u32
        }
    }
}
27 changes: 17 additions & 10 deletions src/crc128.rs
Original file line number Diff line number Diff line change
Expand Up @@ -169,20 +169,19 @@ const fn update_slice16(
#[cfg(test)]
mod test {
use crate::*;
use crc_catalog::{Algorithm, CRC_82_DARC};

/// Test this optimized version against the well known implementation to ensure correctness
#[test]
fn correctness() {
let data: &[&str] = &[
"",
"1",
"1234",
"123456789",
"0123456789ABCDE",
"01234567890ABCDEFGHIJK",
"01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK",
];
"",
"1",
"1234",
"123456789",
"0123456789ABCDE",
"01234567890ABCDEFGHIJK",
"01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK01234567890ABCDEFGHIJK",
];

pub const CRC_82_DARC_NONREFLEX: Algorithm<u128> = Algorithm {
width: 82,
Expand All @@ -191,12 +190,20 @@ mod test {
refin: false,
refout: true,
xorout: 0x000000000000000000000,
check: 0x09ea83f625023801fd612,
check: 0x12e0b19fa447c0bf627ac,
residue: 0x000000000000000000000,
};

let algs_to_test = [&CRC_82_DARC, &CRC_82_DARC_NONREFLEX];

// Check if the baseline is as expected.
for alg in algs_to_test {
assert_eq!(
Crc::<u128, Table<1>>::new(alg).checksum("123456789".as_bytes()),
alg.check
);
}

for alg in algs_to_test {
for data in data {
let crc_slice16 = Crc::<u128, Table<16>>::new(alg);
Expand Down
Loading

0 comments on commit 79e1c40

Please sign in to comment.