Skip to content

Commit c8d82bd

Browse files
add sse2 chaining trait
Signed-off-by: eternal-flame-AD <[email protected]>
1 parent 23f496d commit c8d82bd

File tree

6 files changed

+291
-24
lines changed

6 files changed

+291
-24
lines changed

salsa20/benches/mod.rs

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,15 @@
11
#![feature(test)]
2+
3+
use cipher::{
4+
Array,
5+
consts::{U1, U4, U64},
6+
};
27
extern crate test;
38

49
cipher::stream_cipher_bench!(
510
salsa20::Salsa8;
611
salsa8_bench1_16b 16;
12+
salsa8_bench1_64b 64;
713
salsa8_bench2_256b 256;
814
salsa8_bench3_1kib 1024;
915
salsa8_bench4_16kib 16384;
@@ -12,6 +18,7 @@ cipher::stream_cipher_bench!(
1218
cipher::stream_cipher_bench!(
1319
salsa20::Salsa12;
1420
salsa12_bench1_16b 16;
21+
salsa12_bench1_64b 64;
1522
salsa12_bench2_256b 256;
1623
salsa12_bench3_1kib 1024;
1724
salsa12_bench4_16kib 16384;
@@ -20,7 +27,50 @@ cipher::stream_cipher_bench!(
2027
cipher::stream_cipher_bench!(
2128
salsa20::Salsa20;
2229
salsa20_bench1_16b 16;
30+
salsa20_bench1_64b 64;
2331
salsa20_bench2_256b 256;
2432
salsa20_bench3_1kib 1024;
2533
salsa20_bench4_16kib 16384;
2634
);
35+
36+
#[bench]
37+
fn salsa8_bench1_ks_altn(b: &mut test::Bencher) {
38+
use salsa20::SalsaChaining;
39+
use std::hash::{BuildHasher, Hasher};
40+
41+
let seed = std::hash::RandomState::new().build_hasher().finish();
42+
43+
let mut buf: Array<[u32; 16], U1> = [[0u32; 16]].into();
44+
buf[0][0] = seed as u32;
45+
buf[0][1] = (seed >> 32) as u32;
46+
47+
b.iter(|| {
48+
let mut cipher = salsa20::SalsaCore::<U4>::from_raw_state_cv(buf);
49+
cipher.write_keystream_block_cv([&mut buf[0]].into());
50+
test::black_box(&buf);
51+
});
52+
53+
b.bytes = buf[0].len() as u64 * core::mem::size_of::<u32>() as u64;
54+
}
55+
56+
#[bench]
57+
fn salsa8_bench1_ks(b: &mut test::Bencher) {
58+
use cipher::StreamCipherCore;
59+
use std::hash::{BuildHasher, Hasher};
60+
61+
let seed = std::hash::RandomState::new().build_hasher().finish();
62+
63+
let mut buf = [0u32; 16];
64+
buf[0] = seed as u32;
65+
buf[1] = (seed >> 32) as u32;
66+
67+
b.iter(|| {
68+
let mut cipher = salsa20::SalsaCore::<U4>::from_raw_state(buf);
69+
cipher.write_keystream_block(unsafe {
70+
core::mem::transmute::<&mut [u32; 16], &mut Array<u8, U64>>(&mut buf)
71+
});
72+
test::black_box(&buf);
73+
});
74+
75+
b.bytes = buf.len() as u64 * core::mem::size_of::<u32>() as u64;
76+
}

salsa20/src/backends.rs

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,30 @@
1-
pub(crate) mod soft;
1+
use cfg_if::cfg_if;
2+
3+
cfg_if! {
4+
if #[cfg(all(target_feature = "sse2", any(target_arch = "x86", target_arch = "x86_64")))] {
5+
pub(crate) mod sse2;
6+
pub(crate) type Backend<'a, R> = sse2::Backend<'a, R>;
7+
} else {
8+
pub(crate) mod soft;
9+
pub(crate) type Backend<'a, R> = soft::Backend<'a, R>;
10+
}
11+
}
12+
13+
#[inline]
14+
#[allow(clippy::many_single_char_names)]
15+
pub(crate) fn quarter_round(
16+
a: usize,
17+
b: usize,
18+
c: usize,
19+
d: usize,
20+
state: &mut [u32; crate::STATE_WORDS],
21+
) {
22+
let a = crate::DATA_LAYOUT_INVERSE[a];
23+
let b = crate::DATA_LAYOUT_INVERSE[b];
24+
let c = crate::DATA_LAYOUT_INVERSE[c];
25+
let d = crate::DATA_LAYOUT_INVERSE[d];
26+
state[b] ^= state[a].wrapping_add(state[d]).rotate_left(7);
27+
state[c] ^= state[b].wrapping_add(state[a]).rotate_left(9);
28+
state[d] ^= state[c].wrapping_add(state[b]).rotate_left(13);
29+
state[a] ^= state[d].wrapping_add(state[c]).rotate_left(18);
30+
}

salsa20/src/backends/soft.rs

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,16 @@ use cipher::{
77
consts::{U1, U64},
88
};
99

10+
use super::quarter_round;
11+
1012
pub(crate) struct Backend<'a, R: Unsigned>(pub(crate) &'a mut SalsaCore<R>);
1113

14+
impl<'a, R: Unsigned> From<&'a mut SalsaCore<R>> for Backend<'a, R> {
15+
fn from(core: &'a mut SalsaCore<R>) -> Self {
16+
Backend(core)
17+
}
18+
}
19+
1220
impl<R: Unsigned> BlockSizeUser for Backend<'_, R> {
1321
type BlockSize = U64;
1422
}
@@ -17,6 +25,17 @@ impl<R: Unsigned> ParBlocksSizeUser for Backend<'_, R> {
1725
type ParBlocksSize = U1;
1826
}
1927

28+
impl<R: Unsigned> Backend<'_, R> {
29+
#[inline(always)]
30+
pub(crate) fn gen_ks_block_altn(&mut self, block: &mut [u32; STATE_WORDS]) {
31+
let res = run_rounds::<R>(&self.0.state);
32+
33+
self.0.set_block_pos(self.0.get_block_pos() + 1);
34+
35+
block.copy_from_slice(&res);
36+
}
37+
}
38+
2039
impl<R: Unsigned> StreamCipherBackend for Backend<'_, R> {
2140
#[inline(always)]
2241
fn gen_ks_block(&mut self, block: &mut Block<Self>) {
@@ -31,25 +50,6 @@ impl<R: Unsigned> StreamCipherBackend for Backend<'_, R> {
3150
}
3251
}
3352

34-
#[inline]
35-
#[allow(clippy::many_single_char_names)]
36-
pub(crate) fn quarter_round(
37-
a: usize,
38-
b: usize,
39-
c: usize,
40-
d: usize,
41-
state: &mut [u32; STATE_WORDS],
42-
) {
43-
let a = crate::DATA_LAYOUT_INVERSE[a];
44-
let b = crate::DATA_LAYOUT_INVERSE[b];
45-
let c = crate::DATA_LAYOUT_INVERSE[c];
46-
let d = crate::DATA_LAYOUT_INVERSE[d];
47-
state[b] ^= state[a].wrapping_add(state[d]).rotate_left(7);
48-
state[c] ^= state[b].wrapping_add(state[a]).rotate_left(9);
49-
state[d] ^= state[c].wrapping_add(state[b]).rotate_left(13);
50-
state[a] ^= state[d].wrapping_add(state[c]).rotate_left(18);
51-
}
52-
5353
#[inline(always)]
5454
fn run_rounds<R: Unsigned>(state: &[u32; STATE_WORDS]) -> [u32; STATE_WORDS] {
5555
let mut res = *state;

salsa20/src/backends/sse2.rs

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
//! SSE2 backend for Salsa20.
2+
3+
use crate::{Block, STATE_WORDS, SalsaCore, Unsigned};
4+
use cipher::{
5+
BlockSizeUser, ParBlocksSizeUser, StreamCipherBackend, StreamCipherSeekCore,
6+
consts::{U1, U64},
7+
};
8+
9+
pub(crate) struct Backend<'a, R: Unsigned>(pub(crate) &'a mut SalsaCore<R>);
10+
11+
impl<'a, R: Unsigned> From<&'a mut SalsaCore<R>> for Backend<'a, R> {
12+
fn from(core: &'a mut SalsaCore<R>) -> Self {
13+
Backend(core)
14+
}
15+
}
16+
17+
impl<R: Unsigned> BlockSizeUser for Backend<'_, R> {
18+
type BlockSize = U64;
19+
}
20+
21+
impl<R: Unsigned> ParBlocksSizeUser for Backend<'_, R> {
22+
type ParBlocksSize = U1;
23+
}
24+
25+
impl<R: Unsigned> Backend<'_, R> {
26+
#[inline(always)]
27+
pub(crate) fn gen_ks_block_altn(&mut self, block: &mut [u32; STATE_WORDS]) {
28+
let res = run_rounds_sse2::<R>(&self.0.state);
29+
30+
self.0.set_block_pos(self.0.get_block_pos() + 1);
31+
32+
block.copy_from_slice(&res);
33+
}
34+
}
35+
36+
impl<R: Unsigned> StreamCipherBackend for Backend<'_, R> {
37+
#[inline(always)]
38+
fn gen_ks_block(&mut self, block: &mut Block<Self>) {
39+
let res = run_rounds_sse2::<R>(&self.0.state);
40+
41+
self.0.set_block_pos(self.0.get_block_pos() + 1);
42+
43+
for i in 0..16 {
44+
block[i * 4..(i + 1) * 4]
45+
.copy_from_slice(&res[crate::DATA_LAYOUT_INVERSE[i]].to_le_bytes());
46+
}
47+
}
48+
}
49+
50+
#[inline(always)]
51+
fn run_rounds_sse2<R: Unsigned>(state: &[u32; STATE_WORDS]) -> [u32; STATE_WORDS] {
52+
use core::arch::x86_64::*;
53+
unsafe {
54+
let [a_save, b_save, d_save, c_save] = [
55+
_mm_load_si128(state.as_ptr().add(0).cast()),
56+
_mm_load_si128(state.as_ptr().add(4).cast()),
57+
_mm_load_si128(state.as_ptr().add(8).cast()),
58+
_mm_load_si128(state.as_ptr().add(12).cast()),
59+
];
60+
let [mut a, mut b, mut c, mut d] = [a_save, b_save, c_save, d_save];
61+
62+
macro_rules! mm_rol_epi32x {
63+
($w:expr, $amt:literal) => {{
64+
let w = $w;
65+
_mm_xor_si128(_mm_slli_epi32(w, $amt), _mm_srli_epi32(w, 32 - $amt))
66+
}};
67+
}
68+
69+
macro_rules! quarter_xmmwords {
70+
($a:expr, $b:expr, $c:expr, $d:expr) => {
71+
$b = _mm_xor_si128($b, mm_rol_epi32x!(_mm_add_epi32($a, $d), 7));
72+
$c = _mm_xor_si128($c, mm_rol_epi32x!(_mm_add_epi32($b, $a), 9));
73+
$d = _mm_xor_si128($d, mm_rol_epi32x!(_mm_add_epi32($c, $b), 13));
74+
$a = _mm_xor_si128($a, mm_rol_epi32x!(_mm_add_epi32($d, $c), 18));
75+
};
76+
}
77+
78+
for _ in 0..R::USIZE {
79+
quarter_xmmwords!(a, b, c, d);
80+
81+
// a stays in place
82+
// b = left shuffle d by 1 element
83+
d = _mm_shuffle_epi32(d, 0b00_11_10_01);
84+
// c = left shuffle c by 2 elements
85+
c = _mm_shuffle_epi32(c, 0b01_00_11_10);
86+
// d = left shuffle b by 3 elements
87+
b = _mm_shuffle_epi32(b, 0b10_01_00_11);
88+
89+
(b, d) = (d, b);
90+
91+
quarter_xmmwords!(a, b, c, d);
92+
93+
// a stays in place
94+
// b = left shuffle d by 1 element
95+
d = _mm_shuffle_epi32(d, 0b00_11_10_01);
96+
// c = left shuffle c by 2 elements
97+
c = _mm_shuffle_epi32(c, 0b01_00_11_10);
98+
// d = left shuffle b by 3 elements
99+
b = _mm_shuffle_epi32(b, 0b10_01_00_11);
100+
101+
(b, d) = (d, b);
102+
}
103+
104+
let mut res = [0u32; STATE_WORDS];
105+
_mm_storeu_si128(res.as_mut_ptr().add(0).cast(), _mm_add_epi32(a, a_save));
106+
_mm_storeu_si128(res.as_mut_ptr().add(4).cast(), _mm_add_epi32(b, b_save));
107+
_mm_storeu_si128(res.as_mut_ptr().add(8).cast(), _mm_add_epi32(d, d_save));
108+
_mm_storeu_si128(res.as_mut_ptr().add(12).cast(), _mm_add_epi32(c, c_save));
109+
res
110+
}
111+
}

salsa20/src/lib.rs

Lines changed: 80 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,8 @@ pub use cipher;
7878
use cipher::{
7979
Block, BlockSizeUser, IvSizeUser, KeyIvInit, KeySizeUser, StreamCipherClosure,
8080
StreamCipherCore, StreamCipherCoreWrapper, StreamCipherSeekCore,
81-
array::{Array, typenum::Unsigned},
82-
consts::{U4, U6, U8, U10, U24, U32, U64},
81+
array::{Array, ArraySize, typenum::Unsigned},
82+
consts::{U1, U4, U6, U8, U10, U24, U32, U64},
8383
};
8484
use core::marker::PhantomData;
8585

@@ -138,13 +138,62 @@ const DATA_LAYOUT_INVERSE: [usize; 16] = {
138138
};
139139

140140
/// The Salsa20 core function.
141+
#[repr(C)]
142+
#[repr(align(16))]
141143
pub struct SalsaCore<R: Unsigned> {
142144
/// Internal state of the core function
143145
state: [u32; STATE_WORDS],
144146
/// Number of rounds to perform
145147
rounds: PhantomData<R>,
146148
}
147149

150+
#[expect(unused)]
151+
const STATIC_ASSERT_CORE_IS_64_BYTES: [(); size_of::<SalsaCore<U10>>()] = [(); 64];
152+
153+
/// Salsa20 chaining operations.
154+
pub trait SalsaChaining: BlockSizeUser<BlockSize = U64> {
155+
/// Number of lanes
156+
type LaneCount: ArraySize;
157+
158+
/// Permutation table for shuffling the natural order state into the internal order.
159+
const ALTN_DATA_LAYOUT: [usize; STATE_WORDS];
160+
161+
/// Shuffle the state into the internal data layout.
162+
fn shuffle_state_into_altn(state: &mut [u32; STATE_WORDS]) {
163+
for i in 0..STATE_WORDS {
164+
state[i] = state[Self::ALTN_DATA_LAYOUT[i]];
165+
}
166+
}
167+
168+
/// Shuffle the state from the internal data layout.
169+
fn shuffle_state_from_altn(state: &mut [u32; STATE_WORDS]) {
170+
const INVERSE_ALTN_DATA_LAYOUT: [usize; STATE_WORDS] = {
171+
let mut index = [0; 16];
172+
let mut i = 0;
173+
while i < 16 {
174+
let mut inverse = 0;
175+
while inverse < 16 {
176+
if DATA_LAYOUT[inverse] == i {
177+
index[i] = inverse;
178+
break;
179+
}
180+
inverse += 1;
181+
}
182+
i += 1;
183+
}
184+
index
185+
};
186+
for i in 0..STATE_WORDS {
187+
state[i] = state[INVERSE_ALTN_DATA_LAYOUT[i]];
188+
}
189+
}
190+
191+
/// Instantiate new Salsa core from raw state in internal order.
192+
fn from_raw_state_cv(state: Array<[u32; STATE_WORDS], Self::LaneCount>) -> Self;
193+
/// Generate keystream block in internal order.
194+
fn write_keystream_block_cv(&mut self, block: Array<&mut [u32; STATE_WORDS], Self::LaneCount>);
195+
}
196+
148197
impl<R: Unsigned> SalsaCore<R> {
149198
/// Create new Salsa core from raw state.
150199
///
@@ -158,6 +207,34 @@ impl<R: Unsigned> SalsaCore<R> {
158207
}
159208
}
160209

210+
impl<R: Unsigned> SalsaChaining for SalsaCore<R> {
211+
type LaneCount = U1;
212+
213+
const ALTN_DATA_LAYOUT: [usize; STATE_WORDS] = DATA_LAYOUT;
214+
215+
/// Create new Salsa core from raw state with alternative data layout.
216+
///
217+
/// This method is mainly intended for the `scrypt` crate.
218+
/// Other users generally should not use this method.
219+
fn from_raw_state_cv(state: Array<[u32; STATE_WORDS], Self::LaneCount>) -> Self {
220+
Self {
221+
state: state[0],
222+
rounds: PhantomData,
223+
}
224+
}
225+
226+
/// Generate keystream block with alternative data layout.
227+
///
228+
/// This method is used to generate keystream blocks with alternative data layout.
229+
fn write_keystream_block_cv(
230+
&mut self,
231+
mut block: Array<&mut [u32; STATE_WORDS], Self::LaneCount>,
232+
) {
233+
let mut backend = backends::Backend::<'_, R>::from(self);
234+
backend.gen_ks_block_altn(&mut block[0]);
235+
}
236+
}
237+
161238
impl<R: Unsigned> KeySizeUser for SalsaCore<R> {
162239
type KeySize = U32;
163240
}
@@ -209,7 +286,7 @@ impl<R: Unsigned> StreamCipherCore for SalsaCore<R> {
209286
rem.try_into().ok()
210287
}
211288
/// Runs the closure `f` against the backend selected in `backends`, which borrows `self`.
fn process_with_backend(&mut self, f: impl StreamCipherClosure<BlockSize = Self::BlockSize>) {
    let mut backend = backends::Backend::<'_, R>::from(self);
    f.call(&mut backend);
}
214291
}
215292

0 commit comments

Comments
 (0)