Skip to content

Commit

Permalink
Comments in serialize.rs
Browse files Browse the repository at this point in the history
  • Loading branch information
xvzcf committed May 17, 2024
1 parent 494709f commit 6e2b44d
Showing 1 changed file with 80 additions and 28 deletions.
108 changes: 80 additions & 28 deletions polynomials-avx2/src/serialize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,29 @@ use crate::{portable, SIMD256Vector};

#[inline(always)]
pub(crate) fn serialize_1(vector: __m256i) -> [u8; 2] {
let lsb_shifted_up = mm256_slli_epi16::<15>(vector);

let low_lanes = mm256_castsi256_si128(lsb_shifted_up);
let high_lanes = mm256_extracti128_si256::<1>(lsb_shifted_up);

let msbs = mm_packs_epi16(low_lanes, high_lanes);

// We care only about the least significant bit in each lane,
// move it to the most significant position to make it easier to work with.
let lsb_to_msb = mm256_slli_epi16::<15>(vector);

// Get the first 8 16-bit elements ...
let low_msbs = mm256_castsi256_si128(lsb_to_msb);

// ... and the next 8 16-bit elements ...
let high_msbs = mm256_extracti128_si256::<1>(lsb_to_msb);

// ... and then pack them into 8-bit values using signed saturation.
// This function packs all the |low_msbs|, and then the high ones.
//
// We shifted by 15 above to take advantage of signed saturation:
//
// - if the sign bit of the 16-bit element being packed is 1, the
// corresponding 8-bit element in |msbs| will be 0xFF.
// - if the sign bit of the 16-bit element being packed is 0, the
// corresponding 8-bit element in |msbs| will be 0.
let msbs = mm_packs_epi16(low_msbs, high_msbs);

// Now that we have all 16 bits we need conveniently placed in one vector,
// extract them into two bytes.
let bits_packed = mm_movemask_epi8(msbs);

let mut serialized = [0u8; 2];
Expand All @@ -22,25 +38,19 @@ pub(crate) fn serialize_1(vector: __m256i) -> [u8; 2] {

#[inline(always)]
pub(crate) fn deserialize_1(bytes: &[u8]) -> __m256i {
let shift_lsb_to_msb = mm256_set_epi16(
1 << 0,
1 << 1,
1 << 2,
1 << 3,
1 << 4,
1 << 5,
1 << 6,
1 << 7,
1 << 0,
1 << 1,
1 << 2,
1 << 3,
1 << 4,
1 << 5,
1 << 6,
1 << 7,
);

// We need to take each bit from the 2 bytes of input and put them
// into their own 16-bit lane. Ideally, we'd load the two bytes into the vector,
// duplicate them, and right-shift the 0th element by 0 bits,
// the first element by 1 bit, the second by 2 bits and so on before AND-ing
// with 0x1 to leave only the least signifinicant bit.
// But |_mm256_srlv_epi16| does not exist unfortunately, so we have to resort
// to a workaround.
//
// Rather than shifting each element by a different amount, we'll multiply
// each element by a value such that the bit we're interested in becomes the most
// significant bit.

// The coefficients are loaded as follows:
let coefficients = mm256_set_epi16(
bytes[1] as i16,
bytes[1] as i16,
Expand All @@ -60,16 +70,46 @@ pub(crate) fn deserialize_1(bytes: &[u8]) -> __m256i {
bytes[0] as i16,
);

// And this vector, when multiplied with the previous one, ensures that the
// bit we'd like to keep in each lane becomes the most significant bit upon
// multiplication.
let shift_lsb_to_msb = mm256_set_epi16(
1 << 8,
1 << 9,
1 << 10,
1 << 11,
1 << 12,
1 << 13,
1 << 14,
1 << 15,
1 << 8,
1 << 9,
1 << 10,
1 << 11,
1 << 12,
1 << 13,
1 << 14,
1 << 15,
);
let coefficients_in_msb = mm256_mullo_epi16(coefficients, shift_lsb_to_msb);
let coefficients_in_lsb = mm256_srli_epi16::<7>(coefficients_in_msb);

mm256_and_si256(coefficients_in_lsb, mm256_set1_epi16((1 << 1) - 1))
// Now that they're all in the most significant bit position, shift them
// down to the least significant bit.
mm256_srli_epi16::<15>(coefficients_in_msb)
}

#[inline(always)]
pub(crate) fn serialize_4(vector: __m256i) -> [u8; 8] {
let mut serialized = [0u8; 16];

// If |vector| is laid out as follows:
//
// 0x000A 0x000B 0x000C 0x000D | 0x000E 0x000F 0x000G 0x000H | ....
//
// |adjacent_2_combined| will be laid out as a series of 32-bit integeres,
// as follows:
//
// 0x00_00_00_BA 0x00_00_00_DC | 0x00_00_00_FE 0x00_00_00_HG | ...
let adjacent_2_combined = mm256_madd_epi16(
vector,
mm256_set_epi16(
Expand All @@ -92,6 +132,12 @@ pub(crate) fn serialize_4(vector: __m256i) -> [u8; 8] {
),
);

// Recall that |adjacent_2_combined| goes as follows:
//
// 0x00_00_00_BA 0x00_00_00_DC | 0x00_00_00_FE 0x00_00_00_HG | ...
//
// Out of this, we only need the first byte, the 4th byte, the 8th byte
// and so on from the bottom and the top 128 bits.
let adjacent_8_combined = mm256_shuffle_epi8(
adjacent_2_combined,
mm256_set_epi8(
Expand All @@ -100,10 +146,16 @@ pub(crate) fn serialize_4(vector: __m256i) -> [u8; 8] {
),
);

// |adjacent_8_combined| looks like this:
//
// 0: 0xHG_FE_DC_BA 1: 0x00_00_00_00 | 2: 0x00_00_00_00 3: 0x00_00_00_00 | 4: 0xPO_NM_LK_JI ....
//
// We put the element at 4 after the element at 0 ...
let combined =
mm256_permutevar8x32_epi32(adjacent_8_combined, mm256_set_epi32(0, 0, 0, 0, 0, 0, 4, 0));
let combined = mm256_castsi256_si128(combined);

// ... so that we can read them out in one go.
mm_storeu_bytes_si128(&mut serialized[..], combined);

serialized[0..8].try_into().unwrap()
Expand Down

0 comments on commit 6e2b44d

Please sign in to comment.