diff --git a/polynomials-avx2/src/serialize.rs b/polynomials-avx2/src/serialize.rs index caf6ae565..f2e544b9b 100644 --- a/polynomials-avx2/src/serialize.rs +++ b/polynomials-avx2/src/serialize.rs @@ -4,13 +4,29 @@ use crate::{portable, SIMD256Vector}; #[inline(always)] pub(crate) fn serialize_1(vector: __m256i) -> [u8; 2] { - let lsb_shifted_up = mm256_slli_epi16::<15>(vector); - - let low_lanes = mm256_castsi256_si128(lsb_shifted_up); - let high_lanes = mm256_extracti128_si256::<1>(lsb_shifted_up); - - let msbs = mm_packs_epi16(low_lanes, high_lanes); - + // We care only about the least significant bit in each lane, + // move it to the most significant position to make it easier to work with. + let lsb_to_msb = mm256_slli_epi16::<15>(vector); + + // Get the first 8 16-bit elements ... + let low_msbs = mm256_castsi256_si128(lsb_to_msb); + + // ... and the next 8 16-bit elements ... + let high_msbs = mm256_extracti128_si256::<1>(lsb_to_msb); + + // ... and then pack them into 8-bit values using signed saturation. + // This function packs all the |low_msbs|, and then the high ones. + // + // We shifted by 15 above to take advantage of signed saturation: + // + // - if the sign bit of the 16-bit element being packed is 1, the + // corresponding 8-bit element in |msbs| will be 0xFF. + // - if the sign bit of the 16-bit element being packed is 0, the + // corresponding 8-bit element in |msbs| will be 0. + let msbs = mm_packs_epi16(low_msbs, high_msbs); + + // Now that we have all 16 bits we need conveniently placed in one vector, + // extract them into two bytes. let bits_packed = mm_movemask_epi8(msbs); let mut serialized = [0u8; 2]; @@ -22,25 +38,19 @@ pub(crate) fn serialize_1(vector: __m256i) -> [u8; 2] { #[inline(always)] pub(crate) fn deserialize_1(bytes: &[u8]) -> __m256i { - let shift_lsb_to_msb = mm256_set_epi16( - 1 << 0, - 1 << 1, - 1 << 2, - 1 << 3, - 1 << 4, - 1 << 5, - 1 << 6, - 1 << 7, - 1 << 0, - 1 << 1, - 1 << 2, - 1 << 3, - 1 << 4, - 1 << 5, - 1 << 6, - 1 << 7, - ); - + // We need to take each bit from the 2 bytes of input and put them + // into their own 16-bit lane. Ideally, we'd load the two bytes into the vector, + // duplicate them, and right-shift the 0th element by 0 bits, + // the first element by 1 bit, the second by 2 bits and so on before AND-ing + // with 0x1 to leave only the least signifinicant bit. + // But |_mm256_srlv_epi16| does not exist unfortunately, so we have to resort + // to a workaround. + // + // Rather than shifting each element by a different amount, we'll multiply + // each element by a value such that the bit we're interested in becomes the most + // significant bit. + + // The coefficients are loaded as follows: let coefficients = mm256_set_epi16( bytes[1] as i16, bytes[1] as i16, @@ -60,16 +70,46 @@ pub(crate) fn deserialize_1(bytes: &[u8]) -> __m256i { bytes[0] as i16, ); + // And this vector, when multiplied with the previous one, ensures that the + // bit we'd like to keep in each lane becomes the most significant bit upon + // multiplication. + let shift_lsb_to_msb = mm256_set_epi16( + 1 << 8, + 1 << 9, + 1 << 10, + 1 << 11, + 1 << 12, + 1 << 13, + 1 << 14, + 1 << 15, + 1 << 8, + 1 << 9, + 1 << 10, + 1 << 11, + 1 << 12, + 1 << 13, + 1 << 14, + 1 << 15, + ); let coefficients_in_msb = mm256_mullo_epi16(coefficients, shift_lsb_to_msb); - let coefficients_in_lsb = mm256_srli_epi16::<7>(coefficients_in_msb); - mm256_and_si256(coefficients_in_lsb, mm256_set1_epi16((1 << 1) - 1)) + // Now that they're all in the most significant bit position, shift them + // down to the least significant bit. + mm256_srli_epi16::<15>(coefficients_in_msb) } #[inline(always)] pub(crate) fn serialize_4(vector: __m256i) -> [u8; 8] { let mut serialized = [0u8; 16]; + // If |vector| is laid out as follows: + // + // 0x000A 0x000B 0x000C 0x000D | 0x000E 0x000F 0x000G 0x000H | .... + // + // |adjacent_2_combined| will be laid out as a series of 32-bit integeres, + // as follows: + // + // 0x00_00_00_BA 0x00_00_00_DC | 0x00_00_00_FE 0x00_00_00_HG | ... let adjacent_2_combined = mm256_madd_epi16( vector, mm256_set_epi16( @@ -92,6 +132,12 @@ pub(crate) fn serialize_4(vector: __m256i) -> [u8; 8] { ), ); + // Recall that |adjacent_2_combined| goes as follows: + // + // 0x00_00_00_BA 0x00_00_00_DC | 0x00_00_00_FE 0x00_00_00_HG | ... + // + // Out of this, we only need the first byte, the 4th byte, the 8th byte + // and so on from the bottom and the top 128 bits. let adjacent_8_combined = mm256_shuffle_epi8( adjacent_2_combined, mm256_set_epi8( @@ -100,10 +146,16 @@ pub(crate) fn serialize_4(vector: __m256i) -> [u8; 8] { ), ); + // |adjacent_8_combined| looks like this: + // + // 0: 0xHG_FE_DC_BA 1: 0x00_00_00_00 | 2: 0x00_00_00_00 3: 0x00_00_00_00 | 4: 0xPO_NM_LK_JI .... + // + // We put the element at 4 after the element at 0 ... let combined = mm256_permutevar8x32_epi32(adjacent_8_combined, mm256_set_epi32(0, 0, 0, 0, 0, 0, 4, 0)); let combined = mm256_castsi256_si128(combined); + // ... so that we can read them out in one go. mm_storeu_bytes_si128(&mut serialized[..], combined); serialized[0..8].try_into().unwrap()