Comments in serialize.rs

cryspen · May 17, 2024 · 6e2b44d · 6e2b44d
1 parent 494709f
commit 6e2b44d
Showing 1 changed file with 80 additions and 28 deletions.
diff --git a/polynomials-avx2/src/serialize.rs b/polynomials-avx2/src/serialize.rs
@@ -4,13 +4,29 @@ use crate::{portable, SIMD256Vector};
 
 #[inline(always)]
 pub(crate) fn serialize_1(vector: __m256i) -> [u8; 2] {
-    let lsb_shifted_up = mm256_slli_epi16::<15>(vector);
-
-    let low_lanes = mm256_castsi256_si128(lsb_shifted_up);
-    let high_lanes = mm256_extracti128_si256::<1>(lsb_shifted_up);
-
-    let msbs = mm_packs_epi16(low_lanes, high_lanes);
-
+    // We care only about the least significant bit in each lane,
+    // move it to the most significant position to make it easier to work with.
+    let lsb_to_msb = mm256_slli_epi16::<15>(vector);
+
+    // Get the first 8 16-bit elements ...
+    let low_msbs = mm256_castsi256_si128(lsb_to_msb);
+
+    // ... and the next 8 16-bit elements ...
+    let high_msbs = mm256_extracti128_si256::<1>(lsb_to_msb);
+
+    // ... and then pack them into 8-bit values using signed saturation.
+    // This function packs all the |low_msbs|, and then the high ones.
+    //
+    // We shifted by 15 above to take advantage of signed saturation:
+    //
+    // - if the sign bit of the 16-bit element being packed is 1, the
+    // corresponding 8-bit element in |msbs| will be 0xFF.
+    // - if the sign bit of the 16-bit element being packed is 0, the
+    // corresponding 8-bit element in |msbs| will be 0.
+    let msbs = mm_packs_epi16(low_msbs, high_msbs);
+
+    // Now that we have all 16 bits we need conveniently placed in one vector,
+    // extract them into two bytes.
     let bits_packed = mm_movemask_epi8(msbs);
 
     let mut serialized = [0u8; 2];
@@ -22,25 +38,19 @@ pub(crate) fn serialize_1(vector: __m256i) -> [u8; 2] {
 
 #[inline(always)]
 pub(crate) fn deserialize_1(bytes: &[u8]) -> __m256i {
-    let shift_lsb_to_msb = mm256_set_epi16(
-        1 << 0,
-        1 << 1,
-        1 << 2,
-        1 << 3,
-        1 << 4,
-        1 << 5,
-        1 << 6,
-        1 << 7,
-        1 << 0,
-        1 << 1,
-        1 << 2,
-        1 << 3,
-        1 << 4,
-        1 << 5,
-        1 << 6,
-        1 << 7,
-    );
-
+    // We need to take each bit from the 2 bytes of input and put them
+    // into their own 16-bit lane. Ideally, we'd load the two bytes into the vector,
+    // duplicate them, and right-shift the 0th element by 0 bits,
+    // the first element by 1 bit, the second by 2 bits and so on before AND-ing
+    // with 0x1 to leave only the least signifinicant bit.
+    // But |_mm256_srlv_epi16| does not exist unfortunately, so we have to resort
+    // to a workaround.
+    //
+    // Rather than shifting each element by a different amount, we'll multiply
+    // each element by a value such that the bit we're interested in becomes the most
+    // significant bit.
+
+    // The coefficients are loaded as follows:
     let coefficients = mm256_set_epi16(
         bytes[1] as i16,
         bytes[1] as i16,
@@ -60,16 +70,46 @@ pub(crate) fn deserialize_1(bytes: &[u8]) -> __m256i {
         bytes[0] as i16,
     );
 
+    // And this vector, when multiplied with the previous one, ensures that the
+    // bit we'd like to keep in each lane becomes the most significant bit upon
+    // multiplication.
+    let shift_lsb_to_msb = mm256_set_epi16(
+        1 << 8,
+        1 << 9,
+        1 << 10,
+        1 << 11,
+        1 << 12,
+        1 << 13,
+        1 << 14,
+        1 << 15,
+        1 << 8,
+        1 << 9,
+        1 << 10,
+        1 << 11,
+        1 << 12,
+        1 << 13,
+        1 << 14,
+        1 << 15,
+    );
     let coefficients_in_msb = mm256_mullo_epi16(coefficients, shift_lsb_to_msb);
-    let coefficients_in_lsb = mm256_srli_epi16::<7>(coefficients_in_msb);
 
-    mm256_and_si256(coefficients_in_lsb, mm256_set1_epi16((1 << 1) - 1))
+    // Now that they're all in the most significant bit position, shift them
+    // down to the least significant bit.
+    mm256_srli_epi16::<15>(coefficients_in_msb)
 }
 
 #[inline(always)]
 pub(crate) fn serialize_4(vector: __m256i) -> [u8; 8] {
     let mut serialized = [0u8; 16];
 
+    // If |vector| is laid out as follows:
+    //
+    // 0x000A 0x000B 0x000C 0x000D | 0x000E 0x000F 0x000G 0x000H | ....
+    //
+    // |adjacent_2_combined| will be laid out as a series of 32-bit integeres,
+    // as follows:
+    //
+    // 0x00_00_00_BA 0x00_00_00_DC | 0x00_00_00_FE 0x00_00_00_HG | ...
     let adjacent_2_combined = mm256_madd_epi16(
         vector,
         mm256_set_epi16(
@@ -92,6 +132,12 @@ pub(crate) fn serialize_4(vector: __m256i) -> [u8; 8] {
         ),
     );
 
+    // Recall that |adjacent_2_combined| goes as follows:
+    //
+    // 0x00_00_00_BA 0x00_00_00_DC | 0x00_00_00_FE 0x00_00_00_HG | ...
+    //
+    // Out of this, we only need the first byte, the 4th byte, the 8th byte
+    // and so on from the bottom and the top 128 bits.
     let adjacent_8_combined = mm256_shuffle_epi8(
         adjacent_2_combined,
         mm256_set_epi8(
@@ -100,10 +146,16 @@ pub(crate) fn serialize_4(vector: __m256i) -> [u8; 8] {
         ),
     );
 
+    // |adjacent_8_combined| looks like this:
+    //
+    // 0: 0xHG_FE_DC_BA 1: 0x00_00_00_00 | 2: 0x00_00_00_00 3: 0x00_00_00_00 | 4: 0xPO_NM_LK_JI ....
+    //
+    // We put the element at 4 after the element at 0 ...
     let combined =
         mm256_permutevar8x32_epi32(adjacent_8_combined, mm256_set_epi32(0, 0, 0, 0, 0, 0, 4, 0));
     let combined = mm256_castsi256_si128(combined);
 
+    // ... so that we can read them out in one go.
     mm_storeu_bytes_si128(&mut serialized[..], combined);
 
     serialized[0..8].try_into().unwrap()