make calling convention more consistent

Lokathor · Oct 21, 2024 · f5bfefa · f5bfefa
1 parent bd0cd2d
commit f5bfefa
Show file tree

Hide file tree

Showing 8 changed files with 90 additions and 88 deletions.
diff --git a/src/u16x8_.rs b/src/u16x8_.rs
@@ -591,6 +591,44 @@ impl u16x8 {
     }
   }
 
+  /// Multiplies two `u16x8` lane-wise and returns the high 16 bits of each intermediate 32-bit product.
+  #[inline]
+  #[must_use]
+  pub fn mul_keep_high(lhs: Self, rhs: Self) -> Self {
+    pick! {
+      if #[cfg(target_feature="sse2")] {
+        Self { sse: mul_u16_keep_high_m128i(lhs.sse, rhs.sse) }
+      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
+        let lhs_low = unsafe { vget_low_u16(lhs.neon) };
+        let rhs_low = unsafe { vget_low_u16(rhs.neon) };
+
+        let lhs_high = unsafe { vget_high_u16(lhs.neon) };
+        let rhs_high = unsafe { vget_high_u16(rhs.neon) };
+
+        let low = unsafe { vmull_u16(lhs_low, rhs_low) }; // widening multiply of lanes 0..=3
+        let high = unsafe { vmull_u16(lhs_high, rhs_high) }; // widening multiply of lanes 4..=7
+
+        Self { neon: unsafe { vuzpq_u16(vreinterpretq_u16_u32(low), vreinterpretq_u16_u32(high)).1 } }
+      } else if #[cfg(target_feature="simd128")] {
+        let low =  u32x4_extmul_low_u16x8(lhs.simd, rhs.simd);
+        let high = u32x4_extmul_high_u16x8(lhs.simd, rhs.simd);
+
+        Self { simd: i16x8_shuffle::<1, 3, 5, 7, 9, 11, 13, 15>(low, high) } // odd 16-bit lanes = upper halves of the products
+      } else {
+        u16x8::new([
+          ((u32::from(rhs.as_array_ref()[0]) * u32::from(lhs.as_array_ref()[0])) >> 16) as u16,
+          ((u32::from(rhs.as_array_ref()[1]) * u32::from(lhs.as_array_ref()[1])) >> 16) as u16,
+          ((u32::from(rhs.as_array_ref()[2]) * u32::from(lhs.as_array_ref()[2])) >> 16) as u16,
+          ((u32::from(rhs.as_array_ref()[3]) * u32::from(lhs.as_array_ref()[3])) >> 16) as u16,
+          ((u32::from(rhs.as_array_ref()[4]) * u32::from(lhs.as_array_ref()[4])) >> 16) as u16,
+          ((u32::from(rhs.as_array_ref()[5]) * u32::from(lhs.as_array_ref()[5])) >> 16) as u16,
+          ((u32::from(rhs.as_array_ref()[6]) * u32::from(lhs.as_array_ref()[6])) >> 16) as u16,
+          ((u32::from(rhs.as_array_ref()[7]) * u32::from(lhs.as_array_ref()[7])) >> 16) as u16,
+        ])
+      }
+    }
+  }
+
   #[inline]
   pub fn to_array(self) -> [u16; 8] {
     cast(self)

diff --git a/src/u32x4_.rs b/src/u32x4_.rs
@@ -450,37 +450,6 @@ impl u32x4 {
     rhs.cmp_gt(self)
   }
 
-  /// Multiplies the 32 bit values lane 0 and 2 and
-  /// returns the corresponding 64 bit result in lanes 0 and 1.
-  #[inline]
-  #[must_use]
-  pub fn mul_widen_even(self: u32x4, rhs: u32x4) -> u64x2 {
-    pick! {
-      if #[cfg(target_feature="sse2")] {
-        // safe_arch calls this odd, but lane# are 0 based, right?
-        cast(mul_widen_u32_odd_m128i(self.sse, rhs.sse))
-      } else if #[cfg(target_feature="simd128")] {
-        u64x2 { simd: i64x2_mul(
-          v128_and(self.simd, u64x2_splat(0xffffffff)),
-          v128_and(rhs.simd, u64x2_splat(0xffffffff)) ) }
-      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
-        unsafe {
-          let a = vget_low_u32(vuzpq_u32(self.neon, self.neon).0);
-          let b = vget_low_u32(vuzpq_u32(rhs.neon, rhs.neon).0);
-
-          u64x2 { neon: vmull_u32(a, b) }
-        }
-      } else {
-        let a: [u32; 4] = cast(self);
-        let b: [u32; 4] = cast(rhs);
-        cast([
-          (a[0] as u64) * (b[0] as u64),
-          (a[2] as u64) * (b[2] as u64),
-        ])
-      }
-    }
-  }
-
   /// Multiplies 32x32 bit to 64 bit and then only keeps the high 32 bits of the result.
   /// Useful for implementing divide constant value (see t_usefulness example)
   #[inline]

diff --git a/src/u32x8_.rs b/src/u32x8_.rs
@@ -301,28 +301,11 @@ impl u32x8 {
     rhs.cmp_gt(self)
   }
 
-  /// Multiplies the 32 bit values lane 0, 2, 4, 6
-  /// returns the corresponding 64 bit result in lanes 0,1,2,3.
-  #[inline]
-  #[must_use]
-  pub fn mul_widen_even(self: u32x8, rhs: u32x8) -> u64x4 {
-    pick! {
-      if #[cfg(target_feature="avx2")] {
-        cast(mul_u64_low_bits_m256i(self.avx2, rhs.avx2))
-      } else {
-        u64x4 {
-          a : self.a.mul_widen_even(rhs.a),
-          b : self.b.mul_widen_even(rhs.b),
-        }
-      }
-    }
-  }
-
   /// Multiplies 32x32 bit to 64 bit and then only keeps the high 32 bits of the result.
   /// Useful for implementing divide constant value (see t_usefulness example)
   #[inline]
   #[must_use]
-  pub fn mul_keep_high(self: u32x8, rhs: u32x8) -> u32x8 {
+  pub fn mul_keep_high(self, rhs: u32x8) -> u32x8 {
     pick! {
       if #[cfg(target_feature="avx2")] {
         let a : [u32;8]= cast(self);
@@ -334,10 +317,10 @@ impl u32x8 {
 
         cast([r1[1], r1[3], r1[5], r1[7], r2[1], r2[3], r2[5], r2[7]])
       } else {
-        let a: [u32x4; 2] = cast(self);
-        let b: [u32x4; 2] = cast(rhs);
-
-        cast([a[0].mul_keep_high(b[0]), a[1].mul_keep_high(b[1])])
+        Self {
+          a : self.a.mul_keep_high(rhs.a),
+          b : self.b.mul_keep_high(rhs.b),
+        }
       }
     }
   }

diff --git a/tests/all_tests/t_i16x8.rs b/tests/all_tests/t_i16x8.rs
@@ -369,10 +369,27 @@ fn impl_i16x8_reduce_max() {
 
 #[test]
 fn impl_mul_keep_high() {
-  let a = i16x8::from([1, 200, 300, 4568, -1, -2, -3, -4]);
-  let b = i16x8::from([5, 600, 700, 8910, -15, -26, -37, 48]);
+  let a = i16x8::from([i16::MAX, 200, 300, 4568, -1, -2, -3, -4]);
+  let b = i16x8::from([i16::MIN, 600, 700, 8910, -15, -26, -37, 48]);
   let c: [i16; 8] = i16x8::mul_keep_high(a, b).into();
-  assert_eq!(c, [0, 1, 3, 621, 0, 0, 0, -1]);
+  assert_eq!(
+    c,
+    [
+      (i32::from(i16::MAX) * i32::from(i16::MIN) >> 16) as i16,
+      1,
+      3,
+      621,
+      0,
+      0,
+      0,
+      -1
+    ]
+  );
+
+  crate::test_random_vector_vs_scalar(
+    |a: i16x8, b| i16x8::mul_keep_high(a, b),
+    |a, b| ((i32::from(a) * i32::from(b)) >> 16) as i16,
+  );
 }
 
 #[test]

diff --git a/tests/all_tests/t_u16x8.rs b/tests/all_tests/t_u16x8.rs
@@ -218,6 +218,31 @@ fn impl_u16x8_from_u8x16_high() {
   assert_eq!(expected, actual);
 }
 
+#[test]
+fn impl_u16x8_mul_keep_high() {
+  let a = u16x8::from([u16::MAX, 200, 300, 4568, 1, 2, 3, 200]);
+  let b = u16x8::from([u16::MAX, 600, 700, 8910, 15, 26, 37, 600]);
+  let c: [u16; 8] = u16x8::mul_keep_high(a, b).into();
+  assert_eq!(
+    c,
+    [
+      (u32::from(u16::MAX) * u32::from(u16::MAX) >> 16) as u16,
+      1,
+      3,
+      621,
+      0,
+      0,
+      0,
+      1
+    ]
+  );
+
+  crate::test_random_vector_vs_scalar(
+    |a: u16x8, b| u16x8::mul_keep_high(a, b),
+    |a, b| ((u32::from(a) * u32::from(b)) >> 16) as u16,
+  );
+}
+
 #[test]
 fn impl_u16x8_mul_widen() {
   let a = u16x8::from([1, 2, 3, 4, 5, 6, i16::MAX as u16, u16::MAX]);

diff --git a/tests/all_tests/t_u32x4.rs b/tests/all_tests/t_u32x4.rs
@@ -236,16 +236,6 @@ fn test_u32x4_none() {
   assert!(a.none());
 }
 
-#[test]
-fn test_u32x4_mul_widen_even() {
-  let a = u32x4::from([10, 2 /*ignored*/, 0xffffffff, 4 /*ignored*/]);
-  let b = u32x4::from([50, 6 /*ignored*/, 0xffffffff, 8 /*ignored*/]);
-
-  let expected = u64x2::from([10 * 50, 0xffffffff * 0xffffffff]);
-  let actual = a.mul_widen_even(b);
-  assert_eq!(expected, actual);
-}
-
 #[test]
 fn impl_u32x4_mul_widen() {
   let a = u32x4::from([1, 2, 3 * 1000000, u32::MAX]);

diff --git a/tests/all_tests/t_u32x8.rs b/tests/all_tests/t_u32x8.rs
@@ -296,30 +296,10 @@ fn test_u32x8_none() {
   assert!(a.none());
 }
 
-#[test]
-fn test_u32x8_mul_widen_even() {
-  let a = u32x8::from([
-    1, 2, /*odd ignored*/
-    3, 4, /*odd ignored*/
-    5, 6, /*odd ignored*/
-    0xffffffff, 8, /*odd ignored*/
-  ]);
-  let b = u32x8::from([
-    9, 10, /*odd ignored*/
-    11, 12, /*odd ignored*/
-    13, 14, /*odd ignored*/
-    0xffffffff, 16, /*odd ignored*/
-  ]);
-
-  let expected = u64x4::from([1 * 9, 3 * 11, 5 * 13, 0xffffffff * 0xffffffff]);
-  let actual = a.mul_widen_even(b);
-  assert_eq!(expected, actual);
-}
-
 #[test]
 fn impl_u32x8_mul_keep_high() {
   crate::test_random_vector_vs_scalar(
-    |a: u32x8, b| a.mul_keep_high(b),
+    |a: u32x8, b| u32x8::mul_keep_high(a, b),
     |a, b| ((u64::from(a) * u64::from(b)) >> 32) as u32,
   );
 }
diff --git a/tests/all_tests/t_usefulness.rs b/tests/all_tests/t_usefulness.rs
@@ -391,7 +391,7 @@ fn generate_branch_free_divide_magic_shift(denom: u32x8) -> (u32x8, u32x8) {
 
 // using the previously generated magic and shift, calculate the division
 fn branch_free_divide(numerator: u32x8, magic: u32x8, shift: u32x8) -> u32x8 {
-  let q = numerator.mul_keep_high(magic);
+  let q = u32x8::mul_keep_high(numerator, magic);
 
   let t = ((numerator - q) >> 1) + q;
   t >> shift