From 7b50807ddb2fc145220e6beb815de61b4b23f95d Mon Sep 17 00:00:00 2001
From: Yotaro Kubo <yotaro@google.com>
Date: Tue, 3 Oct 2023 03:57:52 +0900
Subject: [PATCH] SIMD-ize auto-correlation function in LPC.

---
 report/report.md | 18 +++++++--------
 src/fakesimd.rs  | 38 +++++++++++++++++++++---------
 src/lpc.rs       | 60 +++++++++++++++++++++++++++++++++++++++++-------
 3 files changed, 88 insertions(+), 28 deletions(-)
diff --git a/report/report.md b/report/report.md
index a487c03..67755ab 100644
--- a/report/report.md
+++ b/report/report.md
@@ -17,20 +17,20 @@ Sources used: wikimedia.i_love_you_california, wikimedia.winter_kiss, wikimedia.
     - st: 0.5443052357800366
     - dmse: 0.5418910269181569
     - bsbs: 0.5435087256676912
-    - mae: 0.5374008633412148
+    - mae: 0.5374571118440491
 
 
 ### Average compression speed (inverse RTF)
   - Reference
-    - opt8lax: 265.72298864106625
-    - opt8: 262.0175361163109
-    - opt5: 510.05557499954114
+    - opt8lax: 257.2307394912905
+    - opt8: 259.9417855241402
+    - opt5: 498.18069606854715
 
   - Ours
-    - default: 212.53670254739373
-    - st: 84.90619245444263
-    - dmse: 123.82711138553906
-    - bsbs: 11.764164674514397
-    - mae: 25.7701793126406
+    - default: 222.1443707999417
+    - st: 95.26297207652442
+    - dmse: 128.94722929725017
+    - bsbs: 13.508132367761204
+    - mae: 26.8191475592669
 
 
diff --git a/src/fakesimd.rs b/src/fakesimd.rs
index 8fccdce..dd1442e 100644
--- a/src/fakesimd.rs
+++ b/src/fakesimd.rs
@@ -22,6 +22,7 @@ use std::array;
 // TYPE DEFINITION
 // ===
 #[derive(Clone, Copy, Debug, Eq, PartialEq, PartialOrd)]
+#[repr(transparent)]
 pub struct Simd<T: SimdElement, const LANES: usize>([T; LANES]);
 
 #[derive(Clone, Copy, Debug, Eq, PartialEq, PartialOrd)]
@@ -45,6 +46,10 @@ impl SimdElement for u32 {
     type Mask = Self;
 }
 
+impl SimdElement for f32 {
+    type Mask = Self;
+}
+
 pub trait SupportedLaneCount {}
 pub struct LaneCount<const LANES: usize>();
 impl SupportedLaneCount for LaneCount<16> {}
@@ -87,6 +92,9 @@ pub type i32x16 = Simd<i32, 16>;
 #[allow(non_camel_case_types)]
 pub type u32x16 = Simd<u32, 16>;
 
+#[allow(non_camel_case_types)]
+pub type f32x32 = Simd<f32, 32>;
+
 // ===
 // IMPLEMENTATION OF `fakesimd::Simd` (non-trait methods)
 // ===
@@ -233,6 +241,22 @@ macro_rules! def_binop {
     };
 }
 
+macro_rules! def_binop_assign {
+    ($trait_name:ident, $binop_name: ident, $fn_name:ident, $expr:expr) => {
+        impl<U, T, const N: usize> std::ops::$trait_name<U> for Simd<T, N>
+        where
+            T: SimdElement,
+            Self: std::ops::$binop_name<U, Output = Self>,
+        {
+            #[inline]
+            #[allow(clippy::redundant_closure_call)]
+            fn $fn_name(&mut self, rhs: U) {
+                *self = ($expr)(*self, rhs);
+            }
+        }
+    };
+}
+
 def_binop!(Add, add, |x, y| x + y);
 def_binop!(Sub, sub, |x, y| x - y);
 def_binop!(Mul, mul, |x, y| x * y);
@@ -240,6 +264,9 @@ def_binop!(Div, div, |x, y| x / y);
 def_binop!(BitAnd, bitand, |x, y| x & y);
 def_binop!(Shr, shr, |x, y| x >> y);
 
+def_binop_assign!(AddAssign, Add, add_assign, |x, y| x + y);
+def_binop_assign!(SubAssign, Sub, sub_assign, |x, y| x - y);
+
 impl<T, const N: usize> std::convert::AsRef<[T; N]> for Simd<T, N>
 where
     T: SimdElement,
@@ -282,14 +309,3 @@ where
         &mut self.as_mut_array()[index]
     }
 }
-
-impl<U, T, const N: usize> std::ops::SubAssign<U> for Simd<T, N>
-where
-    T: SimdElement,
-    Self: std::ops::Sub<U, Output = Self>,
-{
-    #[inline]
-    fn sub_assign(&mut self, rhs: U) {
-        *self = *self - rhs;
-    }
-}
diff --git a/src/lpc.rs b/src/lpc.rs
index a96e88e..d8822c6 100644
--- a/src/lpc.rs
+++ b/src/lpc.rs
@@ -329,6 +329,41 @@ pub fn delay_sum(order: usize, signal: &[f32], dest: &mut nalgebra::DMatrix<f32>
     weighted_delay_sum(order, signal, dest, |_t| 1.0f32);
 }
 
+#[inline]
+#[allow(clippy::needless_range_loop)] // for readability
+#[allow(dead_code)] // not used in "fakesimd" but should still be compilable.
+fn weighted_auto_correlation_simd<F>(order: usize, signal: &[f32], dest: &mut [f32], weight_fn: F)
+where
+    F: Fn(usize) -> f32,
+{
+    assert!(dest.len() >= order);
+
+    let mut lagged = simd::f32x32::splat(0f32);
+    // acc0 is computed separately for better SIMD alignment.
+    let mut acc0 = 0.0f32;
+    let mut acc = simd::f32x32::splat(0f32);
+    for tau in 0..order - 1 {
+        lagged[tau] = signal[order - 2 - tau];
+    }
+    for t in (order - 1)..signal.len() {
+        let w = weight_fn(t);
+        let y = signal[t];
+        acc0 += w * y * y;
+        acc += simd::f32x32::splat(w * y) * lagged;
+        lagged = lagged.rotate_lanes_right::<1>();
+        lagged[0] = y;
+    }
+    for (tau, mut_p) in dest.iter_mut().enumerate() {
+        *mut_p = if tau == 0 {
+            acc0
+        } else if tau < order {
+            acc[tau - 1]
+        } else {
+            0.0
+        };
+    }
+}
+
 /// Compute weighted auto-correlation coefficients.
 ///
 /// # Panics
@@ -338,15 +373,24 @@ pub fn weighted_auto_correlation<F>(order: usize, signal: &[f32], dest: &mut [f3
 where
     F: Fn(usize) -> f32,
 {
-    assert!(dest.len() >= order);
-    for p in &mut *dest {
-        *p = 0.0;
+    #[cfg(not(feature = "fakesimd"))]
+    {
+        // The current implementation is inefficient with fakesimd when order
+        // is low. So, here we still have a scalar version of it.
+        weighted_auto_correlation_simd(order, signal, dest, weight_fn);
     }
-    for t in (order - 1)..signal.len() {
-        let w = weight_fn(t);
-        for tau in 0..order {
-            let v: f32 = signal[t] * signal[t - tau] * w;
-            dest[tau] += v;
+    #[cfg(feature = "fakesimd")]
+    {
+        assert!(dest.len() >= order);
+        for p in &mut *dest {
+            *p = 0.0;
+        }
+        for t in (order - 1)..signal.len() {
+            let w = weight_fn(t);
+            for tau in 0..order {
+                let v: f32 = signal[t] * signal[t - tau] * w;
+                dest[tau] += v;
+            }
         }
     }
 }