From 7b50807ddb2fc145220e6beb815de61b4b23f95d Mon Sep 17 00:00:00 2001 From: Yotaro Kubo Date: Tue, 3 Oct 2023 03:57:52 +0900 Subject: [PATCH] SIMD-ize auto-correlation function in LPC. --- report/report.md | 18 +++++++-------- src/fakesimd.rs | 38 +++++++++++++++++++++--------- src/lpc.rs | 60 +++++++++++++++++++++++++++++++++++++++++------- 3 files changed, 88 insertions(+), 28 deletions(-) diff --git a/report/report.md b/report/report.md index a487c03..67755ab 100644 --- a/report/report.md +++ b/report/report.md @@ -17,20 +17,20 @@ Sources used: wikimedia.i_love_you_california, wikimedia.winter_kiss, wikimedia. - st: 0.5443052357800366 - dmse: 0.5418910269181569 - bsbs: 0.5435087256676912 - - mae: 0.5374008633412148 + - mae: 0.5374571118440491 ### Average compression speed (inverse RTF) - Reference - - opt8lax: 265.72298864106625 - - opt8: 262.0175361163109 - - opt5: 510.05557499954114 + - opt8lax: 257.2307394912905 + - opt8: 259.9417855241402 + - opt5: 498.18069606854715 - Ours - - default: 212.53670254739373 - - st: 84.90619245444263 - - dmse: 123.82711138553906 - - bsbs: 11.764164674514397 - - mae: 25.7701793126406 + - default: 222.1443707999417 + - st: 95.26297207652442 + - dmse: 128.94722929725017 + - bsbs: 13.508132367761204 + - mae: 26.8191475592669 diff --git a/src/fakesimd.rs b/src/fakesimd.rs index 8fccdce..dd1442e 100644 --- a/src/fakesimd.rs +++ b/src/fakesimd.rs @@ -22,6 +22,7 @@ use std::array; // TYPE DEFINITION // === #[derive(Clone, Copy, Debug, Eq, PartialEq, PartialOrd)] +#[repr(transparent)] pub struct Simd([T; LANES]); #[derive(Clone, Copy, Debug, Eq, PartialEq, PartialOrd)] @@ -45,6 +46,10 @@ impl SimdElement for u32 { type Mask = Self; } +impl SimdElement for f32 { + type Mask = Self; +} + pub trait SupportedLaneCount {} pub struct LaneCount(); impl SupportedLaneCount for LaneCount<16> {} @@ -87,6 +92,9 @@ pub type i32x16 = Simd; #[allow(non_camel_case_types)] pub type u32x16 = Simd; +#[allow(non_camel_case_types)] +pub type f32x32 = Simd; + // === // IMPLEMENTATION OF `fakesimd::Simd` (non-trait methods) // === @@ -233,6 +241,22 @@ macro_rules! def_binop { }; } +macro_rules! def_binop_assign { + ($trait_name:ident, $binop_name: ident, $fn_name:ident, $expr:expr) => { + impl std::ops::$trait_name for Simd + where + T: SimdElement, + Self: std::ops::$binop_name, + { + #[inline] + #[allow(clippy::redundant_closure_call)] + fn $fn_name(&mut self, rhs: U) { + *self = ($expr)(*self, rhs); + } + } + }; +} + def_binop!(Add, add, |x, y| x + y); def_binop!(Sub, sub, |x, y| x - y); def_binop!(Mul, mul, |x, y| x * y); @@ -240,6 +264,9 @@ def_binop!(Div, div, |x, y| x / y); def_binop!(BitAnd, bitand, |x, y| x & y); def_binop!(Shr, shr, |x, y| x >> y); +def_binop_assign!(AddAssign, Add, add_assign, |x, y| x + y); +def_binop_assign!(SubAssign, Sub, sub_assign, |x, y| x - y); + impl std::convert::AsRef<[T; N]> for Simd where T: SimdElement, @@ -282,14 +309,3 @@ where &mut self.as_mut_array()[index] } } - -impl std::ops::SubAssign for Simd -where - T: SimdElement, - Self: std::ops::Sub, -{ - #[inline] - fn sub_assign(&mut self, rhs: U) { - *self = *self - rhs; - } -} diff --git a/src/lpc.rs b/src/lpc.rs index a96e88e..d8822c6 100644 --- a/src/lpc.rs +++ b/src/lpc.rs @@ -329,6 +329,41 @@ pub fn delay_sum(order: usize, signal: &[f32], dest: &mut nalgebra::DMatrix weighted_delay_sum(order, signal, dest, |_t| 1.0f32); } +#[inline] +#[allow(clippy::needless_range_loop)] // for readability +#[allow(dead_code)] // not used in "fakesimd" but should still be compilable. +fn weighted_auto_correlation_simd(order: usize, signal: &[f32], dest: &mut [f32], weight_fn: F) +where + F: Fn(usize) -> f32, +{ + assert!(dest.len() >= order); + + let mut lagged = simd::f32x32::splat(0f32); + // acc0 is computed separately for better SIMD alignment. + let mut acc0 = 0.0f32; + let mut acc = simd::f32x32::splat(0f32); + for tau in 0..order - 1 { + lagged[tau] = signal[order - 2 - tau]; + } + for t in (order - 1)..signal.len() { + let w = weight_fn(t); + let y = signal[t]; + acc0 += w * y * y; + acc += simd::f32x32::splat(w * y) * lagged; + lagged = lagged.rotate_lanes_right::<1>(); + lagged[0] = y; + } + for (tau, mut_p) in dest.iter_mut().enumerate() { + *mut_p = if tau == 0 { + acc0 + } else if tau < order { + acc[tau - 1] + } else { + 0.0 + }; + } +} + /// Compute weighted auto-correlation coefficients. /// /// # Panics @@ -338,15 +373,24 @@ pub fn weighted_auto_correlation(order: usize, signal: &[f32], dest: &mut [f3 where F: Fn(usize) -> f32, { - assert!(dest.len() >= order); - for p in &mut *dest { - *p = 0.0; + #[cfg(not(feature = "fakesimd"))] + { + // The current implementation is inefficient with fakesimd when order + // is low. So, here we still have a scalar version of it. + weighted_auto_correlation_simd(order, signal, dest, weight_fn); } - for t in (order - 1)..signal.len() { - let w = weight_fn(t); - for tau in 0..order { - let v: f32 = signal[t] * signal[t - tau] * w; - dest[tau] += v; + #[cfg(feature = "fakesimd")] + { + assert!(dest.len() >= order); + for p in &mut *dest { + *p = 0.0; + } + for t in (order - 1)..signal.len() { + let w = weight_fn(t); + for tau in 0..order { + let v: f32 = signal[t] * signal[t - tau] * w; + dest[tau] += v; + } } } }