diff --git a/report/report.md b/report/report.md index 13e2e65..8099dc2 100644 --- a/report/report.md +++ b/report/report.md @@ -14,6 +14,7 @@ Sources used: wikimedia.i_love_you_california, wikimedia.winter_kiss, wikimedia. - Ours - default: 0.5443052357800366 + - st: 0.5443052357800366 - dmse: 0.5418910269181569 - bsbs: 0.5435087256676912 - mae: 0.5374008633412148 @@ -21,14 +22,15 @@ Sources used: wikimedia.i_love_you_california, wikimedia.winter_kiss, wikimedia. ### Average compression speed (inverse RTF) - Reference - - opt8lax: 259.5363983239151 - - opt8: 255.04180944185632 - - opt5: 480.08948488664015 + - opt8lax: 258.92532958937727 + - opt8: 262.24392534874113 + - opt5: 502.5549809373441 - Ours - - default: 147.73177741587608 - - dmse: 110.59936109083178 - - bsbs: 6.6560855133273495 - - mae: 32.21428513358941 + - default: 149.36375631265756 + - st: 57.107646528588624 + - dmse: 110.75909016976695 + - bsbs: 7.152928413350077 + - mae: 31.28330510072386 diff --git a/src/lib.rs b/src/lib.rs index 7b94b40..13ebeb5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -71,18 +71,18 @@ mod test { const FIXED_BLOCK_CONFIGS: [&str; 4] = [ "", - r#" + r" block_sizes = [512] - "#, - r#" + ", + r" block_sizes = [123] - "#, - r#" + ", + r" block_sizes = [1024] [subframe_coding.qlpc] use_direct_mse = true mae_optimization_steps = 2 - "#, + ", ]; #[rstest] diff --git a/src/rice.rs b/src/rice.rs index 3ed59cb..445af0b 100644 --- a/src/rice.rs +++ b/src/rice.rs @@ -15,7 +15,6 @@ //! Functions for partitioned rice coding (PRC). use std::cell::RefCell; -use std::simd::SimdOrd; use std::simd::SimdPartialEq; use std::simd::SimdPartialOrd; use std::simd::SimdUint; @@ -41,6 +40,8 @@ static MAXES: std::simd::u32x16 = std::simd::u32x16::from_array([u32::MAX; 16]); // max value of p_to_bits is chosen so that the estimates doesn't overflow // after added 2^4 = 16 times at maximum. +// The current version exploits the fact that `MAX_P_TO_BITS` is actually a bit mask, i.e. 
+// can be written as 2^N - 1 for faster processing. Do not use an arbitrary value here. Note that masking wraps sums above the limit around instead of clamping them as the previous `simd_min` did. static MAX_P_TO_BITS: u32 = (1 << 28) - 1; static MAX_P_TO_BITS_VEC: std::simd::u32x16 = std::simd::u32x16::from_array([MAX_P_TO_BITS; 16]); @@ -68,26 +69,30 @@ impl PrcBitTable { for v in errors { // Below is faster than doing: // vs = splat(*v) >> INDEX; + // or + // vs = std::simd::u32x16::from_array(std::array::from_fn( + // |i| v >> i)); // Perhaps due to smaller memory footprint by avoiding `splat`? + let v = *v; let vs = std::simd::u32x16::from_array([ - *v, - *v >> 1, - *v >> 2, - *v >> 3, - *v >> 4, - *v >> 5, - *v >> 6, - *v >> 7, - *v >> 8, - *v >> 9, - *v >> 10, - *v >> 11, - *v >> 12, - *v >> 13, - *v >> 14, - *v >> 15, + v, + v >> 1, + v >> 2, + v >> 3, + v >> 4, + v >> 5, + v >> 6, + v >> 7, + v >> 8, + v >> 9, + v >> 10, + v >> 11, + v >> 12, + v >> 13, + v >> 14, + v >> 15, ]); - p_to_bits = (vs + p_to_bits).simd_min(MAX_P_TO_BITS_VEC); + p_to_bits = (vs + p_to_bits) & MAX_P_TO_BITS_VEC; } self.p_to_bits = p_to_bits; }