diff --git a/src/denoise/mod.rs b/src/denoise/mod.rs index 309a9f005f..b53d4990ed 100644 --- a/src/denoise/mod.rs +++ b/src/denoise/mod.rs @@ -1,5 +1,6 @@ use crate::api::FrameQueue; use crate::cpu_features::CpuFeatureLevel; +use crate::util::{Aligned, AlignedBoxedSlice}; use crate::EncoderStatus; use arrayvec::ArrayVec; use cfg_if::cfg_if; @@ -296,13 +297,13 @@ where ) -> &mut (R2cFftHandler, FftHandler, FftHandler); fn do_filtering(&mut self, src: &[[Plane; 3]], dest: &mut Frame) { - let mut dftr = [0f32; BLOCK_VOLUME]; - let mut dftc = [Complex::::default(); COMPLEX_COUNT]; - let mut means = [Complex::::default(); COMPLEX_COUNT]; + let mut dftr = Aligned::new([0f32; BLOCK_VOLUME]); + let mut dftc = Aligned::new([Complex::::default(); COMPLEX_COUNT]); + let mut means = Aligned::new([Complex::::default(); COMPLEX_COUNT]); for p in 0..3 { let (pad_width, pad_height) = self.pad_dimensions(p); - let mut ebuff = vec![0f32; pad_width * pad_height]; + let mut ebuff = AlignedBoxedSlice::new(pad_width * pad_height, 0f32); let effective_height = self.effective_height(p); let src_stride = src[0][p].cfg.stride; let ebuff_stride = pad_width; @@ -324,23 +325,23 @@ where self.proc0( src_planes[z].get_unchecked(x..), self.hw().get_unchecked((BLOCK_AREA * z)..), - dftr.get_unchecked_mut((BLOCK_AREA * z)..), + dftr.data.get_unchecked_mut((BLOCK_AREA * z)..), src_stride, SB_SIZE, self.src_scale(), ); } - self.real_to_complex_3d(&dftr, &mut dftc); - self.remove_mean(&mut dftc, self.dftgc(), &mut means); + self.real_to_complex_3d(&dftr.data, &mut dftc.data); + self.remove_mean(&mut dftc.data, self.dftgc(), &mut means.data); - self.filter_coeffs(&mut dftc); + self.filter_coeffs(&mut dftc.data); - self.add_mean(&mut dftc, &means); - self.complex_to_real_3d(&dftc, &mut dftr); + self.add_mean(&mut dftc.data, &means.data); + self.complex_to_real_3d(&dftc.data, &mut dftr.data); self.proc1( - dftr.get_unchecked((TB_MIDPOINT * BLOCK_AREA)..), + dftr.data.get_unchecked((TB_MIDPOINT * BLOCK_AREA)..), self.hw().get_unchecked((TB_MIDPOINT * BLOCK_AREA)..), ebuff.get_unchecked_mut((y * ebuff_stride + x)..), SB_SIZE, @@ -405,7 +406,7 @@ where let s0 = s0.add(u * p0 + v); let s1 = s1.add(u * p0 + v); let dest = dest.add(u * p1 + v); - dest.write(dest.read() + s0.read() * s1.read()); + dest.write(s0.read().mul_add(s1.read(), dest.read())); } } } @@ -693,10 +694,10 @@ where pad_dimensions: ArrayVec<(usize, usize), 3>, effective_heights: ArrayVec, - hw: [f32; BLOCK_VOLUME], - dftgc: [Complex; COMPLEX_COUNT], + hw: Aligned<[f32; BLOCK_VOLUME]>, + dftgc: Aligned<[Complex; COMPLEX_COUNT]>, fft: (R2cFftHandler, FftHandler, FftHandler), - sigmas: [f32; CCNT2], + sigmas: Aligned<[f32; CCNT2]>, } impl DftDenoiserRust @@ -708,8 +709,8 @@ where pad_dimensions: ArrayVec<(usize, usize), 3>, effective_heights: ArrayVec, ) -> Self { - let hw = create_window(); - let mut dftgr = [0f32; BLOCK_VOLUME]; + let hw = Aligned::new(create_window()); + let mut dftgr = Aligned::new([0f32; BLOCK_VOLUME]); let fft = ( R2cFftHandler::new(SB_SIZE), @@ -719,15 +720,15 @@ where let mut wscale = 0.0f32; for k in 0..BLOCK_VOLUME { - dftgr[k] = 255.0 * hw[k]; - wscale += hw[k].powi(2); + dftgr.data[k] = 255.0 * hw.data[k]; + wscale += hw.data[k].powi(2); } let wscale = 1.0 / wscale; - let mut sigmas = [0f32; CCNT2]; - sigmas.fill(sigma / wscale); + let mut sigmas = Aligned::new([0f32; CCNT2]); + sigmas.data.fill(sigma / wscale); - let mut denoiser = DftDenoiserRust { + let mut denoiser = Self { dest_scale, src_scale, peak, @@ -736,11 +737,11 @@ where hw, fft, sigmas, - dftgc: [Complex::default(); COMPLEX_COUNT], + dftgc: Aligned::new([Complex::default(); COMPLEX_COUNT]), }; - let mut dftgc = [Complex::default(); COMPLEX_COUNT]; - denoiser.real_to_complex_3d(&dftgr, &mut dftgc); + let mut dftgc = Aligned::new([Complex::default(); COMPLEX_COUNT]); + denoiser.real_to_complex_3d(&dftgr.data, &mut dftgc.data); denoiser.dftgc = dftgc; denoiser @@ -778,17 +779,17 @@ where #[inline(always)] fn hw(&self) -> &[f32; BLOCK_VOLUME] { - &self.hw + &self.hw.data } #[inline(always)] fn dftgc(&self) -> &[Complex; COMPLEX_COUNT] { - &self.dftgc + &self.dftgc.data } #[inline(always)] fn sigmas(&self) -> &[f32; CCNT2] { - &self.sigmas + &self.sigmas.data } #[inline(always)] diff --git a/src/denoise/x86.rs b/src/denoise/x86.rs index 4a5008dfa9..1b2d73773d 100644 --- a/src/denoise/x86.rs +++ b/src/denoise/x86.rs @@ -1,5 +1,12 @@ +use crate::util::Aligned; use arrayvec::ArrayVec; use ndrustfft::{Complex, FftHandler, R2cFftHandler}; +use std::arch::x86_64::{ + _mm256_castsi256_ps, _mm256_cvtepu16_epi32, _mm256_cvtepu8_epi32, + _mm256_load_ps, _mm256_mul_ps, _mm256_set1_ps, _mm256_store_ps, + _mm_load_si128, +}; +use std::mem::size_of; use v_frame::pixel::Pixel; use super::{ @@ -19,10 +26,10 @@ where pad_dimensions: ArrayVec<(usize, usize), 3>, effective_heights: ArrayVec, - hw: [f32; BLOCK_VOLUME], - dftgc: [Complex; COMPLEX_COUNT], + hw: Aligned<[f32; BLOCK_VOLUME]>, + dftgc: Aligned<[Complex; COMPLEX_COUNT]>, fft: (R2cFftHandler, FftHandler, FftHandler), - sigmas: [f32; CCNT2], + sigmas: Aligned<[f32; CCNT2]>, } impl DftDenoiserAvx2 @@ -35,8 +42,8 @@ where pad_dimensions: ArrayVec<(usize, usize), 3>, effective_heights: ArrayVec, ) -> Self { - let hw = create_window(); - let mut dftgr = [0f32; BLOCK_VOLUME]; + let hw = Aligned::new(create_window()); + let mut dftgr = Aligned::new([0f32; BLOCK_VOLUME]); let fft = ( R2cFftHandler::new(SB_SIZE), @@ -46,15 +53,15 @@ where let mut wscale = 0.0f32; for k in 0..BLOCK_VOLUME { - dftgr[k] = 255.0 * hw[k]; - wscale += hw[k].powi(2); + dftgr.data[k] = 255.0 * hw.data[k]; + wscale += hw.data[k].powi(2); } let wscale = 1.0 / wscale; - let mut sigmas = [0f32; CCNT2]; - sigmas.fill(sigma / wscale); + let mut sigmas = Aligned::new([0f32; CCNT2]); + sigmas.data.fill(sigma / wscale); - let mut denoiser = DftDenoiserAvx2 { + let mut denoiser = Self { dest_scale, src_scale, peak, @@ -63,11 +70,11 @@ where hw, fft, sigmas, - dftgc: [Complex::default(); COMPLEX_COUNT], + dftgc: Aligned::new([Complex::default(); COMPLEX_COUNT]), }; - let mut dftgc = [Complex::default(); COMPLEX_COUNT]; - denoiser.real_to_complex_3d(&dftgr, &mut dftgc); + let mut dftgc = Aligned::new([Complex::default(); COMPLEX_COUNT]); + denoiser.real_to_complex_3d(&dftgr.data, &mut dftgc.data); denoiser.dftgc = dftgc; denoiser @@ -78,6 +85,40 @@ impl Denoiser for DftDenoiserAvx2 where T: Pixel, { + #[inline] + unsafe fn proc0( + &self, s0: &[T], s1: &[f32], dest: &mut [f32], p0: usize, p1: usize, + src_scale: f32, + ) { + let s0 = s0.as_ptr(); + let s1 = s1.as_ptr(); + let dest = dest.as_mut_ptr(); + let src_scale = _mm256_set1_ps(src_scale); + + for u in 0..p1 { + for v in (0..p1).step_by(8) { + let s0 = s0.add(u * p0 + v); + let s1 = s1.add(u * p1 + v); + let dest = dest.add(u * p1 + v); + + let v_s0 = if size_of::() == 1 { + _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_load_si128(s0.cast()))) + } else { + _mm256_castsi256_ps(_mm256_cvtepu16_epi32(_mm_load_si128(s0.cast()))) + }; + let v_s1 = _mm256_load_ps(s1); + if size_of::() == 1 { + _mm256_store_ps(dest, _mm256_mul_ps(v_s0, v_s1)); + } else { + _mm256_store_ps( + dest, + _mm256_mul_ps(_mm256_mul_ps(v_s0, src_scale), v_s1), + ); + } + } + } + } + #[inline(always)] fn pad_dimensions(&self, plane: usize) -> (usize, usize) { self.pad_dimensions[plane] @@ -105,17 +146,17 @@ where #[inline(always)] fn hw(&self) -> &[f32; BLOCK_VOLUME] { - &self.hw + &self.hw.data } #[inline(always)] fn dftgc(&self) -> &[Complex; COMPLEX_COUNT] { - &self.dftgc + &self.dftgc.data } #[inline(always)] fn sigmas(&self) -> &[f32; CCNT2] { - &self.sigmas + &self.sigmas.data } #[inline(always)]