From 74c1bf7aeae6a0920c11b1bbbbb8f79b7b0d1ee2 Mon Sep 17 00:00:00 2001 From: jatin Date: Fri, 17 May 2024 16:43:00 -0700 Subject: [PATCH] Fixing NEON up2 implementation --- .../poly_octave/PolyOctaveV2FilterBankImpl.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/processors/other/poly_octave/PolyOctaveV2FilterBankImpl.cpp b/src/processors/other/poly_octave/PolyOctaveV2FilterBankImpl.cpp index 8fb4f6b0..05c4715d 100644 --- a/src/processors/other/poly_octave/PolyOctaveV2FilterBankImpl.cpp +++ b/src/processors/other/poly_octave/PolyOctaveV2FilterBankImpl.cpp @@ -307,11 +307,11 @@ void process (ComplexERBFilterBank& filter_bank, { // buffer_out size is padded by 4x static constexpr auto eps = std::numeric_limits::epsilon(); - const auto norm_gain = 1.0f / std::pow (static_cast (N), static_cast (num_octaves_up)); + static constexpr auto norm_gain = 2.0f / static_cast (N); #if defined(__ARM_NEON__) - auto* buffer_out_simd = reinterpret_cast (juce::snapPointerToAlignment (buffer_out, 16)); - std::fill (buffer_out_simd, buffer_out_simd + num_samples, T {}); + auto* buffer_out_simd = reinterpret_cast (snapPointerToAlignment (buffer_out, 16)); + std::fill (buffer_out_simd, buffer_out_simd + num_samples, float32x4_t {}); const auto eps_neon = vld1q_dup_f32 (&eps); const auto norm_gain_neon = vld1q_dup_f32 (&norm_gain); @@ -353,6 +353,11 @@ void process (ComplexERBFilterBank& filter_bank, } else if constexpr (num_octaves_up == 2) { + const auto greater_than_eps = vcgtq_f32 (x_abs_sq, eps_neon); + const auto x_abs_r = vbslq_f32 (greater_than_eps, vrecpeq_f32 (x_abs_sq), zero_neon); + const auto x_im_sq_x3 = vaddq_f32 (x_im_sq, vaddq_f32 (x_im_sq, x_im_sq)); + buffer_out_simd[n] = vfmaq_f32 (buffer_out_simd[n], vsubq_f32 (x_re_sq, x_im_sq_x3), vmulq_f32 (x_re, x_abs_r)); + // @TODO // auto x_abs_sq_r = xsimd::select (x_abs_sq > eps, xsimd::reciprocal (x_abs_sq), {}); // buffer_out_simd[n] += x_re * (x_re_sq - (S) 3 * x_im_sq) * x_abs_sq_r; @@ -519,7 +524,7 @@ void process_avx ([[maybe_unused]] ComplexERBFilterBank& filter_bank, { // buffer_out size is padded by 8x [[maybe_unused]] static constexpr auto eps = std::numeric_limits::epsilon(); - [[maybe_unused]] const auto norm_gain = 1.0f / std::pow (static_cast (N), static_cast (num_octaves_up)); + [[maybe_unused]] static constexpr auto norm_gain = 2.0f / static_cast (N); #if defined(__AVX__)