From cb19dc500564760b35deca386964ed3f10846960 Mon Sep 17 00:00:00 2001 From: jatinchowdhury18 Date: Mon, 29 Apr 2024 20:21:26 -0700 Subject: [PATCH 1/2] Poly octave updates (#357) * Setting up v1 mode * Poly octave working with new filter bank * Re-tuning filterbank params * Update CHANGELOG * Apply clang-format --------- Co-authored-by: github-actions[bot] --- CHANGELOG.md | 3 + CMakeLists.txt | 4 +- res/presets/OctaVerb.chowpreset | 5 +- .../other/poly_octave/PolyOctave.cpp | 188 +++++++++++++----- src/processors/other/poly_octave/PolyOctave.h | 41 ++-- .../poly_octave/PolyOctaveFilterBandHelpers.h | 185 ++++++++++++++++- .../poly_octave/PolyOctaveFilterBankTypes.h | 38 ++++ 7 files changed, 389 insertions(+), 75 deletions(-) create mode 100644 src/processors/other/poly_octave/PolyOctaveFilterBankTypes.h diff --git a/CHANGELOG.md b/CHANGELOG.md index 1260c86f..6a4b2c79 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,9 @@ All notable changes to this project will be documented in this file. +## [UNRELEASED] +- Updated "Poly Octave" DSP to sound cleaner and use less CPU + ## [1.3.0] - 2024-03-15 - Added "Laser Trem" module. - Added "Poly Octave" module. diff --git a/CMakeLists.txt b/CMakeLists.txt index 1fea0aa0..4ce5c948 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,9 +4,9 @@ if(WIN32) set(CMAKE_SYSTEM_VERSION 7.1 CACHE STRING INTERNAL FORCE) # Windows SDK for Windows 7 and up endif() if(APPLE AND (CMAKE_SYSTEM_NAME STREQUAL "iOS")) - project(BYOD VERSION 2.1.0) + project(BYOD VERSION 2.1.1) else() - project(BYOD VERSION 1.3.0) + project(BYOD VERSION 1.3.1) endif() set(CMAKE_CXX_STANDARD 20) diff --git a/res/presets/OctaVerb.chowpreset b/res/presets/OctaVerb.chowpreset index 0c3bafb5..4fd96358 100644 --- a/res/presets/OctaVerb.chowpreset +++ b/res/presets/OctaVerb.chowpreset @@ -1,13 +1,14 @@ - - + + + diff --git a/src/processors/other/poly_octave/PolyOctave.cpp b/src/processors/other/poly_octave/PolyOctave.cpp index 99c9ab11..3057c175 100644 --- a/src/processors/other/poly_octave/PolyOctave.cpp +++ b/src/processors/other/poly_octave/PolyOctave.cpp @@ -8,6 +8,7 @@ const String dryTag = "dry"; const String upOctaveTag = "up_octave"; const String up2OctaveTag = "up2_octave"; const String downOctaveTag = "down_octave"; +const String v1Tag = "v1_mode"; } // namespace PolyOctaveTags PolyOctave::PolyOctave (UndoManager* um) @@ -34,6 +35,9 @@ PolyOctave::PolyOctave (UndoManager* um) setupGainParam (PolyOctaveTags::upOctaveTag, upOctaveGain); setupGainParam (PolyOctaveTags::up2OctaveTag, up2OctaveGain); + ParameterHelpers::loadParameterPointer (v1_mode, vts, PolyOctaveTags::v1Tag); + addPopupMenuParameter (PolyOctaveTags::v1Tag); + uiOptions.backgroundColour = Colour { 0xff9fa09d }; uiOptions.powerColour = Colour { 0xffe70510 }; uiOptions.info.description = "A \"polyphonic octave generator\" effect."; @@ -49,6 +53,7 @@ ParamLayout PolyOctave::createParameterLayout() createPercentParameter (params, PolyOctaveTags::dryTag, "+0 Oct", 0.5f); createPercentParameter (params, PolyOctaveTags::upOctaveTag, "+1 Oct", 0.0f); createPercentParameter (params, PolyOctaveTags::up2OctaveTag, "+2 Oct", 0.0f); + emplace_param (params, PolyOctaveTags::v1Tag, "V1 Mode", false); // @TODO: version streaming return { params.begin(), params.end() }; } @@ -61,14 +66,22 @@ void PolyOctave::prepare (double sampleRate, int samplesPerBlock) downOctaveGain.prepare (sampleRate, samplesPerBlock); doubleBuffer.setMaxSize (2, samplesPerBlock); - up2OctaveBuffer.setMaxSize (2, static_cast (ComplexERBFilterBank::float_2::size) * samplesPerBlock); // allocate extra space for SIMD - upOctaveBuffer.setMaxSize (2, static_cast (ComplexERBFilterBank::float_2::size) * samplesPerBlock); // allocate extra space for SIMD - downOctaveBuffer.setMaxSize (2, samplesPerBlock); - - FilterBankHelpers::designFilterBank (octaveUpFilterBank, 2.0, 5.0, 6.0, sampleRate); - FilterBankHelpers::designFilterBank (octaveUp2FilterBank, 3.0, 7.0, 4.0, sampleRate); + up2OctaveBuffer_double.setMaxSize (2, 2 * samplesPerBlock); // allocate extra space for SIMD + upOctaveBuffer_double.setMaxSize (2, 2 * samplesPerBlock); // allocate extra space for SIMD + downOctaveBuffer_double.setMaxSize (2, samplesPerBlock); + poly_octave_v2::design_filter_bank (octaveUpFilterBank, 2.0, 5.0, 4.5, sampleRate); + poly_octave_v2::design_filter_bank (octaveUp2FilterBank, 3.0, 6.0, 2.75, sampleRate); for (auto& shifter : downOctavePitchShifters) + { + shifter.prepare (sampleRate); + shifter.set_pitch_factor (0.5f); + } + + poly_octave_v1::designFilterBank (octaveUpFilterBank_v1, 2.0, 5.0, 6.0, sampleRate); + poly_octave_v1::designFilterBank (octaveUp2FilterBank_v1, 3.0, 7.0, 4.0, sampleRate); + + for (auto& shifter : downOctavePitchShifters_v1) { shifter.prepare (sampleRate); shifter.set_pitch_factor (0.5); @@ -81,51 +94,117 @@ void PolyOctave::prepare (double sampleRate, int samplesPerBlock) } mixOutBuffer.setSize (2, samplesPerBlock); - up1OutBuffer.setSize (2, samplesPerBlock); - up2OutBuffer.setSize (2, samplesPerBlock); - down1OutBuffer.setSize (2, samplesPerBlock); + up1OutBuffer.setSize (2, 4 * samplesPerBlock + 8); // padding for SIMD + up2OutBuffer.setSize (2, 4 * samplesPerBlock + 8); // padding for SIMD + down1OutBuffer.setSize (2, 4 * samplesPerBlock + 8); // padding for SIMD } void PolyOctave::processAudio (AudioBuffer& buffer) +{ + if (v1_mode->get()) + { + processAudioV1 (buffer); + return; + } + + const auto numChannels = buffer.getNumChannels(); + const auto numSamples = buffer.getNumSamples(); + + mixOutBuffer.setSize (numChannels, numSamples, false, false, true); + up1OutBuffer.setSize (numChannels, numSamples, false, false, true); + up2OutBuffer.setSize (numChannels, numSamples, false, false, true); + down1OutBuffer.setSize (numChannels, numSamples, false, false, true); + + // "down" processing + for (auto [ch, data_in, data_out] : chowdsp::buffer_iters::zip_channels (std::as_const (buffer), down1OutBuffer)) + { + for (auto [x, y] : chowdsp::zip (data_in, data_out)) + y = downOctavePitchShifters[(size_t) ch].process_sample (x); + } + downOctaveGain.process (numSamples); + chowdsp::BufferMath::applyGainSmoothedBuffer (down1OutBuffer, downOctaveGain); + + // "up1" processing + for (auto [ch, data_in, data_out] : chowdsp::buffer_iters::zip_channels (std::as_const (buffer), up1OutBuffer)) + { + poly_octave_v2::process<1> (octaveUpFilterBank[ch], + data_in.data(), + data_out.data(), + numSamples); + } + upOctaveGain.process (numSamples); + chowdsp::BufferMath::applyGainSmoothedBuffer (up1OutBuffer, upOctaveGain); + + // "up2" processing + for (auto [ch, data_in, data_out] : chowdsp::buffer_iters::zip_channels (std::as_const (buffer), up2OutBuffer)) + { + poly_octave_v2::process<2> (octaveUp2FilterBank[ch], + data_in.data(), + data_out.data(), + numSamples); + } + up2OctaveGain.process (numSamples); + chowdsp::BufferMath::applyGainSmoothedBuffer (up2OutBuffer, up2OctaveGain); + + chowdsp::BufferMath::copyBufferData (buffer, mixOutBuffer); + dryGain.process (numSamples); + chowdsp::BufferMath::applyGainSmoothedBuffer (mixOutBuffer, dryGain); + + chowdsp::BufferMath::addBufferData (up1OutBuffer, mixOutBuffer); + chowdsp::BufferMath::addBufferData (up2OutBuffer, mixOutBuffer); + chowdsp::BufferMath::addBufferData (down1OutBuffer, mixOutBuffer); + + dcBlocker[MixOutput].processBlock (mixOutBuffer); + dcBlocker[Up1Output].processBlock (up1OutBuffer); + dcBlocker[Up2Output].processBlock (up2OutBuffer); + dcBlocker[Down1Output].processBlock (down1OutBuffer); + + outputBuffers.getReference (MixOutput) = &mixOutBuffer; + outputBuffers.getReference (Up1Output) = &up1OutBuffer; + outputBuffers.getReference (Up2Output) = &up2OutBuffer; + outputBuffers.getReference (Down1Output) = &down1OutBuffer; +} + +void PolyOctave::processAudioV1 (AudioBuffer& buffer) { const auto numChannels = buffer.getNumChannels(); const auto numSamples = buffer.getNumSamples(); doubleBuffer.setCurrentSize (numChannels, numSamples); - upOctaveBuffer.setCurrentSize (numChannels, numSamples); - up2OctaveBuffer.setCurrentSize (numChannels, numSamples); - downOctaveBuffer.setCurrentSize (numChannels, numSamples); + upOctaveBuffer_double.setCurrentSize (numChannels, numSamples); + up2OctaveBuffer_double.setCurrentSize (numChannels, numSamples); + downOctaveBuffer_double.setCurrentSize (numChannels, numSamples); chowdsp::BufferMath::copyBufferData (buffer, doubleBuffer); // "down" processing - for (auto [ch, data_in, data_out] : chowdsp::buffer_iters::zip_channels (std::as_const (doubleBuffer), downOctaveBuffer)) + for (auto [ch, data_in, data_out] : chowdsp::buffer_iters::zip_channels (std::as_const (doubleBuffer), downOctaveBuffer_double)) { for (auto [x, y] : chowdsp::zip (data_in, data_out)) - y = downOctavePitchShifters[(size_t) ch].process_sample (x); + y = downOctavePitchShifters_v1[(size_t) ch].process_sample (x); } // "up" processing - using float_2 = ComplexERBFilterBank::float_2; for (int ch = 0; ch < numChannels; ++ch) { + using poly_octave_v1::float_2; auto* dryData = doubleBuffer.getReadPointer (ch); - auto* upData = upOctaveBuffer.getWritePointer (ch); - auto* upDataSIMD = reinterpret_cast (upOctaveBuffer.getWritePointer (ch)); + auto* upData = upOctaveBuffer_double.getWritePointer (ch); + auto* upDataSIMD = reinterpret_cast (upOctaveBuffer_double.getWritePointer (ch)); jassert (juce::snapPointerToAlignment (upDataSIMD, xsimd::default_arch::alignment()) == upDataSIMD); std::fill (upDataSIMD, upDataSIMD + numSamples, float_2 {}); - auto* up2Data = up2OctaveBuffer.getWritePointer (ch); - auto* up2DataSIMD = reinterpret_cast (up2OctaveBuffer.getWritePointer (ch)); + auto* up2Data = up2OctaveBuffer_double.getWritePointer (ch); + auto* up2DataSIMD = reinterpret_cast (up2OctaveBuffer_double.getWritePointer (ch)); jassert (juce::snapPointerToAlignment (up2DataSIMD, xsimd::default_arch::alignment()) == up2DataSIMD); std::fill (up2DataSIMD, up2DataSIMD + numSamples, float_2 {}); - auto& upFilterBank = octaveUpFilterBank[static_cast (ch)]; - auto& up2FilterBank = octaveUp2FilterBank[static_cast (ch)]; + auto& upFilterBank = octaveUpFilterBank_v1[static_cast (ch)]; + auto& up2FilterBank = octaveUp2FilterBank_v1[static_cast (ch)]; static constexpr auto eps = std::numeric_limits::epsilon(); - for (size_t k = 0; k < ComplexERBFilterBank::numFilterBands; k += float_2::size) + for (size_t k = 0; k < poly_octave_v1::ComplexERBFilterBank::numFilterBands; k += float_2::size) { const auto filter_idx = k / float_2::size; auto& realFilter = upFilterBank.erbFilterReal[filter_idx]; @@ -134,8 +213,8 @@ void PolyOctave::processAudio (AudioBuffer& buffer) chowdsp::ScopedValue z_im { imagFilter.z[0] }; for (int n = 0; n < numSamples; ++n) { - auto x_re = FilterBankHelpers::processSample (realFilter, z_re.get(), dryData[n]); - auto x_im = FilterBankHelpers::processSample (imagFilter, z_im.get(), dryData[n]); + auto x_re = poly_octave_v1::processSample (realFilter, z_re.get(), dryData[n]); + auto x_im = poly_octave_v1::processSample (imagFilter, z_im.get(), dryData[n]); auto x_re_sq = x_re * x_re; auto x_im_sq = x_im * x_im; @@ -153,7 +232,7 @@ void PolyOctave::processAudio (AudioBuffer& buffer) for (int n = 0; n < numSamples; ++n) upData[n] = xsimd::reduce_add (upDataSIMD[n]); - for (size_t k = 0; k < ComplexERBFilterBank::numFilterBands; k += float_2::size) + for (size_t k = 0; k < poly_octave_v1::ComplexERBFilterBank::numFilterBands; k += float_2::size) { const auto filter_idx = k / float_2::size; auto& realFilter = up2FilterBank.erbFilterReal[filter_idx]; @@ -163,8 +242,8 @@ void PolyOctave::processAudio (AudioBuffer& buffer) for (int n = 0; n < numSamples; ++n) { - auto x_re = FilterBankHelpers::processSample (realFilter, z_re.get(), dryData[n]); - auto x_im = FilterBankHelpers::processSample (imagFilter, z_im.get(), dryData[n]); + auto x_re = poly_octave_v1::processSample (realFilter, z_re.get(), dryData[n]); + auto x_im = poly_octave_v1::processSample (imagFilter, z_im.get(), dryData[n]); auto x_re_sq = x_re * x_re; auto x_im_sq = x_im * x_im; @@ -178,23 +257,9 @@ void PolyOctave::processAudio (AudioBuffer& buffer) up2Data[n] = xsimd::reduce_add (up2DataSIMD[n]); } - chowdsp::BufferMath::applyGain (upOctaveBuffer, 2.0 / static_cast (ComplexERBFilterBank::numFilterBands)); - upOctaveGain.process (numSamples); - chowdsp::BufferMath::applyGainSmoothedBuffer (upOctaveBuffer, upOctaveGain); - - chowdsp::BufferMath::applyGain (up2OctaveBuffer, 2.0 / static_cast (ComplexERBFilterBank::numFilterBands)); - up2OctaveGain.process (numSamples); - chowdsp::BufferMath::applyGainSmoothedBuffer (up2OctaveBuffer, up2OctaveGain); - - chowdsp::BufferMath::applyGain (downOctaveBuffer, juce::Decibels::decibelsToGain (3.0f)); - downOctaveGain.process (numSamples); - chowdsp::BufferMath::applyGainSmoothedBuffer (downOctaveBuffer, downOctaveGain); - - dryGain.process (numSamples); - chowdsp::BufferMath::applyGainSmoothedBuffer (doubleBuffer, dryGain); - chowdsp::BufferMath::addBufferData (up2OctaveBuffer, doubleBuffer); - chowdsp::BufferMath::addBufferData (upOctaveBuffer, doubleBuffer); - chowdsp::BufferMath::addBufferData (downOctaveBuffer, doubleBuffer); + chowdsp::BufferMath::applyGain (upOctaveBuffer_double, 2.0 / static_cast (poly_octave_v1::ComplexERBFilterBank::numFilterBands)); + chowdsp::BufferMath::applyGain (up2OctaveBuffer_double, 2.0 / static_cast (poly_octave_v1::ComplexERBFilterBank::numFilterBands)); + chowdsp::BufferMath::applyGain (downOctaveBuffer_double, juce::Decibels::decibelsToGain (3.0f)); mixOutBuffer.setSize (numChannels, numSamples, false, false, true); up1OutBuffer.setSize (numChannels, numSamples, false, false, true); @@ -202,9 +267,22 @@ void PolyOctave::processAudio (AudioBuffer& buffer) down1OutBuffer.setSize (numChannels, numSamples, false, false, true); chowdsp::BufferMath::copyBufferData (doubleBuffer, mixOutBuffer); - chowdsp::BufferMath::copyBufferData (upOctaveBuffer, up1OutBuffer); - chowdsp::BufferMath::copyBufferData (up2OctaveBuffer, up2OutBuffer); - chowdsp::BufferMath::copyBufferData (downOctaveBuffer, down1OutBuffer); + chowdsp::BufferMath::copyBufferData (upOctaveBuffer_double, up1OutBuffer); + chowdsp::BufferMath::copyBufferData (up2OctaveBuffer_double, up2OutBuffer); + chowdsp::BufferMath::copyBufferData (downOctaveBuffer_double, down1OutBuffer); + + upOctaveGain.process (numSamples); + chowdsp::BufferMath::applyGainSmoothedBuffer (up1OutBuffer, upOctaveGain); + up2OctaveGain.process (numSamples); + chowdsp::BufferMath::applyGainSmoothedBuffer (up2OutBuffer, up2OctaveGain); + downOctaveGain.process (numSamples); + chowdsp::BufferMath::applyGainSmoothedBuffer (down1OutBuffer, downOctaveGain); + + dryGain.process (numSamples); + chowdsp::BufferMath::applyGainSmoothedBuffer (mixOutBuffer, dryGain); + chowdsp::BufferMath::addBufferData (up1OutBuffer, mixOutBuffer); + chowdsp::BufferMath::addBufferData (up2OutBuffer, mixOutBuffer); + chowdsp::BufferMath::addBufferData (down1OutBuffer, mixOutBuffer); dcBlocker[MixOutput].processBlock (mixOutBuffer); dcBlocker[Up1Output].processBlock (up1OutBuffer); @@ -256,3 +334,19 @@ String PolyOctave::getTooltipForPort (int portIndex, bool isInput) return BaseProcessor::getTooltipForPort (portIndex, isInput); } + +void PolyOctave::fromXML (XmlElement* xml, const chowdsp::Version& version, bool loadPosition) +{ + BaseProcessor::fromXML (xml, version, loadPosition); + + using namespace std::string_view_literals; +#if JUCE_IOS + if (version <= chowdsp::Version { "2.1.0"sv }) +#else + if (version <= chowdsp::Version { "1.3.0"sv }) +#endif + { + // For versions before 1.3.0, default to v1 mode + v1_mode->setValueNotifyingHost (1.0f); + } +} diff --git a/src/processors/other/poly_octave/PolyOctave.h b/src/processors/other/poly_octave/PolyOctave.h index 08ebf96c..2ef85d2c 100644 --- a/src/processors/other/poly_octave/PolyOctave.h +++ b/src/processors/other/poly_octave/PolyOctave.h @@ -1,6 +1,7 @@ #pragma once #include "DelayPitchShifter.h" +#include "PolyOctaveFilterBankTypes.h" #include "processors/BaseProcessor.h" class PolyOctave : public BaseProcessor @@ -15,14 +16,9 @@ class PolyOctave : public BaseProcessor void processAudio (AudioBuffer& buffer) override; void processAudioBypassed (AudioBuffer& buffer) override; - String getTooltipForPort (int portIndex, bool isInput) override; + void fromXML (XmlElement* xml, const chowdsp::Version& version, bool loadPosition) override; - struct ComplexERBFilterBank - { - static constexpr size_t numFilterBands = 44; - using float_2 = xsimd::batch; - std::array, numFilterBands / float_2::size> erbFilterReal, erbFilterImag; - }; + String getTooltipForPort (int portIndex, bool isInput) override; enum OutputPort { @@ -35,19 +31,28 @@ class PolyOctave : public BaseProcessor static constexpr auto numOutputs = (int) magic_enum::enum_count(); private: - chowdsp::SmoothedBufferValue dryGain {}; - chowdsp::SmoothedBufferValue upOctaveGain {}; - chowdsp::SmoothedBufferValue up2OctaveGain {}; - chowdsp::SmoothedBufferValue downOctaveGain {}; + void processAudioV1 (AudioBuffer& buffer); - chowdsp::Buffer doubleBuffer; - chowdsp::Buffer upOctaveBuffer; - chowdsp::Buffer up2OctaveBuffer; - chowdsp::Buffer downOctaveBuffer; + chowdsp::BoolParameter* v1_mode = nullptr; + + chowdsp::SmoothedBufferValue dryGain {}; + chowdsp::SmoothedBufferValue upOctaveGain {}; + chowdsp::SmoothedBufferValue up2OctaveGain {}; + chowdsp::SmoothedBufferValue downOctaveGain {}; - std::array octaveUpFilterBank; - std::array octaveUp2FilterBank; - std::array, 2> downOctavePitchShifters; + // V2 processing stuff... + std::array, 2> octaveUpFilterBank; + std::array, 2> octaveUp2FilterBank; + std::array, 2> downOctavePitchShifters; + + // V1 processing stuff... + chowdsp::Buffer doubleBuffer; + chowdsp::Buffer upOctaveBuffer_double; + chowdsp::Buffer up2OctaveBuffer_double; + chowdsp::Buffer downOctaveBuffer_double; + std::array octaveUpFilterBank_v1; + std::array octaveUp2FilterBank_v1; + std::array, 2> downOctavePitchShifters_v1; std::array, (size_t) numOutputs> dcBlocker; diff --git a/src/processors/other/poly_octave/PolyOctaveFilterBandHelpers.h b/src/processors/other/poly_octave/PolyOctaveFilterBandHelpers.h index 02d02a36..e1d8553b 100644 --- a/src/processors/other/poly_octave/PolyOctaveFilterBandHelpers.h +++ b/src/processors/other/poly_octave/PolyOctaveFilterBandHelpers.h @@ -1,13 +1,12 @@ #pragma once -#include "PolyOctave.h" +#include "PolyOctaveFilterBankTypes.h" -namespace FilterBankHelpers +namespace poly_octave_v1 { // Reference for filter-bank design and octave shifting: // https://aaltodoc.aalto.fi/server/api/core/bitstreams/ff9e52cf-fd79-45eb-b695-93038244ec0e/content -using float_2 = PolyOctave::ComplexERBFilterBank::float_2; static_assert (float_2::size == 2); static_assert (xsimd::batch::size == 2); static constexpr auto vec_size = float_2::size; @@ -89,13 +88,13 @@ static auto designERBFilter (size_t erb_index, return center_target_freq; } -static void designFilterBank (std::array& filterBank, +static void designFilterBank (std::array& filterBank, double gamma, double erb_start, double q_ERB, double sampleRate) { - for (size_t kiter = 0; kiter < PolyOctave::ComplexERBFilterBank::numFilterBands; kiter += vec_size) + for (size_t kiter = 0; kiter < ComplexERBFilterBank::numFilterBands; kiter += vec_size) { alignas (16) std::array b_coeffs_cplx_real_simd[5]; alignas (16) std::array b_coeffs_cplx_imag_simd[5]; @@ -160,4 +159,178 @@ static void designFilterBank (std::array& f } } } -} // namespace FilterBankHelpers +} // namespace poly_octave_v1 + +namespace poly_octave_v2 +{ +// Reference for filter-bank design and octave shifting: +// https://aaltodoc.aalto.fi/server/api/core/bitstreams/ff9e52cf-fd79-45eb-b695-93038244ec0e/content + +inline std::pair process_sample (const T& x, + const std::array& b_shared_coeffs, + const std::array& b_real_coeffs, + const std::array& b_imag_coeffs, + const std::array& a_coeffs, + std::array& z_shared, + std::array& z_real, + std::array& z_imag) +{ + const auto y_shared = z_shared[1] + x * b_shared_coeffs[0]; + z_shared[1] = z_shared[2] + x * b_shared_coeffs[1] - y_shared * a_coeffs[1]; + z_shared[2] = x * b_shared_coeffs[2] - y_shared * a_coeffs[2]; + + const auto y_real = z_real[1] + y_shared; // for the real filter, we know that b[0] == 1 + z_real[1] = z_real[2] + y_shared * b_real_coeffs[1] - y_real * a_coeffs[1]; + z_real[2] = y_shared * b_real_coeffs[2] - y_real * a_coeffs[2]; + + const auto y_imag = z_imag[1]; // for the imaginary filter, we know that b[0] == 0 + z_imag[1] = z_imag[2] + y_shared * b_imag_coeffs[1] - y_imag * a_coeffs[1]; + z_imag[2] = y_shared * b_imag_coeffs[2] - y_imag * a_coeffs[2]; + + return { y_real, y_imag }; +} + +static constexpr auto q_c = 4.0; +static auto design_erb_filter (size_t erb_index, + double gamma, + double erb_start, + double q_ERB, + double sample_rate, + double (&b_coeffs_cplx_shared)[3], + double (&b_coeffs_cplx_real)[3], + double (&b_coeffs_cplx_imag)[3], + double (&a_coeffs_cplx)[3]) +{ + const auto q_PS = gamma; + + const auto z = erb_start + static_cast (erb_index) * (q_c / q_ERB); + const auto center_target_freq = 228.7 * (std::pow (10.0, z / 21.3) - 1.0); + const auto filter_q = (1.0 / (q_PS * q_ERB)) * (24.7 + 0.108 * center_target_freq); + + double a_coeffs_proto[3]; + chowdsp::CoefficientCalculators::calcSecondOrderBPF (b_coeffs_cplx_shared, + a_coeffs_proto, + center_target_freq / gamma, + filter_q * 0.5, + sample_rate); + + auto pole = (std::sqrt (std::pow (std::complex { a_coeffs_proto[1] }, 2.0) - 4.0 * a_coeffs_proto[2]) - a_coeffs_proto[1]) / 2.0; + if (std::imag (pole) < 0.0) + pole = std::conj (pole); + const auto pr = std::real (pole); + const auto pi = std::imag (pole); + + // a[] = 1 - 2 pr z + (pi^2 + pr^2) z^2 + a_coeffs_cplx[0] = 1.0; + a_coeffs_cplx[1] = -2.0 * pr; + a_coeffs_cplx[2] = pi * pi + pr * pr; + + // b_real[] = 1 - 2 pr z + (-pi^2 + pr^2) z^2 + b_coeffs_cplx_real[0] = 1.0; + b_coeffs_cplx_real[1] = -2.0 * pr; + b_coeffs_cplx_real[2] = -pi * pi + pr * pr; + + // b_imag[] = 2 pi z - 2 pi pr z^2 + b_coeffs_cplx_imag[0] = 0.0; + b_coeffs_cplx_imag[1] = 2.0 * pi; + b_coeffs_cplx_imag[2] = -2.0 * pi * pr; + + return center_target_freq; +} + +template +static void design_filter_bank (std::array, 2>& filter_bank, + double gamma, + double erb_start, + double q_ERB, + double sample_rate) +{ + for (size_t kiter = 0; kiter < ComplexERBFilterBank::num_filter_bands; ++kiter) + { + double b_coeffs_cplx_shared_double[3] {}; + double b_coeffs_cplx_real_double[3] {}; + double b_coeffs_cplx_imag_double[3] {}; + double a_coeffs_cplx_double[3] {}; + design_erb_filter (kiter, + gamma, + erb_start, + q_ERB, + sample_rate, + b_coeffs_cplx_shared_double, + b_coeffs_cplx_real_double, + b_coeffs_cplx_imag_double, + a_coeffs_cplx_double); + + for (auto& bank : filter_bank) + { + const auto k_div = kiter / T::size; + const auto k_off = kiter - (k_div * T::size); + + bank.erb_filter_complex[k_div].z_shared = {}; + bank.erb_filter_complex[k_div].z_real = {}; + bank.erb_filter_complex[k_div].z_imag = {}; + + for (size_t i = 0; i < 3; ++i) + { + reinterpret_cast (&bank.erb_filter_complex[k_div].b_shared_coeffs[i])[k_off] = static_cast (b_coeffs_cplx_shared_double[i]); + reinterpret_cast (&bank.erb_filter_complex[k_div].b_real_coeffs[i])[k_off] = static_cast (b_coeffs_cplx_real_double[i]); + reinterpret_cast (&bank.erb_filter_complex[k_div].b_imag_coeffs[i])[k_off] = static_cast (b_coeffs_cplx_imag_double[i]); + reinterpret_cast (&bank.erb_filter_complex[k_div].a_coeffs[i])[k_off] = static_cast (a_coeffs_cplx_double[i]); + } + } + } +} + +template +static void process (ComplexERBFilterBank& filter_bank, + const float* buffer_in, + float* buffer_out, + int num_samples) noexcept +{ + // buffer_out size is padded by 4x + static constexpr auto eps = std::numeric_limits::epsilon(); + auto* buffer_out_simd = juce::snapPointerToAlignment (reinterpret_cast (buffer_out), xsimd::default_arch::alignment()); + std::fill (buffer_out_simd, buffer_out_simd + num_samples, T {}); + for (size_t k = 0; k < N; k += T::size) + { + const auto filter_idx = k / T::size; + auto& cplx_filter = filter_bank.erb_filter_complex[filter_idx]; + chowdsp::ScopedValue z_shared { cplx_filter.z_shared }; + chowdsp::ScopedValue z_re { cplx_filter.z_real }; + chowdsp::ScopedValue z_im { cplx_filter.z_imag }; + for (int n = 0; n < num_samples; ++n) + { + const auto x_in = static_cast (buffer_in[n]); + const auto [x_re, x_im] = process_sample (x_in, + cplx_filter.b_shared_coeffs, + cplx_filter.b_real_coeffs, + cplx_filter.b_imag_coeffs, + cplx_filter.a_coeffs, + z_shared.get(), + z_re.get(), + z_im.get()); + + auto x_re_sq = x_re * x_re; + auto x_im_sq = x_im * x_im; + auto x_abs_sq = x_re_sq + x_im_sq; + + if constexpr (num_octaves_up == 1) + { + auto x_abs_r = xsimd::select (x_abs_sq > eps, xsimd::rsqrt (x_abs_sq), {}); + buffer_out_simd[n] += (x_re_sq - x_im_sq) * x_abs_r; + } + else if constexpr (num_octaves_up == 2) + { + auto x_abs_sq_r = xsimd::select (x_abs_sq > eps, xsimd::reciprocal (x_abs_sq), {}); + buffer_out_simd[n] += x_re * (x_re_sq - (S) 3 * x_im_sq) * x_abs_sq_r; + } + } + } + + for (int n = 0; n < num_samples; ++n) + buffer_out[n] = xsimd::reduce_add (buffer_out_simd[n]); + + static constexpr auto norm_gain = 2.0f / static_cast (N); + juce::FloatVectorOperations::multiply (buffer_out, norm_gain, num_samples); +} +} // namespace poly_octave_v2 diff --git a/src/processors/other/poly_octave/PolyOctaveFilterBankTypes.h b/src/processors/other/poly_octave/PolyOctaveFilterBankTypes.h new file mode 100644 index 00000000..fd195caf --- /dev/null +++ b/src/processors/other/poly_octave/PolyOctaveFilterBankTypes.h @@ -0,0 +1,38 @@ +#pragma once + +#include + +namespace poly_octave_v1 +{ +using float_2 = xsimd::batch; +struct ComplexERBFilterBank +{ + static constexpr size_t numFilterBands = 44; + std::array, numFilterBands / float_2::size> erbFilterReal, erbFilterImag; +}; +} // namespace poly_octave_v1 + +namespace poly_octave_v2 +{ +using T = xsimd::batch; +using S = float; +static constexpr auto N1 = 32; +template +struct ComplexERBFilterBank +{ + static constexpr size_t num_filter_bands = N; + + struct ComplexFilter + { + std::array b_shared_coeffs {}; + std::array b_real_coeffs {}; + std::array b_imag_coeffs {}; + std::array a_coeffs {}; + std::array z_shared {}; + std::array z_real {}; + std::array z_imag {}; + }; + + std::array erb_filter_complex; +}; +} // namespace poly_octave_v2 From 6ef1720c830795b2fcc45b0ebbf0c217e18de5f3 Mon Sep 17 00:00:00 2001 From: jatinchowdhury18 Date: Fri, 17 May 2024 17:15:11 -0700 Subject: [PATCH 2/2] Poly octave SIMD optimizations (#358) * Setting up Poly Octave filterbank as a struct of arrays * SSE and AVX implementations * Fixing NEON up2 implementation * Fixing up2 processing on Intel * Apply clang-format * Missing include --------- Co-authored-by: github-actions[bot] --- modules/cmake/RuntimeSIMDLib.cmake | 9 +- src/CMakeLists.txt | 22 +- .../other/poly_octave/PolyOctave.cpp | 61 +- src/processors/other/poly_octave/PolyOctave.h | 7 +- .../poly_octave/PolyOctaveFilterBankTypes.h | 38 -- .../poly_octave/PolyOctaveV1FilterBank.h | 13 + ...Helpers.h => PolyOctaveV1FilterBankImpl.h} | 176 +---- .../poly_octave/PolyOctaveV2FilterBank.h | 36 ++ .../PolyOctaveV2FilterBankImpl.cpp | 609 ++++++++++++++++++ .../poly_octave/PolyOctaveV2FilterBankImpl.h | 28 + 10 files changed, 761 insertions(+), 238 deletions(-) delete mode 100644 src/processors/other/poly_octave/PolyOctaveFilterBankTypes.h create mode 100644 src/processors/other/poly_octave/PolyOctaveV1FilterBank.h rename src/processors/other/poly_octave/{PolyOctaveFilterBandHelpers.h => PolyOctaveV1FilterBankImpl.h} (52%) create mode 100644 src/processors/other/poly_octave/PolyOctaveV2FilterBank.h create mode 100644 src/processors/other/poly_octave/PolyOctaveV2FilterBankImpl.cpp create mode 100644 src/processors/other/poly_octave/PolyOctaveV2FilterBankImpl.h diff --git a/modules/cmake/RuntimeSIMDLib.cmake b/modules/cmake/RuntimeSIMDLib.cmake index 63239f4c..37cdc0be 100644 --- a/modules/cmake/RuntimeSIMDLib.cmake +++ b/modules/cmake/RuntimeSIMDLib.cmake @@ -1,10 +1,13 @@ include(CheckCXXCompilerFlag) -function(make_lib_simd_runtime name file) +function(make_lib_simd_runtime name) + set(multiValueArgs SOURCES) + cmake_parse_arguments(ARG "" "" "${multiValueArgs}" ${ARGN}) + add_library(${name}_sse_or_arm STATIC) - target_sources(${name}_sse_or_arm PRIVATE ${file}) + target_sources(${name}_sse_or_arm PRIVATE ${ARG_SOURCES}) add_library(${name}_avx STATIC) - target_sources(${name}_avx PRIVATE ${file}) + target_sources(${name}_avx PRIVATE ${ARG_SOURCES}) target_compile_definitions(${name}_avx PRIVATE BYOD_COMPILING_WITH_AVX=1) if(WIN32) CHECK_CXX_COMPILER_FLAG("/arch:AVX" COMPILER_OPT_ARCH_AVX_SUPPORTED) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 06ccbe4f..8b13ee76 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -167,9 +167,13 @@ if (NOT(${JAI_COMPILER} STREQUAL "JAI_COMPILER-NOTFOUND")) target_compile_definitions(BYOD PRIVATE BYOD_BUILDING_JAI_MODULES=1) endif() -# AVX/SSE files for accelerated neural nets -make_lib_simd_runtime(rnn_accelerated processors/drive/neural_utils/RNNAccelerated.cpp) -foreach(target IN ITEMS rnn_accelerated_sse_or_arm rnn_accelerated_avx) +# AVX/SSE files for accelerated neural nets and other DSP +make_lib_simd_runtime(dsp_accelerated + SOURCES + processors/drive/neural_utils/RNNAccelerated.cpp + processors/other/poly_octave/PolyOctaveV2FilterBankImpl.cpp +) +foreach(target IN ITEMS dsp_accelerated_sse_or_arm dsp_accelerated_avx) target_link_libraries(${target} PRIVATE math_approx @@ -182,7 +186,11 @@ foreach(target IN ITEMS rnn_accelerated_sse_or_arm rnn_accelerated_avx) ${CMAKE_CURRENT_SOURCE_DIR}/../modules/RTNeural ${CMAKE_CURRENT_SOURCE_DIR}/../modules/RTNeural/modules/xsimd/include ) - target_compile_definitions(${target} PRIVATE RTNEURAL_USE_XSIMD=1) + target_compile_definitions(${target} + PRIVATE + RTNEURAL_USE_XSIMD=1 + _USE_MATH_DEFINES=1 + ) set_target_properties(${target} PROPERTIES POSITION_INDEPENDENT_CODE TRUE VISIBILITY_INLINES_HIDDEN TRUE @@ -190,9 +198,9 @@ foreach(target IN ITEMS rnn_accelerated_sse_or_arm rnn_accelerated_avx) CXX_VISIBILITY_PRESET hidden ) endforeach() -target_compile_definitions(rnn_accelerated_sse_or_arm PRIVATE RTNEURAL_DEFAULT_ALIGNMENT=16 RTNEURAL_NAMESPACE=RTNeural_sse_arm) -target_compile_definitions(rnn_accelerated_avx PRIVATE RTNEURAL_DEFAULT_ALIGNMENT=32 RTNEURAL_NAMESPACE=RTNeural_avx) -target_link_libraries(BYOD PRIVATE rnn_accelerated) +target_compile_definitions(dsp_accelerated_sse_or_arm PRIVATE RTNEURAL_DEFAULT_ALIGNMENT=16 RTNEURAL_NAMESPACE=RTNeural_sse_arm) +target_compile_definitions(dsp_accelerated_avx PRIVATE RTNEURAL_DEFAULT_ALIGNMENT=32 RTNEURAL_NAMESPACE=RTNeural_avx) +target_link_libraries(BYOD PRIVATE dsp_accelerated) # special flags for MSVC if (MSVC) diff --git a/src/processors/other/poly_octave/PolyOctave.cpp b/src/processors/other/poly_octave/PolyOctave.cpp index 3057c175..cebfb531 100644 --- a/src/processors/other/poly_octave/PolyOctave.cpp +++ b/src/processors/other/poly_octave/PolyOctave.cpp @@ -1,5 +1,6 @@ #include "PolyOctave.h" -#include "PolyOctaveFilterBandHelpers.h" +#include "PolyOctaveV1FilterBankImpl.h" +#include "PolyOctaveV2FilterBankImpl.h" #include "processors/ParameterHelpers.h" namespace PolyOctaveTags @@ -42,6 +43,14 @@ PolyOctave::PolyOctave (UndoManager* um) uiOptions.powerColour = Colour { 0xffe70510 }; uiOptions.info.description = "A \"polyphonic octave generator\" effect."; uiOptions.info.authors = StringArray { "Jatin Chowdhury" }; + +#if JUCE_INTEL + if (juce::SystemStats::hasAVX() && juce::SystemStats::hasFMA3()) + { + juce::Logger::writeToLog ("Using Poly Octave with AVX SIMD instructions!"); + use_avx = true; + } +#endif } ParamLayout PolyOctave::createParameterLayout() @@ -94,9 +103,9 @@ void PolyOctave::prepare (double sampleRate, int samplesPerBlock) } mixOutBuffer.setSize (2, samplesPerBlock); - up1OutBuffer.setSize (2, 4 * samplesPerBlock + 8); // padding for SIMD - up2OutBuffer.setSize (2, 4 * samplesPerBlock + 8); // padding for SIMD - down1OutBuffer.setSize (2, 4 * samplesPerBlock + 8); // padding for SIMD + up1OutBuffer.setSize (2, 8 * samplesPerBlock + 32); // padding for SIMD + up2OutBuffer.setSize (2, 8 * samplesPerBlock + 32); // padding for SIMD + down1OutBuffer.setSize (2, samplesPerBlock); } void PolyOctave::processAudio (AudioBuffer& buffer) @@ -125,23 +134,47 @@ void PolyOctave::processAudio (AudioBuffer& buffer) chowdsp::BufferMath::applyGainSmoothedBuffer (down1OutBuffer, downOctaveGain); // "up1" processing - for (auto [ch, data_in, data_out] : chowdsp::buffer_iters::zip_channels (std::as_const (buffer), up1OutBuffer)) + for (const auto& [ch, data_in, data_out] : chowdsp::buffer_iters::zip_channels (std::as_const (buffer), up1OutBuffer)) { - poly_octave_v2::process<1> (octaveUpFilterBank[ch], - data_in.data(), - data_out.data(), - numSamples); +#if JUCE_INTEL + if (use_avx) + { + poly_octave_v2::process_avx<1> (octaveUpFilterBank[ch], + data_in.data(), + data_out.data(), + numSamples); + } + else +#endif + { + poly_octave_v2::process<1> (octaveUpFilterBank[ch], + data_in.data(), + data_out.data(), + numSamples); + } } upOctaveGain.process (numSamples); chowdsp::BufferMath::applyGainSmoothedBuffer (up1OutBuffer, upOctaveGain); // "up2" processing - for (auto [ch, data_in, data_out] : chowdsp::buffer_iters::zip_channels (std::as_const (buffer), up2OutBuffer)) + for (const auto& [ch, data_in, data_out] : chowdsp::buffer_iters::zip_channels (std::as_const (buffer), up2OutBuffer)) { - poly_octave_v2::process<2> (octaveUp2FilterBank[ch], - data_in.data(), - data_out.data(), - numSamples); +#if JUCE_INTEL + if (use_avx) + { + poly_octave_v2::process_avx<2> (octaveUp2FilterBank[ch], + data_in.data(), + data_out.data(), + numSamples); + } + else +#endif + { + poly_octave_v2::process<2> (octaveUp2FilterBank[ch], + data_in.data(), + data_out.data(), + numSamples); + } } up2OctaveGain.process (numSamples); chowdsp::BufferMath::applyGainSmoothedBuffer (up2OutBuffer, up2OctaveGain); diff --git a/src/processors/other/poly_octave/PolyOctave.h b/src/processors/other/poly_octave/PolyOctave.h index 2ef85d2c..aad5148e 100644 --- a/src/processors/other/poly_octave/PolyOctave.h +++ b/src/processors/other/poly_octave/PolyOctave.h @@ -1,7 +1,8 @@ #pragma once #include "DelayPitchShifter.h" -#include "PolyOctaveFilterBankTypes.h" +#include "PolyOctaveV1FilterBank.h" +#include "PolyOctaveV2FilterBank.h" #include "processors/BaseProcessor.h" class PolyOctave : public BaseProcessor @@ -61,5 +62,9 @@ class PolyOctave : public BaseProcessor juce::AudioBuffer up2OutBuffer; juce::AudioBuffer down1OutBuffer; +#if JUCE_INTEL + bool use_avx = false; +#endif + JUCE_DECLARE_NON_COPYABLE_WITH_LEAK_DETECTOR (PolyOctave) }; diff --git a/src/processors/other/poly_octave/PolyOctaveFilterBankTypes.h b/src/processors/other/poly_octave/PolyOctaveFilterBankTypes.h deleted file mode 100644 index fd195caf..00000000 --- a/src/processors/other/poly_octave/PolyOctaveFilterBankTypes.h +++ /dev/null @@ -1,38 +0,0 @@ -#pragma once - -#include - -namespace poly_octave_v1 -{ -using float_2 = xsimd::batch; -struct ComplexERBFilterBank -{ - static constexpr size_t numFilterBands = 44; - std::array, numFilterBands / float_2::size> erbFilterReal, erbFilterImag; -}; -} // namespace poly_octave_v1 - -namespace poly_octave_v2 -{ -using T = xsimd::batch; -using S = float; -static constexpr auto N1 = 32; -template -struct ComplexERBFilterBank -{ - static constexpr size_t num_filter_bands = N; - - struct ComplexFilter - { - std::array b_shared_coeffs {}; - std::array b_real_coeffs {}; - std::array b_imag_coeffs {}; - std::array a_coeffs {}; - std::array z_shared {}; - std::array z_real {}; - std::array z_imag {}; - }; - - std::array erb_filter_complex; -}; -} // namespace poly_octave_v2 diff --git a/src/processors/other/poly_octave/PolyOctaveV1FilterBank.h b/src/processors/other/poly_octave/PolyOctaveV1FilterBank.h new file mode 100644 index 00000000..d7a16fb0 --- /dev/null +++ b/src/processors/other/poly_octave/PolyOctaveV1FilterBank.h @@ -0,0 +1,13 @@ +#pragma once + +#include + +namespace poly_octave_v1 +{ +using float_2 = xsimd::batch; +struct ComplexERBFilterBank +{ + static constexpr size_t numFilterBands = 44; + std::array, numFilterBands / float_2::size> erbFilterReal, erbFilterImag; +}; +} // namespace poly_octave_v1 diff --git a/src/processors/other/poly_octave/PolyOctaveFilterBandHelpers.h b/src/processors/other/poly_octave/PolyOctaveV1FilterBankImpl.h similarity index 52% rename from src/processors/other/poly_octave/PolyOctaveFilterBandHelpers.h rename to src/processors/other/poly_octave/PolyOctaveV1FilterBankImpl.h index e1d8553b..2b3616ef 100644 --- a/src/processors/other/poly_octave/PolyOctaveFilterBandHelpers.h +++ b/src/processors/other/poly_octave/PolyOctaveV1FilterBankImpl.h @@ -1,6 +1,6 @@ #pragma once -#include "PolyOctaveFilterBankTypes.h" +#include "PolyOctaveV1FilterBank.h" namespace poly_octave_v1 { @@ -160,177 +160,3 @@ static void designFilterBank (std::array& filterBank, } } } // namespace poly_octave_v1 - -namespace poly_octave_v2 -{ -// Reference for filter-bank design and octave shifting: -// https://aaltodoc.aalto.fi/server/api/core/bitstreams/ff9e52cf-fd79-45eb-b695-93038244ec0e/content - -inline std::pair process_sample (const T& x, - const std::array& b_shared_coeffs, - const std::array& b_real_coeffs, - const std::array& b_imag_coeffs, - const std::array& a_coeffs, - std::array& z_shared, - std::array& z_real, - std::array& z_imag) -{ - const auto y_shared = z_shared[1] + x * b_shared_coeffs[0]; - z_shared[1] = z_shared[2] + x * b_shared_coeffs[1] - y_shared * a_coeffs[1]; - z_shared[2] = x * b_shared_coeffs[2] - y_shared * a_coeffs[2]; - - const auto y_real = z_real[1] + y_shared; // for the real filter, we know that b[0] == 1 - z_real[1] = z_real[2] + y_shared * b_real_coeffs[1] - y_real * a_coeffs[1]; - z_real[2] = y_shared * b_real_coeffs[2] - y_real * a_coeffs[2]; - - const auto y_imag = z_imag[1]; // for the imaginary filter, we know that b[0] == 0 - z_imag[1] = z_imag[2] + y_shared * b_imag_coeffs[1] - y_imag * a_coeffs[1]; - z_imag[2] = y_shared * b_imag_coeffs[2] - y_imag * a_coeffs[2]; - - return { y_real, y_imag }; -} - -static constexpr auto q_c = 4.0; -static auto design_erb_filter (size_t erb_index, - double gamma, - double erb_start, - double q_ERB, - double sample_rate, - double (&b_coeffs_cplx_shared)[3], - double (&b_coeffs_cplx_real)[3], - double (&b_coeffs_cplx_imag)[3], - double (&a_coeffs_cplx)[3]) -{ - const auto q_PS = gamma; - - const auto z = erb_start + static_cast (erb_index) * (q_c / q_ERB); - const auto center_target_freq = 228.7 * (std::pow (10.0, z / 21.3) - 1.0); - const auto filter_q = (1.0 / (q_PS * q_ERB)) * (24.7 + 0.108 * center_target_freq); - - double a_coeffs_proto[3]; - chowdsp::CoefficientCalculators::calcSecondOrderBPF (b_coeffs_cplx_shared, - a_coeffs_proto, - center_target_freq / gamma, - filter_q * 0.5, - sample_rate); - - auto pole = (std::sqrt (std::pow (std::complex { a_coeffs_proto[1] }, 2.0) - 4.0 * a_coeffs_proto[2]) - a_coeffs_proto[1]) / 2.0; - if (std::imag (pole) < 0.0) - pole = std::conj (pole); - const auto pr = std::real (pole); - const auto pi = std::imag (pole); - - // a[] = 1 - 2 pr z + (pi^2 + pr^2) z^2 - a_coeffs_cplx[0] = 1.0; - a_coeffs_cplx[1] = -2.0 * pr; - a_coeffs_cplx[2] = pi * pi + pr * pr; - - // b_real[] = 1 - 2 pr z + (-pi^2 + pr^2) z^2 - b_coeffs_cplx_real[0] = 1.0; - b_coeffs_cplx_real[1] = -2.0 * pr; - b_coeffs_cplx_real[2] = -pi * pi + pr * pr; - - // b_imag[] = 2 pi z - 2 pi pr z^2 - b_coeffs_cplx_imag[0] = 0.0; - b_coeffs_cplx_imag[1] = 2.0 * pi; - b_coeffs_cplx_imag[2] = -2.0 * pi * pr; - - return center_target_freq; -} - -template -static void design_filter_bank (std::array, 2>& filter_bank, - double gamma, - double erb_start, - double q_ERB, - double sample_rate) -{ - for (size_t kiter = 0; kiter < ComplexERBFilterBank::num_filter_bands; ++kiter) - { - double b_coeffs_cplx_shared_double[3] {}; - double b_coeffs_cplx_real_double[3] {}; - double b_coeffs_cplx_imag_double[3] {}; - double a_coeffs_cplx_double[3] {}; - design_erb_filter (kiter, - gamma, - erb_start, - q_ERB, - sample_rate, - b_coeffs_cplx_shared_double, - b_coeffs_cplx_real_double, - b_coeffs_cplx_imag_double, - a_coeffs_cplx_double); - - for (auto& bank : filter_bank) - { - const auto k_div = kiter / T::size; - const auto k_off = kiter - (k_div * T::size); - - bank.erb_filter_complex[k_div].z_shared = {}; - bank.erb_filter_complex[k_div].z_real = {}; - bank.erb_filter_complex[k_div].z_imag = {}; - - for (size_t i = 0; i < 3; ++i) - { - reinterpret_cast (&bank.erb_filter_complex[k_div].b_shared_coeffs[i])[k_off] = static_cast (b_coeffs_cplx_shared_double[i]); - reinterpret_cast (&bank.erb_filter_complex[k_div].b_real_coeffs[i])[k_off] = static_cast (b_coeffs_cplx_real_double[i]); - reinterpret_cast (&bank.erb_filter_complex[k_div].b_imag_coeffs[i])[k_off] = static_cast (b_coeffs_cplx_imag_double[i]); - reinterpret_cast (&bank.erb_filter_complex[k_div].a_coeffs[i])[k_off] = static_cast (a_coeffs_cplx_double[i]); - } - } - } -} - -template -static void process (ComplexERBFilterBank& filter_bank, - const float* buffer_in, - float* buffer_out, - int num_samples) noexcept -{ - // buffer_out size is padded by 4x - static constexpr auto eps = std::numeric_limits::epsilon(); - auto* buffer_out_simd = juce::snapPointerToAlignment (reinterpret_cast (buffer_out), xsimd::default_arch::alignment()); - std::fill (buffer_out_simd, buffer_out_simd + num_samples, T {}); - for (size_t k = 0; k < N; k += T::size) - { - const auto filter_idx = k / T::size; - auto& cplx_filter = filter_bank.erb_filter_complex[filter_idx]; - chowdsp::ScopedValue z_shared { cplx_filter.z_shared }; - chowdsp::ScopedValue z_re { cplx_filter.z_real }; - chowdsp::ScopedValue z_im { cplx_filter.z_imag }; - for (int n = 0; n < num_samples; ++n) - { - const auto x_in = static_cast (buffer_in[n]); - const auto [x_re, x_im] = process_sample (x_in, - cplx_filter.b_shared_coeffs, - cplx_filter.b_real_coeffs, - cplx_filter.b_imag_coeffs, - cplx_filter.a_coeffs, - z_shared.get(), - z_re.get(), - z_im.get()); - - auto x_re_sq = x_re * x_re; - auto x_im_sq = x_im * x_im; - auto x_abs_sq = x_re_sq + x_im_sq; - - if constexpr (num_octaves_up == 1) - { - auto x_abs_r = xsimd::select (x_abs_sq > eps, xsimd::rsqrt (x_abs_sq), {}); - buffer_out_simd[n] += (x_re_sq - x_im_sq) * x_abs_r; - } - else if constexpr (num_octaves_up == 2) - { - auto x_abs_sq_r = xsimd::select (x_abs_sq > eps, xsimd::reciprocal (x_abs_sq), {}); - buffer_out_simd[n] += x_re * (x_re_sq - (S) 3 * x_im_sq) * x_abs_sq_r; - } - } - } - - for (int n = 0; n < num_samples; ++n) - buffer_out[n] = xsimd::reduce_add (buffer_out_simd[n]); - - static constexpr auto norm_gain = 2.0f / static_cast (N); - juce::FloatVectorOperations::multiply (buffer_out, norm_gain, num_samples); -} -} // namespace poly_octave_v2 diff --git a/src/processors/other/poly_octave/PolyOctaveV2FilterBank.h b/src/processors/other/poly_octave/PolyOctaveV2FilterBank.h new file mode 100644 index 00000000..c84ef212 --- /dev/null +++ b/src/processors/other/poly_octave/PolyOctaveV2FilterBank.h @@ -0,0 +1,36 @@ +#pragma once + +#include +#include + +namespace poly_octave_v2 +{ +static constexpr auto N1 = 32; +template +struct ComplexERBFilterBank +{ + static constexpr size_t num_filter_bands = N; + + alignas (32) std::array a1 {}; + alignas (32) std::array a2 {}; + + alignas (32) std::array shared_b0 {}; + alignas (32) std::array shared_b1 {}; + alignas (32) std::array shared_b2 {}; + + alignas (32) std::array real_b1 {}; + alignas (32) std::array real_b2 {}; + + alignas (32) std::array imag_b1 {}; + alignas (32) std::array imag_b2 {}; + + alignas (32) std::array shared_z1 {}; + alignas (32) std::array shared_z2 {}; + + alignas (32) std::array real_z1 {}; + alignas (32) std::array real_z2 {}; + + alignas (32) std::array imag_z1 {}; + alignas (32) std::array imag_z2 {}; +}; +} // namespace poly_octave_v2 diff --git a/src/processors/other/poly_octave/PolyOctaveV2FilterBankImpl.cpp b/src/processors/other/poly_octave/PolyOctaveV2FilterBankImpl.cpp new file mode 100644 index 00000000..b3575d9b --- /dev/null +++ b/src/processors/other/poly_octave/PolyOctaveV2FilterBankImpl.cpp @@ -0,0 +1,609 @@ +#include "PolyOctaveV2FilterBankImpl.h" + +#include +#include + +#if defined(__AVX__) +#include +#elif defined(__SSE2__) +#include +#elif defined(__ARM_NEON__) +#include +#else +#warning "Compiling poly octave DSP without SIMD optimization" +#endif + +namespace poly_octave_v2 +{ +// Reference for filter-bank design and octave shifting: +// https://aaltodoc.aalto.fi/server/api/core/bitstreams/ff9e52cf-fd79-45eb-b695-93038244ec0e/content + +#if defined(__ARM_NEON__) +inline std::pair process_sample_neon (float32x4_t x, + float32x4_t a1, + float32x4_t a2, + float32x4_t shared_b0, + float32x4_t shared_b1, + float32x4_t shared_b2, + float32x4_t real_b1, + float32x4_t real_b2, + float32x4_t imag_b1, + float32x4_t imag_b2, + float32x4_t& shared_z1, + float32x4_t& shared_z2, + float32x4_t& real_z1, + float32x4_t& real_z2, + float32x4_t& imag_z1, + float32x4_t& imag_z2) +{ + const auto y_shared = vfmaq_f32 (shared_z1, x, shared_b0); + shared_z1 = vfmsq_f32 (vfmaq_f32 (shared_z2, x, shared_b1), y_shared, a1); + shared_z2 = x * shared_b2 - y_shared * a2; + + const auto y_real = vaddq_f32 (real_z1, y_shared); // for the real filter, we know that b[0] == 1 + real_z1 = vfmsq_f32 (vfmaq_f32 (real_z2, y_shared, real_b1), y_real, a1); + real_z2 = vfmsq_f32 (vmulq_f32 (y_shared, real_b2), y_real, a2); + + const auto y_imag = imag_z1; // for the imaginary filter, we know that b[0] == 0 + imag_z1 = vfmsq_f32 (vfmaq_f32 (imag_z2, y_shared, imag_b1), y_imag, a1); + imag_z2 = vfmsq_f32 (vmulq_f32 (y_shared, imag_b2), y_imag, a2); + + return { y_real, y_imag }; +} +#elif defined(__AVX__) || defined(__SSE2__) +#if defined(__AVX__) +inline std::pair<__m256, __m256> process_sample_avx (__m256 x, + __m256 a1, + __m256 a2, + __m256 shared_b0, + __m256 shared_b1, + __m256 shared_b2, + __m256 real_b1, + __m256 real_b2, + __m256 imag_b1, + __m256 imag_b2, + __m256& shared_z1, + __m256& shared_z2, + __m256& real_z1, + __m256& real_z2, + __m256& imag_z1, + __m256& imag_z2) +{ + const auto y_shared = _mm256_add_ps (shared_z1, _mm256_mul_ps (x, shared_b0)); + shared_z1 = _mm256_add_ps (shared_z2, _mm256_sub_ps (_mm256_mul_ps (x, shared_b1), _mm256_mul_ps (y_shared, a1))); + shared_z2 = _mm256_sub_ps (_mm256_mul_ps (x, shared_b2), _mm256_mul_ps (y_shared, a2)); + + const auto y_real = _mm256_add_ps (real_z1, y_shared); // for the real filter, we know that b[0] == 1 + real_z1 = _mm256_add_ps (real_z2, _mm256_sub_ps (_mm256_mul_ps (y_shared, real_b1), _mm256_mul_ps (y_real, a1))); + real_z2 = _mm256_sub_ps (_mm256_mul_ps (y_shared, real_b2), _mm256_mul_ps (y_real, a2)); + + const auto y_imag = imag_z1; // for the imaginary filter, we know that b[0] == 0 + imag_z1 = _mm256_add_ps (imag_z2, _mm256_sub_ps (_mm256_mul_ps (y_shared, imag_b1), _mm256_mul_ps (y_imag, a1))); + imag_z2 = _mm256_sub_ps (_mm256_mul_ps (y_shared, imag_b2), _mm256_mul_ps (y_imag, a2)); + + return { y_real, y_imag }; +} +#endif +inline std::pair<__m128, __m128> process_sample_sse (__m128 x, + __m128 a1, + __m128 a2, + __m128 shared_b0, + __m128 shared_b1, + __m128 shared_b2, + __m128 real_b1, + __m128 real_b2, + __m128 imag_b1, + __m128 imag_b2, + __m128& shared_z1, + __m128& shared_z2, + __m128& real_z1, + __m128& real_z2, + __m128& imag_z1, + __m128& imag_z2) +{ + const auto y_shared = _mm_add_ps (shared_z1, _mm_mul_ps (x, shared_b0)); + shared_z1 = _mm_add_ps (shared_z2, _mm_sub_ps (_mm_mul_ps (x, shared_b1), _mm_mul_ps (y_shared, a1))); + shared_z2 = _mm_sub_ps (_mm_mul_ps (x, shared_b2), _mm_mul_ps (y_shared, a2)); + + const auto y_real = _mm_add_ps (real_z1, y_shared); // for the real filter, we know that b[0] == 1 + real_z1 = _mm_add_ps (real_z2, _mm_sub_ps (_mm_mul_ps (y_shared, real_b1), _mm_mul_ps (y_real, a1))); + real_z2 = _mm_sub_ps (_mm_mul_ps (y_shared, real_b2), _mm_mul_ps (y_real, a2)); + + const auto y_imag = imag_z1; // for the imaginary filter, we know that b[0] == 0 + imag_z1 = _mm_add_ps (imag_z2, _mm_sub_ps (_mm_mul_ps (y_shared, imag_b1), _mm_mul_ps (y_imag, a1))); + imag_z2 = _mm_sub_ps (_mm_mul_ps (y_shared, imag_b2), _mm_mul_ps (y_imag, a2)); + + return { y_real, y_imag }; +} +#else +inline std::pair process_sample_soa (float x, + float a1, + float a2, + float shared_b0, + float shared_b1, + float shared_b2, + float real_b1, + float real_b2, + float imag_b1, + float imag_b2, + float& shared_z1, + float& shared_z2, + float& real_z1, + float& real_z2, + float& imag_z1, + float& imag_z2) +{ + const auto y_shared = shared_z1 + x * shared_b0; + shared_z1 = shared_z2 + x * shared_b1 - y_shared * a1; + shared_z2 = x * shared_b2 - y_shared * a2; + + const auto y_real = real_z1 + y_shared; // for the real filter, we know that b[0] == 1 + real_z1 = real_z2 + y_shared * real_b1 - y_real * a1; + real_z2 = y_shared * real_b2 - y_real * a2; + + const auto y_imag = imag_z1; // for the imaginary filter, we know that b[0] == 0 + imag_z1 = imag_z2 + y_shared * imag_b1 - y_imag * a1; + imag_z2 = y_shared * imag_b2 - y_imag * a2; + + return { y_real, y_imag }; +} +#endif + +/** Computes the warping factor "K" so that the angular frequency wc is matched at sample rate fs */ +template +inline T computeKValueAngular (T wc, NumericType fs) +{ + return wc / std::tan (wc / ((NumericType) 2 * fs)); +} + +/** Second-order bilinear transform */ +template +static void bilinear (T (&b)[3], T (&a)[3], const T (&bs)[3], const T (&as)[3], T K) +{ + // the mobius transform adds a little computational overhead, so here's the optimized bilinear transform. + const auto KSq = K * K; + + const auto as0_KSq = as[0] * KSq; + const auto as1_K = as[1] * K; + const auto bs0_KSq = bs[0] * KSq; + const auto bs1_K = bs[1] * K; + + const auto a0_inv = (T) 1 / (as0_KSq + as1_K + as[2]); + + a[0] = (T) 1; + a[1] = (T) 2 * (as[2] - as0_KSq) * a0_inv; + a[2] = (as0_KSq - as1_K + as[2]) * a0_inv; + b[0] = (bs0_KSq + bs1_K + bs[2]) * a0_inv; + b[1] = (T) 2 * (bs[2] - bs0_KSq) * a0_inv; + b[2] = (bs0_KSq - bs1_K + bs[2]) * a0_inv; +} + +/** + * Calculates the filter coefficients for a given cutoff frequency, Q value, and sample rate. + * The analog prototype transfer function is: \f$ H(s) = \frac{s/Q}{s^2 + s/Q + 1} \f$ + */ +template +void calcSecondOrderBPF (T (&b)[3], T (&a)[3], T fc, T qVal, NumericType fs) +{ + const auto wc = (2.0f * static_cast (M_PI)) * fc; + const auto K = computeKValueAngular (wc, fs); + auto kSqTerm = (T) 1 / (wc * wc); + auto kTerm = (T) 1 / (qVal * wc); + bilinear (b, a, { (T) 0, kTerm, (T) 0 }, { kSqTerm, kTerm, (T) 1 }, K); +} + +static constexpr auto q_c = 4.0; +[[maybe_unused]] static auto design_erb_filter (size_t erb_index, + double gamma, + double erb_start, + double q_ERB, + double sample_rate, + double (&b_coeffs_cplx_shared)[3], + double (&b_coeffs_cplx_real)[3], + double (&b_coeffs_cplx_imag)[3], + double (&a_coeffs_cplx)[3]) +{ + const auto q_PS = gamma; + + const auto z = erb_start + static_cast (erb_index) * (q_c / q_ERB); + const auto center_target_freq = 228.7 * (std::pow (10.0, z / 21.3) - 1.0); + const auto filter_q = (1.0 / (q_PS * q_ERB)) * (24.7 + 0.108 * center_target_freq); + + double a_coeffs_proto[3]; + calcSecondOrderBPF (b_coeffs_cplx_shared, + a_coeffs_proto, + center_target_freq / gamma, + filter_q * 0.5, + sample_rate); + + auto pole = (std::sqrt (std::pow (std::complex { a_coeffs_proto[1] }, 2.0) - 4.0 * a_coeffs_proto[2]) - a_coeffs_proto[1]) / 2.0; + if (std::imag (pole) < 0.0) + pole = std::conj (pole); + const auto pr = std::real (pole); + const auto pi = std::imag (pole); + + // a[] = 1 - 2 pr z + (pi^2 + pr^2) z^2 + a_coeffs_cplx[0] = 1.0; + a_coeffs_cplx[1] = -2.0 * pr; + a_coeffs_cplx[2] = pi * pi + pr * pr; + + // b_real[] = 1 - 2 pr z + (-pi^2 + pr^2) z^2 + b_coeffs_cplx_real[0] = 1.0; + b_coeffs_cplx_real[1] = -2.0 * pr; + b_coeffs_cplx_real[2] = -pi * pi + pr * pr; + + // b_imag[] = 2 pi z - 2 pi pr z^2 + b_coeffs_cplx_imag[0] = 0.0; + b_coeffs_cplx_imag[1] = 2.0 * pi; + b_coeffs_cplx_imag[2] = -2.0 * pi * pr; + + return center_target_freq; +} + +template +void design_filter_bank (std::array, 2>& filter_bank, + double gamma, + double erb_start, + double q_ERB, + double sample_rate) +{ + for (auto& bank : filter_bank) + { + bank.shared_z1 = {}; + bank.shared_z2 = {}; + bank.imag_z1 = {}; + bank.imag_z2 = {}; + bank.real_z1 = {}; + bank.real_z2 = {}; + } + + for (size_t kiter = 0; kiter < ComplexERBFilterBank::num_filter_bands; ++kiter) + { + double b_coeffs_cplx_shared_double[3] {}; + double b_coeffs_cplx_real_double[3] {}; + double b_coeffs_cplx_imag_double[3] {}; + double a_coeffs_cplx_double[3] {}; + design_erb_filter (kiter, + gamma, + erb_start, + q_ERB, + sample_rate, + b_coeffs_cplx_shared_double, + b_coeffs_cplx_real_double, + b_coeffs_cplx_imag_double, + a_coeffs_cplx_double); + + for (auto& bank : filter_bank) + { + bank.a1[kiter] = static_cast (a_coeffs_cplx_double[1]); + bank.a2[kiter] = static_cast (a_coeffs_cplx_double[2]); + bank.shared_b0[kiter] = static_cast (b_coeffs_cplx_shared_double[0]); + bank.shared_b1[kiter] = static_cast (b_coeffs_cplx_shared_double[1]); + bank.shared_b2[kiter] = static_cast (b_coeffs_cplx_shared_double[2]); + bank.real_b1[kiter] = static_cast (b_coeffs_cplx_real_double[1]); + bank.real_b2[kiter] = static_cast (b_coeffs_cplx_real_double[2]); + bank.imag_b1[kiter] = static_cast (b_coeffs_cplx_imag_double[1]); + bank.imag_b2[kiter] = static_cast (b_coeffs_cplx_imag_double[2]); + } + } +} + +/** + * Returns the next pointer with a given byte alignment, + * or the base pointer if it is already aligned. + */ +template +Type* snapPointerToAlignment (Type* basePointer, + IntegerType alignmentBytes) noexcept +{ + return (Type*) ((((size_t) basePointer) + (alignmentBytes - 1)) & ~(alignmentBytes - 1)); +} + +template +void process (ComplexERBFilterBank& filter_bank, + const float* buffer_in, + float* buffer_out, + int num_samples) noexcept +{ + // buffer_out size is padded by 4x + static constexpr auto eps = std::numeric_limits::epsilon(); + static constexpr auto norm_gain = 2.0f / static_cast (N); + +#if defined(__ARM_NEON__) + auto* buffer_out_simd = reinterpret_cast (snapPointerToAlignment (buffer_out, 16)); + std::fill (buffer_out_simd, buffer_out_simd + num_samples, float32x4_t {}); + + const auto eps_neon = vld1q_dup_f32 (&eps); + const auto norm_gain_neon = vld1q_dup_f32 (&norm_gain); + const auto zero_neon = float32x4_t {}; + for (size_t k = 0; k < N; k += 4) + { + const auto a1 = vld1q_f32 (filter_bank.a1.data() + k); + const auto a2 = vld1q_f32 (filter_bank.a2.data() + k); + const auto shared_b0 = vld1q_f32 (filter_bank.shared_b0.data() + k); + const auto shared_b1 = vld1q_f32 (filter_bank.shared_b1.data() + k); + const auto shared_b2 = vld1q_f32 (filter_bank.shared_b2.data() + k); + const auto real_b1 = vld1q_f32 (filter_bank.real_b1.data() + k); + const auto real_b2 = vld1q_f32 (filter_bank.real_b2.data() + k); + const auto imag_b1 = vld1q_f32 (filter_bank.imag_b1.data() + k); + const auto imag_b2 = vld1q_f32 (filter_bank.imag_b2.data() + k); + + auto shared_z1 = vld1q_f32 (filter_bank.shared_z1.data() + k); + auto shared_z2 = vld1q_f32 (filter_bank.shared_z2.data() + k); + auto real_z1 = vld1q_f32 (filter_bank.real_z1.data() + k); + auto real_z2 = vld1q_f32 (filter_bank.real_z2.data() + k); + auto imag_z1 = vld1q_f32 (filter_bank.imag_z1.data() + k); + auto imag_z2 = vld1q_f32 (filter_bank.imag_z2.data() + k); + + for (int n = 0; n < num_samples; ++n) + { + const auto x_in = vld1q_dup_f32 (buffer_in + n); + const auto [x_re, x_im] = process_sample_neon (x_in, a1, a2, shared_b0, shared_b1, shared_b2, real_b1, real_b2, imag_b1, imag_b2, shared_z1, shared_z2, real_z1, real_z2, imag_z1, imag_z2); + + auto x_re_sq = vmulq_f32 (x_re, x_re); + auto x_im_sq = vmulq_f32 (x_im, x_im); + auto x_abs_sq = vaddq_f32 (x_re_sq, x_im_sq); + + if constexpr (num_octaves_up == 1) + { + const auto greater_than_eps = vcgtq_f32 (x_abs_sq, eps_neon); + const auto sqrt = vrsqrteq_f32 (x_abs_sq); + const auto x_abs_r = vbslq_f32 (greater_than_eps, sqrt, zero_neon); + buffer_out_simd[n] = vfmaq_f32 (buffer_out_simd[n], vsubq_f32 (x_re_sq, x_im_sq), x_abs_r); + } + else if constexpr (num_octaves_up == 2) + { + const auto greater_than_eps = vcgtq_f32 (x_abs_sq, eps_neon); + const auto x_abs_r = vbslq_f32 (greater_than_eps, vrecpeq_f32 (x_abs_sq), zero_neon); + const auto x_im_sq_x3 = vaddq_f32 (x_im_sq, vaddq_f32 (x_im_sq, x_im_sq)); + buffer_out_simd[n] = vfmaq_f32 (buffer_out_simd[n], vsubq_f32 (x_re_sq, x_im_sq_x3), vmulq_f32 (x_re, x_abs_r)); + + // @TODO + // auto x_abs_sq_r = xsimd::select (x_abs_sq > eps, xsimd::reciprocal (x_abs_sq), {}); + // buffer_out_simd[n] += x_re * (x_re_sq - (S) 3 * x_im_sq) * x_abs_sq_r; + } + } + + vst1q_f32 (filter_bank.shared_z1.data() + k, shared_z1); + vst1q_f32 (filter_bank.shared_z2.data() + k, shared_z2); + vst1q_f32 (filter_bank.real_z1.data() + k, real_z1); + vst1q_f32 (filter_bank.real_z2.data() + k, real_z2); + vst1q_f32 (filter_bank.imag_z1.data() + k, imag_z1); + vst1q_f32 (filter_bank.imag_z2.data() + k, imag_z2); + } + + for (int n = 0; n < num_samples; ++n) + { + buffer_out_simd[n] = vmulq_f32 (norm_gain_neon, buffer_out_simd[n]); + auto rr = vadd_f32 (vget_high_f32 (buffer_out_simd[n]), vget_low_f32 (buffer_out_simd[n])); + buffer_out[n] = vget_lane_f32 (vpadd_f32 (rr, rr), 0); + } +#elif defined(__AVX__) || defined(__SSE2__) + auto* buffer_out_simd = reinterpret_cast<__m128*> (snapPointerToAlignment (buffer_out, 16)); + std::fill (buffer_out_simd, buffer_out_simd + num_samples, __m128 {}); + + const auto eps_sse = _mm_set1_ps (eps); + const auto norm_gain_sse = _mm_set1_ps (norm_gain); + for (size_t k = 0; k < N; k += 4) + { + const auto a1 = _mm_load_ps (filter_bank.a1.data() + k); + const auto a2 = _mm_load_ps (filter_bank.a2.data() + k); + const auto shared_b0 = _mm_load_ps (filter_bank.shared_b0.data() + k); + const auto shared_b1 = _mm_load_ps (filter_bank.shared_b1.data() + k); + const auto shared_b2 = _mm_load_ps (filter_bank.shared_b2.data() + k); + const auto real_b1 = _mm_load_ps (filter_bank.real_b1.data() + k); + const auto real_b2 = _mm_load_ps (filter_bank.real_b2.data() + k); + const auto imag_b1 = _mm_load_ps (filter_bank.imag_b1.data() + k); + const auto imag_b2 = _mm_load_ps (filter_bank.imag_b2.data() + k); + + auto shared_z1 = _mm_load_ps (filter_bank.shared_z1.data() + k); + auto shared_z2 = _mm_load_ps (filter_bank.shared_z2.data() + k); + auto real_z1 = _mm_load_ps (filter_bank.real_z1.data() + k); + auto real_z2 = _mm_load_ps (filter_bank.real_z2.data() + k); + auto imag_z1 = _mm_load_ps (filter_bank.imag_z1.data() + k); + auto imag_z2 = _mm_load_ps (filter_bank.imag_z2.data() + k); + + for (int n = 0; n < num_samples; ++n) + { + const auto x_in = _mm_set1_ps (buffer_in[n]); + const auto [x_re, x_im] = process_sample_sse (x_in, a1, a2, shared_b0, shared_b1, shared_b2, real_b1, real_b2, imag_b1, imag_b2, shared_z1, shared_z2, real_z1, real_z2, imag_z1, imag_z2); + + auto x_re_sq = _mm_mul_ps (x_re, x_re); + auto x_im_sq = _mm_mul_ps (x_im, x_im); + auto x_abs_sq = _mm_add_ps (x_re_sq, x_im_sq); + + if constexpr (num_octaves_up == 1) + { + const auto greater_than_eps = _mm_cmpgt_ps (x_abs_sq, eps_sse); + const auto sqrt = _mm_rsqrt_ps (x_abs_sq); + const auto x_abs_r = _mm_and_ps (greater_than_eps, sqrt); + buffer_out_simd[n] = _mm_add_ps (buffer_out_simd[n], _mm_mul_ps (_mm_sub_ps (x_re_sq, x_im_sq), x_abs_r)); + } + else if constexpr (num_octaves_up == 2) + { + const auto greater_than_eps = _mm_cmpgt_ps (x_abs_sq, eps_sse); + const auto x_abs_sq_r = _mm_and_ps (greater_than_eps, _mm_rcp_ps (x_abs_sq)); + const auto x_im_sq_x3 = _mm_add_ps (x_im_sq, _mm_add_ps (x_im_sq, x_im_sq)); + buffer_out_simd[n] = _mm_add_ps (buffer_out_simd[n], _mm_mul_ps (_mm_sub_ps (x_re_sq, x_im_sq_x3), _mm_mul_ps (x_re, x_abs_sq_r))); + } + } + + _mm_store_ps (filter_bank.shared_z1.data() + k, shared_z1); + _mm_store_ps (filter_bank.shared_z2.data() + k, shared_z2); + _mm_store_ps (filter_bank.real_z1.data() + k, real_z1); + _mm_store_ps (filter_bank.real_z2.data() + k, real_z2); + _mm_store_ps (filter_bank.imag_z1.data() + k, imag_z1); + _mm_store_ps (filter_bank.imag_z2.data() + k, imag_z2); + } + + for (int n = 0; n < num_samples; ++n) + { + buffer_out_simd[n] = _mm_mul_ps (norm_gain_sse, buffer_out_simd[n]); + + auto rr = _mm_add_ps (_mm_shuffle_ps (buffer_out_simd[n], buffer_out_simd[n], 0x4e), buffer_out_simd[n]); + rr = _mm_add_ps (rr, _mm_shuffle_ps (rr, rr, 0xb1)); + buffer_out[n] = _mm_cvtss_f32 (rr); + } +#else // No SIMD + auto* buffer_out_simd = snapPointerToAlignment (reinterpret_cast (buffer_out), 4); + std::fill (buffer_out_simd, buffer_out_simd + num_samples, float {}); + + for (size_t k = 0; k < N; ++k) + { + const auto a1 = filter_bank.a1[k]; + const auto a2 = filter_bank.a2[k]; + const auto shared_b0 = filter_bank.shared_b0[k]; + const auto shared_b1 = filter_bank.shared_b1[k]; + const auto shared_b2 = filter_bank.shared_b2[k]; + const auto real_b1 = filter_bank.real_b1[k]; + const auto real_b2 = filter_bank.real_b2[k]; + const auto imag_b1 = filter_bank.imag_b1[k]; + const auto imag_b2 = filter_bank.imag_b2[k]; + + auto shared_z1 = filter_bank.shared_z1[k]; + auto shared_z2 = filter_bank.shared_z2[k]; + auto real_z1 = filter_bank.real_z1[k]; + auto real_z2 = filter_bank.real_z2[k]; + auto imag_z1 = filter_bank.imag_z1[k]; + auto imag_z2 = filter_bank.imag_z2[k]; + + for (int n = 0; n < num_samples; ++n) + { + const auto x_in = static_cast (buffer_in[n]); + const auto [x_re, x_im] = process_sample_soa (x_in, + a1, + a2, + shared_b0, + shared_b1, + shared_b2, + real_b1, + real_b2, + imag_b1, + imag_b2, + shared_z1, + shared_z2, + real_z1, + real_z2, + imag_z1, + imag_z2); + + auto x_re_sq = x_re * x_re; + auto x_im_sq = x_im * x_im; + auto x_abs_sq = x_re_sq + x_im_sq; + + if constexpr (num_octaves_up == 1) + { + auto x_abs_r = (x_abs_sq > eps) ? (1.0f / std::sqrt (x_abs_sq)) : 0.0f; + buffer_out_simd[n] += (x_re_sq - x_im_sq) * x_abs_r; + } + else if constexpr (num_octaves_up == 2) + { + auto x_abs_sq_r = (x_abs_sq > eps) ? (1.0f / x_abs_sq) : 0.0f; + buffer_out_simd[n] += x_re * (x_re_sq - 3.0f * x_im_sq) * x_abs_sq_r; + } + } + + filter_bank.shared_z1[k] = shared_z1; + filter_bank.shared_z2[k] = shared_z2; + filter_bank.real_z1[k] = real_z1; + filter_bank.real_z2[k] = real_z2; + filter_bank.imag_z1[k] = imag_z1; + filter_bank.imag_z2[k] = imag_z2; + } + + for (int n = 0; n < num_samples; ++n) + buffer_out[n] = norm_gain * buffer_out_simd[n]; +#endif +} + +template +void process_avx ([[maybe_unused]] ComplexERBFilterBank& filter_bank, + [[maybe_unused]] const float* buffer_in, + [[maybe_unused]] float* buffer_out, + [[maybe_unused]] int num_samples) noexcept +{ + // buffer_out size is padded by 8x + [[maybe_unused]] static constexpr auto eps = std::numeric_limits::epsilon(); + [[maybe_unused]] static constexpr auto norm_gain = 2.0f / static_cast (N); + +#if defined(__AVX__) + auto* buffer_out_simd = reinterpret_cast<__m256*> (snapPointerToAlignment (buffer_out, 32)); + std::fill (buffer_out_simd, buffer_out_simd + num_samples, __m256 {}); + + const auto eps_avx = _mm256_set1_ps (eps); + const auto norm_gain_avx = _mm256_set1_ps (norm_gain); + const auto one_avx = _mm256_set1_ps (1.0f); + for (size_t k = 0; k < N; k += 8) + { + const auto a1 = _mm256_load_ps (filter_bank.a1.data() + k); + const auto a2 = _mm256_load_ps (filter_bank.a2.data() + k); + const auto shared_b0 = _mm256_load_ps (filter_bank.shared_b0.data() + k); + const auto shared_b1 = _mm256_load_ps (filter_bank.shared_b1.data() + k); + const auto shared_b2 = _mm256_load_ps (filter_bank.shared_b2.data() + k); + const auto real_b1 = _mm256_load_ps (filter_bank.real_b1.data() + k); + const auto real_b2 = _mm256_load_ps (filter_bank.real_b2.data() + k); + const auto imag_b1 = _mm256_load_ps (filter_bank.imag_b1.data() + k); + const auto imag_b2 = _mm256_load_ps (filter_bank.imag_b2.data() + k); + + auto shared_z1 = _mm256_load_ps (filter_bank.shared_z1.data() + k); + auto shared_z2 = _mm256_load_ps (filter_bank.shared_z2.data() + k); + auto real_z1 = _mm256_load_ps (filter_bank.real_z1.data() + k); + auto real_z2 = _mm256_load_ps (filter_bank.real_z2.data() + k); + auto imag_z1 = _mm256_load_ps (filter_bank.imag_z1.data() + k); + auto imag_z2 = _mm256_load_ps (filter_bank.imag_z2.data() + k); + + for (int n = 0; n < num_samples; ++n) + { + const auto x_in = _mm256_set1_ps (buffer_in[n]); + const auto [x_re, x_im] = process_sample_avx (x_in, a1, a2, shared_b0, shared_b1, shared_b2, real_b1, real_b2, imag_b1, imag_b2, shared_z1, shared_z2, real_z1, real_z2, imag_z1, imag_z2); + + auto x_re_sq = _mm256_mul_ps (x_re, x_re); + auto x_im_sq = _mm256_mul_ps (x_im, x_im); + auto x_abs_sq = _mm256_add_ps (x_re_sq, x_im_sq); + + if constexpr (num_octaves_up == 1) + { + // const auto greater_than_eps = _mm256_cmpgt_ps (x_abs_sq, eps_avx); + const auto greater_than_eps = _mm256_cmp_ps (x_abs_sq, eps_avx, _CMP_GT_OQ); + const auto sqrt = _mm256_rsqrt_ps (x_abs_sq); + const auto x_abs_r = _mm256_and_ps (greater_than_eps, sqrt); + buffer_out_simd[n] = _mm256_add_ps (buffer_out_simd[n], _mm256_mul_ps (_mm256_sub_ps (x_re_sq, x_im_sq), x_abs_r)); + } + else if constexpr (num_octaves_up == 2) + { + // const auto greater_than_eps = _mm256_cmpgt_ps (x_abs_sq, eps_avx); + const auto greater_than_eps = _mm256_cmp_ps (x_abs_sq, eps_avx, _CMP_GT_OQ); + const auto x_abs_sq_r = _mm256_and_ps (greater_than_eps, _mm256_rcp_ps (x_abs_sq)); + const auto x_im_sq_x3 = _mm256_add_ps (x_im_sq, _mm256_add_ps (x_im_sq, x_im_sq)); + buffer_out_simd[n] = _mm256_add_ps (buffer_out_simd[n], _mm256_mul_ps (_mm256_sub_ps (x_re_sq, x_im_sq_x3), _mm256_mul_ps (x_re, x_abs_sq_r))); + } + } + + _mm256_store_ps (filter_bank.shared_z1.data() + k, shared_z1); + _mm256_store_ps (filter_bank.shared_z2.data() + k, shared_z2); + _mm256_store_ps (filter_bank.real_z1.data() + k, real_z1); + _mm256_store_ps (filter_bank.real_z2.data() + k, real_z2); + _mm256_store_ps (filter_bank.imag_z1.data() + k, imag_z1); + _mm256_store_ps (filter_bank.imag_z2.data() + k, imag_z2); + } + + for (int n = 0; n < num_samples; ++n) + { + buffer_out_simd[n] = _mm256_mul_ps (norm_gain_avx, buffer_out_simd[n]); + + __m256 rr = _mm256_dp_ps (buffer_out_simd[n], one_avx, 0xff); + __m256 tmp = _mm256_permute2f128_ps (rr, rr, 1); + rr = _mm256_add_ps (rr, tmp); + buffer_out[n] = _mm256_cvtss_f32 (rr); + } +#endif +} + +template void design_filter_bank (std::array, 2>&, double, double, double, double); +template void process<1, N1> (ComplexERBFilterBank&, const float*, float*, int) noexcept; +template void process<2, N1> (ComplexERBFilterBank&, const float*, float*, int) noexcept; +#if defined(__AVX__) +template void process_avx<1, N1> (ComplexERBFilterBank&, const float*, float*, int) noexcept; +template void process_avx<2, N1> (ComplexERBFilterBank&, const float*, float*, int) noexcept; +#endif +} // namespace poly_octave_v2 diff --git a/src/processors/other/poly_octave/PolyOctaveV2FilterBankImpl.h b/src/processors/other/poly_octave/PolyOctaveV2FilterBankImpl.h new file mode 100644 index 00000000..751a2770 --- /dev/null +++ b/src/processors/other/poly_octave/PolyOctaveV2FilterBankImpl.h @@ -0,0 +1,28 @@ +#pragma once + +#include "PolyOctaveV2FilterBank.h" + +namespace poly_octave_v2 +{ +// Reference for filter-bank design and octave shifting: +// https://aaltodoc.aalto.fi/server/api/core/bitstreams/ff9e52cf-fd79-45eb-b695-93038244ec0e/content + +template +void design_filter_bank (std::array, 2>& filter_bank, + double gamma, + double erb_start, + double q_ERB, + double sample_rate); + +template +void process (ComplexERBFilterBank& filter_bank, + const float* buffer_in, + float* buffer_out, + int num_samples) noexcept; + +template +void process_avx (ComplexERBFilterBank& filter_bank, + const float* buffer_in, + float* buffer_out, + int num_samples) noexcept; +} // namespace poly_octave_v2