From de262c9047dd740867b998254c4bbb6020506a0b Mon Sep 17 00:00:00 2001
From: cobalt-github-releaser-bot <95661244+cobalt-github-releaser-bot@users.noreply.github.com>
Date: Sat, 27 Jan 2024 09:22:44 -0800
Subject: [PATCH] Cherry pick PR #2300: [media] Enable NEON optimization for WSOLA algorithm (#2311)

Refer to the original PR: https://github.com/youtube/cobalt/pull/2300

From https://codereview.chromium.org/2527533002, the NEON intrinsic
speeds up MultiChannelDotProduct() by ~9x on ARM.

b/315159208

Co-authored-by: Bo-Rong Chen
---
 starboard/build/config/BUILD.gn                      | 3 +++
 .../shared/starboard/player/filter/wsola_internal.cc | 8 +++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/starboard/build/config/BUILD.gn b/starboard/build/config/BUILD.gn
index 6a0290a7a328c..5fd66dcb2e65e 100644
--- a/starboard/build/config/BUILD.gn
+++ b/starboard/build/config/BUILD.gn
@@ -183,6 +183,9 @@ config("native_target_build") {
 config("starboard_implementation") {
   # This allows the benchmarks to include internal only header files.
   defines = [ "STARBOARD_IMPLEMENTATION" ]
+  if (current_cpu == "arm64" || (current_cpu == "arm" && arm_use_neon)) {
+    defines += [ "USE_NEON" ]
+  }
 }
 
 config("speed") {
diff --git a/starboard/shared/starboard/player/filter/wsola_internal.cc b/starboard/shared/starboard/player/filter/wsola_internal.cc
index deb29289c813f..ad4e1bf9243c3 100644
--- a/starboard/shared/starboard/player/filter/wsola_internal.cc
+++ b/starboard/shared/starboard/player/filter/wsola_internal.cc
@@ -33,11 +33,13 @@
 #include "starboard/common/scoped_ptr.h"
 #include "starboard/memory.h"
 
-// TODO: Detect Neon on ARM platform and enable SIMD.
 #if SB_IS(ARCH_X86) || SB_IS(ARCH_X64)
 #define USE_SIMD 1
 #include <xmmintrin.h>
-#endif  // SB_IS(ARCH_X86) || SB_IS(ARCH_X64)
+#elif (SB_IS(ARCH_ARM) || SB_IS(ARCH_ARM64)) && defined(USE_NEON)
+#define USE_SIMD 1
+#include <arm_neon.h>
+#endif
 
 namespace starboard {
 namespace shared {
@@ -111,7 +113,7 @@ void MultiChannelDotProduct(const scoped_refptr<DecodedAudio>& a,
     // Reduce to a single float for this channel.
     float32x2_t m_half = vadd_f32(vget_high_f32(m_sum), vget_low_f32(m_sum));
     dot_product[ch] = vget_lane_f32(vpadd_f32(m_half, m_half), 0);
-#endif  // SB_IS(ARCH_X86) || SB_IS(ARCH_X64)
+#endif
   }
 
   if (!rem) {