From de262c9047dd740867b998254c4bbb6020506a0b Mon Sep 17 00:00:00 2001
From: cobalt-github-releaser-bot
 <95661244+cobalt-github-releaser-bot@users.noreply.github.com>
Date: Sat, 27 Jan 2024 09:22:44 -0800
Subject: [PATCH] Cherry pick PR #2300: [media] Enable NEON optimization for
 WSOLA algorithm (#2311)

Refer to the original PR: https://github.com/youtube/cobalt/pull/2300

According to https://codereview.chromium.org/2527533002, NEON intrinsics
speed up MultiChannelDotProduct() by ~9x on ARM.

b/315159208

Co-authored-by: Bo-Rong Chen <borongchen@google.com>
---
 starboard/build/config/BUILD.gn                           | 3 +++
 .../shared/starboard/player/filter/wsola_internal.cc      | 8 +++++---
 2 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/starboard/build/config/BUILD.gn b/starboard/build/config/BUILD.gn
index 6a0290a7a328c..5fd66dcb2e65e 100644
--- a/starboard/build/config/BUILD.gn
+++ b/starboard/build/config/BUILD.gn
@@ -183,6 +183,9 @@ config("native_target_build") {
 config("starboard_implementation") {
   # This allows the benchmarks to include internal only header files.
   defines = [ "STARBOARD_IMPLEMENTATION" ]
+  if (current_cpu == "arm64" || (current_cpu == "arm" && arm_use_neon)) {
+    defines += [ "USE_NEON" ]
+  }
 }
 
 config("speed") {
diff --git a/starboard/shared/starboard/player/filter/wsola_internal.cc b/starboard/shared/starboard/player/filter/wsola_internal.cc
index deb29289c813f..ad4e1bf9243c3 100644
--- a/starboard/shared/starboard/player/filter/wsola_internal.cc
+++ b/starboard/shared/starboard/player/filter/wsola_internal.cc
@@ -33,11 +33,13 @@
 #include "starboard/common/scoped_ptr.h"
 #include "starboard/memory.h"
 
-// TODO: Detect Neon on ARM platform and enable SIMD.
 #if SB_IS(ARCH_X86) || SB_IS(ARCH_X64)
 #define USE_SIMD 1
 #include <xmmintrin.h>
-#endif  // SB_IS(ARCH_X86) || SB_IS(ARCH_X64)
+#elif (SB_IS(ARCH_ARM) || SB_IS(ARCH_ARM64)) && defined(USE_NEON)
+#define USE_SIMD 1
+#include <arm_neon.h>
+#endif
 
 namespace starboard {
 namespace shared {
@@ -111,7 +113,7 @@ void MultiChannelDotProduct(const scoped_refptr<DecodedAudio>& a,
     // Reduce to a single float for this channel.
     float32x2_t m_half = vadd_f32(vget_high_f32(m_sum), vget_low_f32(m_sum));
     dot_product[ch] = vget_lane_f32(vpadd_f32(m_half, m_half), 0);
-#endif  // SB_IS(ARCH_X86) || SB_IS(ARCH_X64)
+#endif
   }
 
   if (!rem) {