From ec26662ac9f1e89b2556d81fa311f91ab3abd492 Mon Sep 17 00:00:00 2001
From: Alexandre Mutel <alexandre_mutel@live.com>
Date: Sat, 5 Nov 2022 16:05:12 +0100
Subject: [PATCH] Optimize XxHash3 on ARM platform (#77881)

* Optimize XxHash3 on ARM platform

* Extract code to MultiplyWideningLower

* Update src/libraries/System.IO.Hashing/src/System/IO/Hashing/XxHash3.cs

Co-authored-by: Stephen Toub <stoub@microsoft.com>
---
 .../src/System/IO/Hashing/XxHash3.cs          | 24 +++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/XxHash3.cs b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/XxHash3.cs
index 4e821618d36e3e..17087d87631036 100644
--- a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/XxHash3.cs
+++ b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/XxHash3.cs
@@ -10,6 +10,7 @@
 using System.Runtime.InteropServices;
 #if NET7_0_OR_GREATER
 using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
 using System.Runtime.Intrinsics.X86;
 #endif
 
@@ -896,16 +897,31 @@ private static Vector128<ulong> Accumulate128(Vector128<ulong> accVec, byte* sou
             Vector128<uint> sourceKey = sourceVec ^ secret;
 
             // TODO: Figure out how to unwind this shuffle and just use Vector128.Multiply
-            Vector128<uint> sourceKeyLow = Vector128.Shuffle(sourceKey, Vector128.Create(1u, 0, 3, 0));
             Vector128<uint> sourceSwap = Vector128.Shuffle(sourceVec, Vector128.Create(2u, 3, 0, 1));
             Vector128<ulong> sum = accVec + sourceSwap.AsUInt64();
-            Vector128<ulong> product = Sse2.IsSupported ?
-                Sse2.Multiply(sourceKey, sourceKeyLow) :
-                (sourceKey & Vector128.Create(~0u, 0u, ~0u, 0u)).AsUInt64() * (sourceKeyLow & Vector128.Create(~0u, 0u, ~0u, 0u)).AsUInt64();
 
+            Vector128<ulong> product = MultiplyWideningLower(sourceKey);
             accVec = product + sum;
             return accVec;
         }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)] // Requests inlining of this helper into its caller (Accumulate128).
+        private static Vector128<ulong> MultiplyWideningLower(Vector128<uint> source) // Multiplies each even-indexed uint of source by its odd-indexed neighbour (0*1, 2*3), yielding two ulong products.
+        {
+            if (AdvSimd.IsSupported) // Arm path: gather the operands into two 64-bit halves and use one widening multiply.
+            {
+                Vector64<uint> sourceLow = Vector128.Shuffle(source, Vector128.Create(0u, 2, 0, 0)).GetLower(); // lower half = { source[0], source[2] }
+                Vector64<uint> sourceHigh = Vector128.Shuffle(source, Vector128.Create(1u, 3, 0, 0)).GetLower(); // lower half = { source[1], source[3] }
+                return AdvSimd.MultiplyWideningLower(sourceLow, sourceHigh); // 32x32 -> 64-bit products of the paired elements
+            }
+            else
+            {
+                Vector128<uint> sourceLow = Vector128.Shuffle(source, Vector128.Create(1u, 0, 3, 0)); // { source[1], source[0], source[3], source[0] }: odd elements moved into even slots; slots 1 and 3 are ignored/masked below
+                return Sse2.IsSupported ?
+                    Sse2.Multiply(source, sourceLow) : // x86 path: multiplies elements 0 and 2 of each operand into 64-bit results
+                    (source & Vector128.Create(~0u, 0u, ~0u, 0u)).AsUInt64() * (sourceLow & Vector128.Create(~0u, 0u, ~0u, 0u)).AsUInt64(); // portable fallback: zero elements 1 and 3, then multiply as ulong lanes (NOTE(review): relies on little-endian lane layout - confirm for big-endian targets)
+            }
+        }
 #endif
 
         private static void ScrambleAccumulators(ulong* accumulators, byte* secret)