-
-
Notifications
You must be signed in to change notification settings - Fork 856
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add ARM version of calculating mode scores #2356
Changes from 7 commits
cbeeca5
7483802
af0b7bf
7ed4c69
2f673b9
a526d84
e345857
b0bfb0a
ae7306b
344cca9
963d993
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,7 @@ | |
using System.Runtime.CompilerServices; | ||
using System.Runtime.InteropServices; | ||
using System.Runtime.Intrinsics; | ||
using System.Runtime.Intrinsics.Arm; | ||
using System.Runtime.Intrinsics.X86; | ||
|
||
// ReSharper disable InconsistentNaming | ||
|
@@ -26,6 +27,11 @@ public static int Vp8_Sse16x16(Span<byte> a, Span<byte> b) | |
return Vp8_Sse16xN_Sse2(a, b, 8); | ||
} | ||
|
||
if (AdvSimd.IsSupported) | ||
{ | ||
return Vp8_Sse16x16_Neon(a, b); | ||
} | ||
|
||
return Vp8_SseNxN(a, b, 16, 16); | ||
} | ||
|
||
|
@@ -43,6 +49,11 @@ public static int Vp8_Sse16x8(Span<byte> a, Span<byte> b) | |
return Vp8_Sse16xN_Sse2(a, b, 4); | ||
} | ||
|
||
if (AdvSimd.IsSupported) | ||
{ | ||
return Vp8_Sse16x8_Neon(a, b); | ||
} | ||
|
||
return Vp8_SseNxN(a, b, 16, 8); | ||
} | ||
|
||
|
@@ -119,6 +130,11 @@ public static int Vp8_Sse4x4(Span<byte> a, Span<byte> b) | |
return Numerics.ReduceSum(sum); | ||
} | ||
|
||
if (AdvSimd.IsSupported) | ||
{ | ||
return Vp8_Sse4x4_Neon(a, b); | ||
} | ||
|
||
return Vp8_SseNxN(a, b, 4, 4); | ||
} | ||
|
||
|
@@ -199,6 +215,91 @@ private static int Vp8_Sse16xN_Avx2(Span<byte> a, Span<byte> b, int numPairs) | |
return Numerics.ReduceSum(sum); | ||
} | ||
|
||
[MethodImpl(InliningOptions.ShortMethod)] | ||
private static int Vp8_Sse16x16_Neon(Span<byte> a, Span<byte> b) | ||
{ | ||
Vector128<uint> sum = Vector128<uint>.Zero; | ||
ref byte aRef = ref MemoryMarshal.GetReference(a); | ||
ref byte bRef = ref MemoryMarshal.GetReference(b); | ||
for (int y = 0; y < 16; y++) | ||
{ | ||
sum = AccumulateSSE16Neon( | ||
ref Unsafe.Add(ref aRef, y * WebpConstants.Bps), | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Using pointers could allow the JIT to emit better code for the address calculation. We can then increment the pointers by the row stride on each iteration instead of recomputing the offset from the loop index.
This would improve address calculation.
after: There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The generated code does indeed look a bit better with pointers. I was not aware of that. Here is a SharpLab gist: Sse16x16_NeonPointers |
||
ref Unsafe.Add(ref bRef, y * WebpConstants.Bps), | ||
sum); | ||
} | ||
|
||
return Numerics.ReduceSumArm(sum); | ||
} | ||
|
||
/// <summary>
/// Computes the sum of squared byte differences between two 16x8 blocks using AdvSimd intrinsics.
/// </summary>
/// <param name="a">The first block; consecutive rows start <c>WebpConstants.Bps</c> bytes apart.</param>
/// <param name="b">The second block, laid out like <paramref name="a"/>.</param>
/// <returns>The per-lane sums reduced to a single value by <c>Numerics.ReduceSumArm</c>.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static int Vp8_Sse16x8_Neon(Span<byte> a, Span<byte> b)
{
    const int rows = 8;
    ref byte srcA = ref MemoryMarshal.GetReference(a);
    ref byte srcB = ref MemoryMarshal.GetReference(b);
    Vector128<uint> acc = Vector128<uint>.Zero;

    // Walk both blocks row by row; each row contributes 16 bytes.
    int offset = 0;
    for (int row = 0; row < rows; row++, offset += WebpConstants.Bps)
    {
        acc = AccumulateSSE16Neon(ref Unsafe.Add(ref srcA, offset), ref Unsafe.Add(ref srcB, offset), acc);
    }

    return Numerics.ReduceSumArm(acc);
}
|
||
/// <summary>
/// Computes the sum of squared byte differences between two 4x4 blocks using AdvSimd intrinsics.
/// </summary>
/// <param name="a">The first block, read through <c>Load4x4Neon</c>.</param>
/// <param name="b">The second block, read through <c>Load4x4Neon</c>.</param>
/// <returns>The per-lane sums reduced to a single value by <c>Numerics.ReduceSumArm</c>.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static int Vp8_Sse4x4_Neon(Span<byte> a, Span<byte> b)
{
    // All 16 pixels of each block fit in one 128-bit register.
    Vector128<byte> diff = AdvSimd.AbsoluteDifference(Load4x4Neon(a).AsByte(), Load4x4Neon(b).AsByte());
    Vector64<byte> lo = diff.GetLower();
    Vector64<byte> hi = diff.GetUpper();

    // Square each half (widening to 16 bit), then add adjacent pairs (widening to 32 bit).
    Vector128<uint> loSum = AdvSimd.AddPairwiseWidening(AdvSimd.MultiplyWideningLower(lo, lo));
    Vector128<uint> hiSum = AdvSimd.AddPairwiseWidening(AdvSimd.MultiplyWideningLower(hi, hi));

    return Numerics.ReduceSumArm(AdvSimd.Add(loSum, hiSum));
}
|
||
/// <summary>
/// Gathers a 4x4 pixel block into a single <see cref="Vector128{T}"/>, one row per 32-bit lane.
/// </summary>
/// <param name="src">The source pixels; consecutive rows start <c>WebpConstants.Bps</c> bytes apart.</param>
/// <returns>A vector whose lane <c>i</c> holds the 4 bytes of row <c>i</c>.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static unsafe Vector128<uint> Load4x4Neon(Span<byte> src)
{
    fixed (byte* rowPtr = &MemoryMarshal.GetReference(src))
    {
        // The lane index has to be a constant, so the four loads stay unrolled.
        Vector128<uint> pixels = AdvSimd.LoadAndInsertScalar(Vector128<uint>.Zero, 0, (uint*)rowPtr);
        pixels = AdvSimd.LoadAndInsertScalar(pixels, 1, (uint*)(rowPtr + WebpConstants.Bps));
        pixels = AdvSimd.LoadAndInsertScalar(pixels, 2, (uint*)(rowPtr + (WebpConstants.Bps * 2)));
        pixels = AdvSimd.LoadAndInsertScalar(pixels, 3, (uint*)(rowPtr + (WebpConstants.Bps * 3)));
        return pixels;
    }
}
|
||
/// <summary>
/// Adds the squared differences of 16 bytes read from <paramref name="aRef"/> and
/// <paramref name="bRef"/> to a running per-lane sum.
/// </summary>
/// <param name="aRef">Reference to the first of 16 consecutive bytes of the first input.</param>
/// <param name="bRef">Reference to the first of 16 consecutive bytes of the second input.</param>
/// <param name="sum">The running sum to accumulate into.</param>
/// <returns><paramref name="sum"/> plus the pairwise-added squared differences, as four 32-bit lanes.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static Vector128<uint> AccumulateSSE16Neon(ref byte aRef, ref byte bRef, Vector128<uint> sum)
{
    // The byte references carry no 16-byte alignment guarantee, so read them with
    // ReadUnaligned rather than reinterpreting the reference as a Vector128<byte>.
    Vector128<byte> a0 = Unsafe.ReadUnaligned<Vector128<byte>>(ref aRef);
    Vector128<byte> b0 = Unsafe.ReadUnaligned<Vector128<byte>>(ref bRef);

    // |a - b| fits in a byte and its square (at most 255 * 255) fits in a ushort.
    Vector128<byte> absDiff = AdvSimd.AbsoluteDifference(a0, b0);
    Vector64<byte> absDiffLower = absDiff.GetLower();
    Vector64<byte> absDiffUpper = absDiff.GetUpper();
    Vector128<ushort> prod1 = AdvSimd.MultiplyWideningLower(absDiffLower, absDiffLower);
    Vector128<ushort> prod2 = AdvSimd.MultiplyWideningLower(absDiffUpper, absDiffUpper);

    // Pair-wise adds and widen to 32 bit before accumulating.
    Vector128<uint> sum1 = AdvSimd.AddPairwiseWidening(prod1);
    Vector128<uint> sum2 = AdvSimd.AddPairwiseWidening(prod2);
    return AdvSimd.Add(sum, AdvSimd.Add(sum1, sum2));
}
|
||
[MethodImpl(InliningOptions.ShortMethod)] | ||
private static Vector128<int> SubtractAndAccumulate(Vector128<byte> a, Vector128<byte> b) | ||
{ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You can use
Vector128<T>.sum()
instead of this method. In general, try using the Vector128/Vector256 APIs wherever possible. This would improve the portability of the code and let it benefit from improvements to the API itself. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The
ReduceSum
can also be refactored out. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We cannot get rid of
ReduceSum
yet, because we target net6.0
and Vector128<T>.sum
was introduced with net7.0.
I am using
Vector128<T>.sum
for >= net7.0:
b0bfb0a. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure, makes sense 👍