How to take into account memory access #2513
-
I wrote a very simple benchmark to compare whether it makes sense to compute the powers of ten on the fly or to precompute them. Here is the code:

using System;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Running;
namespace Benchmark;
/// <summary>
/// Console host for the benchmark suite; delegates everything to BenchmarkDotNet.
/// </summary>
public static class Program
{
    /// <summary>
    /// Entry point of the benchmark. The returned summary is discarded
    /// because BenchmarkDotNet already prints it to the console.
    /// </summary>
    public static void Main() => _ = BenchmarkRunner.Run<BenchmarkTest>();
}
/// <summary>
/// Benchmark class for a comparison of current code and a proposed patch.
/// </summary>
/// <seealso href="https://benchmarkdotnet.org/articles/guides/"/>
public class BenchmarkTest
{
// Lookup table of negative powers of ten: precomputed[i] == 10^-i as an exact
// decimal literal. Used by PrBranch instead of Math.Pow, avoiding the
// double-to-decimal conversion on every iteration. 19 fractional digits is
// well within decimal's 28-29 significant-digit range.
private static readonly decimal[] precomputed = new decimal[]
{
1m,
0.1m,
0.01m,
0.001m,
0.0001m,
0.00001m,
0.000001m,
0.0000001m,
0.00000001m,
0.000000001m,
0.0000000001m,
0.00000000001m,
0.000000000001m,
0.0000000000001m,
0.00000000000001m,
0.000000000000001m,
0.0000000000000001m,
0.00000000000000001m,
0.000000000000000001m,
0.0000000000000000001m,
};
/// <summary>
/// Executed once before any benchmark iteration runs.
/// Intentionally empty: the lookup table is initialized by its static field
/// initializer, so there is nothing to set up here. Kept as a placeholder.
/// </summary>
[GlobalSetup]
public void GlobalSetup()
{
}
/// <summary>
/// Baseline (current master): computes each negative power of ten on the fly
/// with <see cref="Math.Pow"/> and converts the double result to decimal.
/// Sums 10^0 .. 10^-4.
/// </summary>
[Benchmark]
public decimal MasterBranch()
{
    decimal total = 0m;
    int exponent = 0;
    while (exponent < 5)
    {
        total += (decimal)Math.Pow(10, -exponent);
        exponent++;
    }
    return total;
}
/// <summary>
/// Proposed patch: reads each power of ten from the precomputed lookup table
/// instead of calling Math.Pow. Sums the same five values as MasterBranch.
/// </summary>
[Benchmark]
public decimal PrBranch()
{
    decimal total = 0m;
    for (int i = 0; i != 5; ++i)
    {
        total += precomputed[i];
    }
    return total;
}
}

Now the results are:
But now the feedback I got is that the benchmark does not take memory access into account. Do you agree with that feedback? Could anyone give me a tip on how to improve the benchmark a bit? Thank you!
Beta Was this translation helpful? Give feedback.
Replies: 1 comment 3 replies
-
The feedback is correct. It is a difficult thing to measure the speed of cache vs ram in C#. I gave it a try and got these results:
Code
/// <summary>
/// Benchmark comparing Math.Pow computation against a cache-hot lookup table
/// and a set of table copies spread out in memory (intended to force RAM reads).
/// </summary>
public class BenchmarkTest
{
// Number of table entries each benchmark actually sums per invocation.
const int CountToMeasure = 5;
// Single small lookup table of negative powers of ten; small enough that it
// is expected to stay resident in the CPU cache.
private static readonly decimal[] precomputedCache = new decimal[]
{
1m,
0.1m,
0.01m,
0.001m,
0.0001m,
0.00001m,
0.000001m,
0.0000001m,
0.00000001m,
0.000000001m,
0.0000000001m,
0.00000000001m,
0.000000000001m,
0.0000000000001m,
0.00000000000001m,
0.000000000000001m,
0.0000000000000001m,
0.00000000000000001m,
0.000000000000000001m,
0.0000000000000000001m,
};
// Many spaced-out copies of the table, sized to overflow the combined CPU
// caches so reads hopefully come from main memory. NOTE(review): static
// initializer order matters - this must stay declared before ramMod below.
private static decimal[][] precomputedRAM = GetPrecomputedRAM();
private static List<byte[]> spacers;
// Builds enough copies of precomputedCache to exceed the L1+L2+L3 sizes
// reported for logical processor 0, interleaving a filler allocation between
// copies, and keeps the fillers alive in 'spacers' so the GC cannot collect
// the gaps away.
private static decimal[][] GetPrecomputedRAM()
{
Processor.GetPerCoreCacheSizes(out var l1, out var l2, out var l3);
var totalCacheSize = l1 + l2 + l3;
// NOTE(review): comment and code disagreed here - decimal is 16 bytes but
// the code budgets 128 bytes per element. Either an intentional
// over-estimate (harmless: it only makes copyCount larger) or a leftover;
// confirm which was intended.
var precomputedSize = (precomputedCache.Length * 128) + IntPtr.Size * 3;
// We copy the array as many times as needed to fill the cache.
int copyCount = 0;
for (int cacheFillSize = 0; cacheFillSize < totalCacheSize; cacheFillSize += precomputedSize)
{
++copyCount;
}
// + CountToMeasure more to be sure that we're pulling from RAM and not cache and that we have a minimum size.
copyCount += CountToMeasure;
var array = new decimal[copyCount][];
spacers = new List<byte[]>();
for (int i = 0; i < copyCount; ++i)
{
// Allocate a spacer between copies so prefetching one copy does not pull
// in its neighbours. NOTE(review): 64*64 = 4096 bytes is a page size, not
// a cache-line size (typically 64 bytes) - presumably the page size was
// meant; confirm.
spacers.Add(new byte[64*64]);
// Copy the cache-resident table into a fresh, separately allocated array.
array[i] = precomputedCache.ToArray();
}
return array;
}
// Rotating index into precomputedRAM so successive iterations touch a
// different (hopefully cache-cold) copy each time.
int currentIndex = 0;
// NOTE(review): correct only because static fields initialize in declaration
// order - precomputedRAM above is already populated when this runs.
static readonly int ramMod = precomputedRAM.Length;
/// <summary>
/// Baseline: recomputes each power of ten with Math.Pow. The index
/// arithmetic is dead weight added only to match the work done in Ram().
/// </summary>
[Benchmark]
public decimal Calculate()
{
decimal r = 0;
for (int decimals = 0; decimals < CountToMeasure; decimals++)
{
r += (decimal) Math.Pow(10, -decimals);
// We do the index calculations to match the extra work of Ram.
++currentIndex;
currentIndex %= ramMod;
}
return r;
}
/// <summary>
/// Reads from the single table, which is expected to be cache-resident.
/// </summary>
[Benchmark]
public decimal Cache()
{
decimal r = 0;
for (int decimals = 0; decimals < CountToMeasure; decimals++)
{
r += precomputedCache[decimals];
// We do the index calculations to match the extra work of Ram.
++currentIndex;
currentIndex %= ramMod;
}
return r;
}
/// <summary>
/// Reads from a different table copy on every iteration, hoping each access
/// misses the cache and is served from RAM.
/// </summary>
[Benchmark]
public decimal Ram()
{
decimal r = 0;
for (int decimals = 0; decimals < CountToMeasure; decimals++)
{
r += precomputedRAM[currentIndex][decimals];
++currentIndex;
currentIndex %= ramMod;
}
return r;
}
}
class Processor
{
// Win32: returns the id of the calling thread. NOTE(review): not referenced
// anywhere in this file - presumably left over from thread-pinning experiments.
[DllImport("kernel32.dll")]
public static extern int GetCurrentThreadId();
// Left commented out by the original author; would report which core the
// calling thread is currently scheduled on.
//[DllImport("kernel32.dll")]
//public static extern int GetCurrentProcessorNumber();
// Managed mirror of the Win32 GROUP_AFFINITY struct used by
// SetThreadGroupAffinity. The layout/packing must match the native struct
// exactly; do not reorder or retype the fields.
[StructLayout(LayoutKind.Sequential, Pack = 4)]
private struct GROUP_AFFINITY
{
public UIntPtr Mask;
[MarshalAs(UnmanagedType.U2)]
public ushort Group;
[MarshalAs(UnmanagedType.ByValArray, SizeConst = 3, ArraySubType = UnmanagedType.U2)]
public ushort[] Reserved;
}
// Win32: restricts a thread to the processors named in GroupAffinity.
// NOTE(review): unused in this file - presumably intended for pinning the
// benchmark thread to core 0 so the per-core cache sizes below apply; confirm.
[DllImport("kernel32", SetLastError = true)]
private static extern Boolean SetThreadGroupAffinity(IntPtr hThread, ref GROUP_AFFINITY GroupAffinity, ref GROUP_AFFINITY PreviousGroupAffinity);
// Union member used when Relationship == RelationProcessorCore.
[StructLayout(LayoutKind.Sequential)]
public struct PROCESSORCORE
{
public byte Flags;
};
// Union member used when Relationship == RelationNumaNode.
[StructLayout(LayoutKind.Sequential)]
public struct NUMANODE
{
public uint NodeNumber;
}
// Managed mirror of the Win32 PROCESSOR_CACHE_TYPE enumeration.
public enum PROCESSOR_CACHE_TYPE
{
CacheUnified,
CacheInstruction,
CacheData,
CacheTrace
}
// Managed mirror of the Win32 CACHE_DESCRIPTOR struct: describes one cache
// (level, line size, total size in bytes, type).
[StructLayout(LayoutKind.Sequential)]
public struct CACHE_DESCRIPTOR
{
public byte Level;
public byte Associativity;
public ushort LineSize;
public uint Size;
public PROCESSOR_CACHE_TYPE Type;
}
// C# rendering of the native union: every member is marshaled at offset 0,
// and the two Reserved fields force the union to its full 16-byte native
// size. Do not change the offsets.
[StructLayout(LayoutKind.Explicit)]
public struct SYSTEM_LOGICAL_PROCESSOR_INFORMATION_UNION
{
[FieldOffset(0)]
public PROCESSORCORE ProcessorCore;
[FieldOffset(0)]
public NUMANODE NumaNode;
[FieldOffset(0)]
public CACHE_DESCRIPTOR Cache;
[FieldOffset(0)]
private UInt64 Reserved1;
[FieldOffset(8)]
private UInt64 Reserved2;
}
// Managed mirror of LOGICAL_PROCESSOR_RELATIONSHIP: identifies which union
// member of a SYSTEM_LOGICAL_PROCESSOR_INFORMATION record is meaningful.
public enum LOGICAL_PROCESSOR_RELATIONSHIP
{
RelationProcessorCore,
RelationNumaNode,
RelationCache,
RelationProcessorPackage,
RelationGroup,
RelationAll = 0xffff
}
// One record returned by GetLogicalProcessorInformation. Fields are written
// only by unmanaged marshaling, hence the suppressed "never assigned" warning.
public struct SYSTEM_LOGICAL_PROCESSOR_INFORMATION
{
#pragma warning disable 0649
public UIntPtr ProcessorMask;
public LOGICAL_PROCESSOR_RELATIONSHIP Relationship;
public SYSTEM_LOGICAL_PROCESSOR_INFORMATION_UNION ProcessorInformation;
#pragma warning restore 0649
}
// Win32: fills Buffer with SYSTEM_LOGICAL_PROCESSOR_INFORMATION records.
// When Buffer is too small (including the IntPtr.Zero probe call), it sets
// ReturnLength to the required size and fails with ERROR_INSUFFICIENT_BUFFER.
[DllImport(@"kernel32.dll", SetLastError = true)]
public static extern bool GetLogicalProcessorInformation(IntPtr Buffer, ref uint ReturnLength);
// Win32 error code expected from the size-probe call above.
private const int ERROR_INSUFFICIENT_BUFFER = 122;
// Backing cache for the LogicalProcessorInformation property; populated once.
private static SYSTEM_LOGICAL_PROCESSOR_INFORMATION[] _logicalProcessorInformation = null;
/// <summary>
/// Cached array of processor-topology records obtained from the Win32
/// GetLogicalProcessorInformation API using the standard two-call pattern:
/// probe with a null buffer to learn the required size, then allocate an
/// unmanaged buffer and marshal each record out of it.
/// Returns an empty array (never null) when the API fails, so callers can
/// enumerate the result unconditionally.
/// </summary>
public static SYSTEM_LOGICAL_PROCESSOR_INFORMATION[] LogicalProcessorInformation
{
    get
    {
        if (_logicalProcessorInformation != null)
            return _logicalProcessorInformation;
        uint ReturnLength = 0;
        // Probe call: fails by design and reports the needed buffer size.
        GetLogicalProcessorInformation(IntPtr.Zero, ref ReturnLength);
        if (Marshal.GetLastWin32Error() == ERROR_INSUFFICIENT_BUFFER)
        {
            IntPtr Ptr = Marshal.AllocHGlobal((int) ReturnLength);
            try
            {
                if (GetLogicalProcessorInformation(Ptr, ref ReturnLength))
                {
                    int size = Marshal.SizeOf(typeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION));
                    int len = (int) ReturnLength / size;
                    _logicalProcessorInformation = new SYSTEM_LOGICAL_PROCESSOR_INFORMATION[len];
                    IntPtr Item = Ptr;
                    for (int i = 0; i < len; i++)
                    {
                        _logicalProcessorInformation[i] = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION) Marshal.PtrToStructure(Item, typeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION));
                        Item += size;
                    }
                    return _logicalProcessorInformation;
                }
            }
            finally
            {
                // Always release the unmanaged buffer, even if marshaling throws.
                Marshal.FreeHGlobal(Ptr);
            }
        }
        // Fix: the original returned null here, which made callers that
        // enumerate the result (e.g. GetPerCoreCacheSizes) throw a
        // NullReferenceException whenever the API failed.
        return Array.Empty<SYSTEM_LOGICAL_PROCESSOR_INFORMATION>();
    }
}
/// <summary>
/// Sums the sizes (in bytes) of the L1, L2 and L3 caches that are attached
/// to logical processor 0 (processor-mask bit 0), as reported by
/// <see cref="LogicalProcessorInformation"/>. Cache levels other than 1-3
/// are ignored.
/// </summary>
/// <param name="L1">Receives the total L1 cache size for processor 0.</param>
/// <param name="L2">Receives the total L2 cache size for processor 0.</param>
/// <param name="L3">Receives the total L3 cache size for processor 0.</param>
public static void GetPerCoreCacheSizes(out Int64 L1, out Int64 L2, out Int64 L3)
{
    L1 = L2 = L3 = 0;
    foreach (var entry in Processor.LogicalProcessorInformation)
    {
        // Only cache-descriptor records are of interest.
        if (entry.Relationship != Processor.LOGICAL_PROCESSOR_RELATIONSHIP.RelationCache)
            continue;
        // Skip caches that do not serve logical processor 0.
        if (((Int64) entry.ProcessorMask & 1L) == 0)
            continue;
        var cache = entry.ProcessorInformation.Cache;
        if (cache.Level == 1)
            L1 += cache.Size;
        else if (cache.Level == 2)
            L2 += cache.Size;
        else if (cache.Level == 3)
            L3 += cache.Size;
    }
}
}

Credit for the processor-information lookup code goes to its original author. And honestly, I don't even trust my own results, because it's common knowledge that RAM is 10-100x slower than cache.
Beta Was this translation helpful? Give feedback.
Take 3 (I told you it's difficult!)
Code