-
Notifications
You must be signed in to change notification settings - Fork 364
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add benchmark test for prefill.
- Loading branch information
1 parent
c07c4cc
commit 413c23c
Showing
6 changed files
with
5,549 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
using System; | ||
using System.Collections.Generic; | ||
using System.Linq; | ||
using System.Text; | ||
using System.Threading.Tasks; | ||
|
||
namespace LLama.Benchmark
{
    /// <summary>
    /// Selects which LLamaSharp executor implementation a benchmark run exercises.
    /// </summary>
    public enum ExecutorType
    {
        /// <summary>Corresponds to <c>InteractiveExecutor</c>.</summary>
        Interactive = 0,

        /// <summary>Corresponds to <c>InstructExecutor</c>.</summary>
        Instruct = 1,

        /// <summary>Corresponds to <c>StatelessExecutor</c>.</summary>
        Stateless = 2
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
using System; | ||
using System.Collections.Generic; | ||
using System.Linq; | ||
using System.Runtime.InteropServices; | ||
using System.Text; | ||
using System.Threading.Tasks; | ||
|
||
namespace LLama.Benchmark
{
    /// <summary>
    /// Central location for the on-disk asset paths used by the benchmark suite.
    /// All paths are relative to the benchmark's working directory.
    /// </summary>
    internal static class Constants
    {
        /// <summary>Quantized 7B chat model used for text-generation benchmarks.</summary>
        public static readonly string Generative7BModelPath = "Models/llama-2-7b-chat.Q3_K_S.gguf";

        /// <summary>Quantized model used for embedding benchmarks.</summary>
        public static readonly string EmbeddingModelPath = "Models/all-MiniLM-L12-v2.Q8_0.gguf";

        /// <summary>Quantized LLaVA multimodal model weights.</summary>
        public static readonly string LLavaModelPath = "Models/llava-v1.6-mistral-7b.Q3_K_XS.gguf";

        /// <summary>Multimodal projection file that accompanies the LLaVA model.</summary>
        public static readonly string LLavaMmpPath = "Models/mmproj-model-f16.gguf";

        /// <summary>Sample image fed to LLaVA benchmarks.</summary>
        public static readonly string LLavaImage = "Models/extreme-ironing-taxi-610x427.jpg";

        /// <summary>Text file whose contents are sliced into completion prompts.</summary>
        public static readonly string TextCompletionPromptsFilePath = "Assets/TextCompletionPrompts.txt";
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
#pragma warning disable CS8618 | ||
|
||
using System; | ||
using System.Collections.Generic; | ||
using System.Linq; | ||
using System.Text; | ||
using System.Threading.Tasks; | ||
using BenchmarkDotNet.Attributes; | ||
using BenchmarkDotNet.Engines; | ||
using BenchmarkDotNet.Jobs; | ||
using LLama.Abstractions; | ||
using LLama.Common; | ||
using Microsoft.VisualBasic; | ||
|
||
namespace LLama.Benchmark.LLamaExecutorBenchmark
{
    /// <summary>
    /// Measures the cost of prompt processing (prefill) for the LLamaSharp executors.
    /// Generation is suppressed by setting <c>MaxTokens = 1</c>, so the measured time
    /// is dominated by evaluating the prompt tokens.
    /// </summary>
    [BenchmarkCategory("LLamaExecutor")]
    [SimpleJob(RunStrategy.Monitoring, runtimeMoniker: RuntimeMoniker.Net80)]
    [MinWarmupCount(1)]
    [MaxWarmupCount(2)]
    [MinIterationCount(1)]
    [MaxIterationCount(16)]
    public class PrefillBenchmark
    {
        /// <summary>
        /// (prompt length, context length)
        /// </summary>
        public IEnumerable<(int, uint)> PromptAndContextLengths => new (int, uint)[]
        {
            (512, 2048),
            (2024, 2048)
        };

        /// <summary>
        /// (model path, gpu layer count)
        /// </summary>
        public IEnumerable<(string, int)> ModelAndGpuLayerCounts => new (string, int)[]
        // TODO: specify the native library to load here to test cpu case better.
        {
            (Constants.Generative7BModelPath, 0),
            (Constants.Generative7BModelPath, 10),
            (Constants.Generative7BModelPath, 20)
        };

        public IEnumerable<ExecutorType> ExecutorTypes => new ExecutorType[]
        {
            ExecutorType.Interactive,
            ExecutorType.Stateless
        };

        [ParamsSource(nameof(PromptAndContextLengths))]
        public (int, uint) PromptAndContextLength { get; set; }

        [ParamsSource(nameof(ModelAndGpuLayerCounts))]
        public (string, int) ModelAndGpuLayerCount { get; set; }

        [ParamsSource(nameof(ExecutorTypes))]
        public ExecutorType ExecutorType { get; set; }

        /// <summary>
        /// Params used to create a model.
        /// </summary>
        public ModelParams ModelParams { get; set; }

        /// <summary>
        /// Params used in inference.
        /// </summary>
        public InferenceParams InferenceParams { get; set; }

        /// <summary>
        /// Prompt used to run text generation.
        /// </summary>
        public string Prompt { get; set; }

        public ILLamaExecutor Executor { get; set; }

        // Kept so the loaded native resources can be released in DisposeModel();
        // previously the weights (and an unused context in the stateless case)
        // leaked between benchmark parameter combinations.
        private LLamaWeights _weights;
        private LLamaContext? _context;

        private void InitializeParamsAndModel()
        {
            ModelParams = new ModelParams(ModelAndGpuLayerCount.Item1)
            {
                ContextSize = PromptAndContextLength.Item2,
                GpuLayerCount = ModelAndGpuLayerCount.Item2
            };
            // Prompt length is measured in characters here, not tokens.
            Prompt = File.ReadAllText(Constants.TextCompletionPromptsFilePath).Substring(0, PromptAndContextLength.Item1);
            InferenceParams = new InferenceParams()
            {
                Temperature = 0.6f,
                MaxTokens = 1 // Only prefill, no generation here.
            };

            _weights = LLamaWeights.LoadFromFile(ModelParams);
            if (ExecutorType == ExecutorType.Stateless)
            {
                // The stateless executor manages its own context internally;
                // creating one here would leak native memory.
                _context = null;
                Executor = new StatelessExecutor(_weights, ModelParams);
            }
            else
            {
                _context = _weights.CreateContext(ModelParams);
                Executor = ExecutorType switch
                {
                    ExecutorType.Interactive => new InteractiveExecutor(_context),
                    ExecutorType.Instruct => new InstructExecutor(_context),
                    _ => throw new NotSupportedException()
                };
            }
        }

        [GlobalSetup(Targets = [nameof(Basic)])]
        public void GlobalSetup()
        {
            InitializeParamsAndModel();
        }

        [IterationCleanup(Targets = [nameof(Basic)])]
        public void GlobalCleanup()
        {
            if(ExecutorType != ExecutorType.Stateless) // stateless executor always dispose its `Context` property
            {
                Executor.Context.NativeHandle.KvCacheClear();
            }
        }

        /// <summary>
        /// Releases the model weights and context after all iterations for a
        /// parameter combination have run, so successive combinations do not
        /// accumulate loaded models.
        /// </summary>
        [GlobalCleanup(Targets = [nameof(Basic)])]
        public void DisposeModel()
        {
            _context?.Dispose();
            _weights?.Dispose();
        }

        [Benchmark]
        public async Task<string> Basic()
        {
            StringBuilder sb = new();
            await foreach(var text in Executor.InferAsync(Prompt, InferenceParams))
            {
                sb.Append(text);
            }
            return sb.ToString();
        }
    }
}