Merge branch 'master' into Development
SignalRT committed Oct 31, 2023
2 parents a62c54c + 5ed3b2f commit 8b49a84
Showing 43 changed files with 670 additions and 242 deletions.
24 changes: 12 additions & 12 deletions .github/workflows/compile.yml
@@ -40,7 +40,7 @@ jobs:
mkdir build
cd build
cmake .. ${{ matrix.defines }}
cmake --build . --config Release -j $(nproc)
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
- uses: actions/upload-artifact@v3
with:
path: ./build/libllama.so
@@ -53,13 +53,13 @@ jobs:
matrix:
include:
- build: 'noavx'
defines: '-DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_SERVER=OFF -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=OFF -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
- build: 'avx2'
defines: '-DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_SERVER=OFF -DBUILD_SHARED_LIBS=ON'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=OFF -DBUILD_SHARED_LIBS=ON'
- build: 'avx'
defines: '-DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_SERVER=OFF -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=OFF -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
- build: 'avx512'
defines: '-DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_SERVER=OFF -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=OFF -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
runs-on: windows-latest
steps:
- uses: actions/checkout@v3
@@ -118,7 +118,7 @@ jobs:
mkdir build
cd build
cmake .. -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_SERVER=OFF
cmake --build . --config Release -j4
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
ls -R
- name: Upload artifacts (Windows)
@@ -158,7 +158,7 @@ jobs:
mkdir build
cd build
cmake .. ${{ matrix.defines }}
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
- name: Upload artifacts
uses: actions/upload-artifact@v3
with:
@@ -189,13 +189,13 @@ jobs:
ls -R
mkdir deps
mkdir deps/avx
mkdir deps/avx2
mkdir deps/avx512
cp artifacts/llama-bin-linux-noavx-x64.so/libllama.so deps/libllama.so
mkdir deps/avx
cp artifacts/llama-bin-linux-avx-x64.so/libllama.so deps/avx/libllama.so
mkdir deps/avx2
cp artifacts/llama-bin-linux-avx2-x64.so/libllama.so deps/avx2/libllama.so
mkdir deps/avx512
cp artifacts/llama-bin-linux-avx512-x64.so/libllama.so deps/avx512/libllama.so
cp artifacts/llama-bin-win-noavx-x64.dll/llama.dll deps/libllama.dll
@@ -210,15 +210,15 @@ jobs:
cp artifacts/llama-bin-macos-metal.dylib/libllama.dylib deps/macos-metal/libllama.dylib
cp artifacts/ggml-metal.metal/ggml-metal.metal deps/macos-metal/ggml-metal.metal
- name: Rearrange CUDA files
if: ${{ github.event.inputs.cublas }}
run: |
mkdir cuda_deps
mkdir cuda_deps/cu11.7.1
mkdir cuda_deps/cu12.1.0
cp artifacts/llama-bin-win-cublas-cu11.7.1-x64.dll/llama.dll cuda_deps/cu11.7.1/libllama.dll
cp artifacts/llama-bin-linux-cublas-cu11.7.1-x64.so/libllama.so cuda_deps/cu11.7.1/libllama.so
mkdir cuda_deps/cu12.1.0
cp artifacts/llama-bin-win-cublas-cu12.1.0-x64.dll/llama.dll cuda_deps/cu12.1.0/libllama.dll
cp artifacts/llama-bin-linux-cublas-cu12.1.0-x64.so/libllama.so cuda_deps/cu12.1.0/libllama.so
15 changes: 6 additions & 9 deletions .github/workflows/main.yml
@@ -35,17 +35,14 @@ jobs:
key: "unit_test_models"
path: LLama.Unittest/Models
# workaround for actions/setup-dotnet#155
- name: Clear package cache
run: dotnet clean LLamaSharp.sln && dotnet nuget locals all --clear
- name: Restore packages
run: dotnet restore LLamaSharp.sln
- name: Build
run: dotnet build LLamaSharp.sln -c ${{ matrix.config }} --no-restore
- name: Test
run: dotnet test LLamaSharp.sln -c ${{ matrix.config }} -l "console;verbosity=detailed" --diag:logs/log.txt
- name: Upload artifacts
if: always()
run: dotnet test -c ${{ matrix.config }} --logger trx --results-directory "TestResults-${{ matrix.dotnet-version }}"
- name: Upload dotnet test results
uses: actions/upload-artifact@v3
with:
path: logs/
name: logs
name: dotnet-results-${{ matrix.dotnet-version }}
path: TestResults-${{ matrix.dotnet-version }}


1 change: 1 addition & 0 deletions LLama.Examples/LLama.Examples.csproj
@@ -8,6 +8,7 @@
<Platforms>AnyCPU;x64</Platforms>
<!-- Set IncludeBuiltInRuntimes to false to include your own runtime libraries and not link the defaults -->
<IncludeBuiltInRuntimes>true</IncludeBuiltInRuntimes>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
</PropertyGroup>

<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|AnyCPU'">
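The new AllowUnsafeBlocks property is presumably needed because the BatchedDecoding example added later in this diff writes to native logit buffers through raw pointers. A minimal sketch of the kind of code this setting enables, mirroring the unsafe block in BatchedDecoding.cs below:

    unsafe
    {
        // Writing to the native logits buffer through a raw pointer is what
        // requires <AllowUnsafeBlocks>true</AllowUnsafeBlocks> in the project file.
        batch.NativeBatch.logits[batch.NativeBatch.n_tokens - 1] = 1;
    }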
177 changes: 177 additions & 0 deletions LLama.Examples/NewVersion/BatchedDecoding.cs
@@ -0,0 +1,177 @@
using System.Diagnostics;
using System.Security.Cryptography;
using System.Text;
using LLama.Common;
using LLama.Native;

namespace LLama.Examples.NewVersion;

/// <summary>
/// This demonstrates generating multiple replies to the same prompt, with a shared cache
/// </summary>
/// <remarks>Note that this is currently using the low level API directly, future work will provide a safer C# wrapper over this!</remarks>
public class BatchedDecoding
{
private const int n_parallel = 8;
private const int n_len = 32;

private const int top_k = 80;
private const float top_p = 0.8f;
private const float temp = 0.5f;

public static async Task Run()
{
Console.Write("Please input your model path: ");
var modelPath = Console.ReadLine();

Console.WriteLine("Prompt (leave blank to select automatically):");
var prompt = Console.ReadLine();
if (string.IsNullOrWhiteSpace(prompt))
prompt = "Not many people know that";

// Load model
var parameters = new ModelParams(modelPath);
using var model = LLamaWeights.LoadFromFile(parameters);

// Tokenize prompt
var prompt_tokens = model.NativeHandle.Tokenize(prompt, true, false, Encoding.UTF8);
var n_kv_req = prompt_tokens.Length + (n_len - prompt_tokens.Length) * n_parallel;

// Create a context
parameters.ContextSize = (uint)model.ContextSize;
parameters.BatchSize = (uint)Math.Max(n_len, n_parallel);
using var context = model.CreateContext(parameters);

var n_ctx = context.ContextSize;

// make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx)
{
await Console.Error.WriteLineAsync($"error: n_kv_req ({n_kv_req}) > n_ctx, the required KV cache size is not big enough\n");
await Console.Error.WriteLineAsync(" either reduce n_parallel or increase n_ctx\n");
return;
}

using var batch = LLamaBatchSafeHandle.Create(Math.Max(prompt_tokens.Length, n_parallel), 0, 1);

// evaluate the initial prompt
for (var i = 0; i < prompt_tokens.Length; i++)
batch.LLamaBatchAdd(prompt_tokens[i], i, new[] { (LLamaSeqId)0 }, false);
Debug.Assert(batch.NativeBatch.n_tokens == prompt_tokens.Length);

// llama_decode will output logits only for the last token of the prompt
unsafe
{
batch.NativeBatch.logits[batch.NativeBatch.n_tokens - 1] = 1;
}

if (context.NativeHandle.Decode(batch) != 0)
{
await Console.Error.WriteLineAsync("llama_decode failed");
return;
}

// assign the system KV cache to all parallel sequences
// this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
for (var i = 1; i < n_parallel; ++i)
{
NativeApi.llama_kv_cache_seq_cp(context.NativeHandle, (LLamaSeqId)0, (LLamaSeqId)i, 0, batch.NativeBatch.n_tokens);
}

if (n_parallel > 1)
{
Console.WriteLine();
Console.WriteLine($"generating {n_parallel} sequences...");
}

// remember the batch index of the last token for each parallel sequence
// we need this to determine which logits to sample from
List<int> i_batch = new();
for (var i = 0; i < n_parallel; i++)
i_batch.Add(batch.NativeBatch.n_tokens - 1);

var n_cur = batch.NativeBatch.n_tokens;
var n_decode = 0;

var streams = new List<int>[n_parallel];
for (var i = 0; i < n_parallel; i++)
streams[i] = new();

var eos = model.EndOfSentenceToken;
var nl = model.NewlineToken;

var timer = new Stopwatch();
timer.Start();
while (n_cur <= n_len)
{
batch.LLamaBatchClear();

for (var i = 0; i < n_parallel; i++)
{
// Skip completed streams
if (i_batch[i] < 0)
continue;

var n_vocab = model.VocabCount;
LLamaTokenDataArray candidates;
unsafe
{
candidates = LLamaTokenDataArray.Create(new Span<float>(NativeApi.llama_get_logits_ith(context.NativeHandle, i_batch[i]), n_vocab));
}

candidates.TopK(context.NativeHandle, top_k);
candidates.TopP(context.NativeHandle, top_p);
candidates.Temperature(context.NativeHandle, temp);
var new_token_id = candidates.SampleToken(context.NativeHandle);

if (new_token_id == eos || new_token_id == nl)
{
i_batch[i] = -1;
Console.WriteLine($"Completed Stream {i} early");
continue;
}

streams[i].Add(new_token_id);

i_batch[i] = batch.NativeBatch.n_tokens;

// push this new token for next evaluation
batch.LLamaBatchAdd(new_token_id, n_cur, new[] { (LLamaSeqId)i }, true);

n_decode++;
}

// all streams are finished
if (batch.NativeBatch.n_tokens == 0)
{
break;
}

n_cur++;

// evaluate the current batch with the transformer model
if (context.NativeHandle.Decode(batch) != 0)
{
await Console.Error.WriteLineAsync("failed to eval");
return;
}
}

timer.Stop();
Console.ForegroundColor = ConsoleColor.Yellow;
Console.WriteLine();
Console.WriteLine($"Decoded {n_decode} tokens in {timer.ElapsedMilliseconds}ms");
Console.WriteLine($"Rate: {n_decode / timer.Elapsed.TotalSeconds:##.000} tokens/second");

var index = 0;
foreach (var stream in streams)
{
var text = context.DeTokenize(stream);

Console.ForegroundColor = ConsoleColor.Green;
Console.Write($"{index++}. {prompt}");
Console.ForegroundColor = ConsoleColor.Red;
Console.WriteLine(text);
}
}
}
5 changes: 1 addition & 4 deletions LLama.Examples/NewVersion/SemanticKernelChat.cs
@@ -14,10 +14,7 @@ public static async Task Run()
var modelPath = Console.ReadLine();

// Load weights into memory
var parameters = new ModelParams(modelPath)
{
Seed = unchecked((uint)RandomNumberGenerator.GetInt32(int.MaxValue)),
};
var parameters = new ModelParams(modelPath);
using var model = LLamaWeights.LoadFromFile(parameters);
using var context = model.CreateContext(parameters);
var ex = new InteractiveExecutor(context);
5 changes: 1 addition & 4 deletions LLama.Examples/NewVersion/SemanticKernelPrompt.cs
@@ -16,10 +16,7 @@ public static async Task Run()
var modelPath = Console.ReadLine();

// Load weights into memory
var parameters = new ModelParams(modelPath)
{
Seed = unchecked((uint)RandomNumberGenerator.GetInt32(int.MaxValue))
};
var parameters = new ModelParams(modelPath);
using var model = LLamaWeights.LoadFromFile(parameters);
var ex = new StatelessExecutor(model, parameters);

5 changes: 1 addition & 4 deletions LLama.Examples/NewVersion/TalkToYourself.cs
@@ -13,10 +13,7 @@ public static async Task Run()
var modelPath = Console.ReadLine();

// Load weights into memory
var @params = new ModelParams(modelPath)
{
Seed = unchecked((uint)RandomNumberGenerator.GetInt32(int.MaxValue))
};
var @params = new ModelParams(modelPath);
using var weights = LLamaWeights.LoadFromFile(@params);

// Create 2 contexts sharing the same weights
5 changes: 5 additions & 0 deletions LLama.Examples/NewVersion/TestRunner.cs
@@ -22,6 +22,7 @@ public static async Task Run()
Console.WriteLine("12: Semantic Kernel Chat.");
Console.WriteLine("13: Semantic Kernel Memory.");
Console.WriteLine("14: Coding Assistant.");
Console.WriteLine("15: Batch Decoding.");

while (true)
{
@@ -88,6 +89,10 @@ public static async Task Run()
{
await CodingAssistant.Run();
}
else if (choice == 15)
{
await BatchedDecoding.Run();
}
else
{
Console.WriteLine("Cannot parse your choice. Please select again.");
8 changes: 8 additions & 0 deletions LLama.Unittest/StatelessExecutorTest.cs
@@ -1,3 +1,4 @@
using System.Diagnostics;
using LLama.Common;
using Xunit.Abstractions;

@@ -34,10 +35,17 @@ public async Task Stateless()
const string question = "Question. what is a cat?\nAnswer: ";
var @params = new InferenceParams { MaxTokens = 32, AntiPrompts = new[] { "." } };

var timer = new Stopwatch();
timer.Start();

var result1 = string.Join("", await executor.InferAsync(question, @params).ToListAsync());
var result2 = string.Join("", await executor.InferAsync(question, @params).ToListAsync());

timer.Stop();
_testOutputHelper.WriteLine($"{timer.ElapsedMilliseconds}ms");

_testOutputHelper.WriteLine(result1);
_testOutputHelper.WriteLine(result2);

// Check that it produced the exact same result both times
Assert.Equal(result1, result2);
2 changes: 1 addition & 1 deletion LLama.Web/Common/InferenceOptions.cs
@@ -23,7 +23,7 @@ public class InferenceOptions : IInferenceParams
/// <summary>
/// Sequences where the model will stop generating further tokens.
/// </summary>
public IEnumerable<string> AntiPrompts { get; set; } = Array.Empty<string>();
public IReadOnlyList<string> AntiPrompts { get; set; } = Array.Empty<string>();
/// <summary>
/// path to file for saving/loading model eval state
/// </summary>
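The AntiPrompts property narrows from IEnumerable<string> to IReadOnlyList<string>; arrays and List<string> implement both interfaces, so typical call sites keep compiling. A hypothetical usage sketch, not part of the diff:

    var options = new InferenceOptions
    {
        // string[] implements IReadOnlyList<string>, so this assignment still works
        AntiPrompts = new[] { "User:", "\n\n" }
    };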
4 changes: 2 additions & 2 deletions LLama.Web/Common/ModelOptions.cs
@@ -111,12 +111,12 @@ public class ModelOptions
/// <summary>
/// RoPE base frequency
/// </summary>
public float RopeFrequencyBase { get; set; } = 10000.0f;
public float? RopeFrequencyBase { get; set; }

/// <summary>
/// RoPE frequency scaling factor
/// </summary>
public float RopeFrequencyScale { get; set; } = 1.0f;
public float? RopeFrequencyScale { get; set; }

/// <summary>
/// Use experimental mul_mat_q kernels
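With RopeFrequencyBase and RopeFrequencyScale now nullable, leaving them unset no longer forces the old hard-coded defaults (10000.0f and 1.0f); a null value presumably defers to the RoPE parameters carried by the model file. A hypothetical sketch using only the properties shown in this hunk:

    var options = new ModelOptions();
    // null = let the backend pick up RoPE settings from the model itself,
    // instead of the previous fixed defaults of 10000.0f (base) and 1.0f (scale).
    options.RopeFrequencyBase = null;
    options.RopeFrequencyScale = null;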