Merge branch 'master' into Development
SignalRT committed Oct 31, 2023
2 parents a62c54c + 5ed3b2f commit 8b49a84
Showing 43 changed files with 670 additions and 242 deletions.
24 changes: 12 additions & 12 deletions .github/workflows/compile.yml
@@ -40,7 +40,7 @@ jobs:
mkdir build
cd build
cmake .. ${{ matrix.defines }}
cmake --build . --config Release -j $(nproc)
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
- uses: actions/upload-artifact@v3
with:
path: ./build/libllama.so
@@ -53,13 +53,13 @@ jobs:
matrix:
include:
- build: 'noavx'
defines: '-DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_SERVER=OFF -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=OFF -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
- build: 'avx2'
defines: '-DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_SERVER=OFF -DBUILD_SHARED_LIBS=ON'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=OFF -DBUILD_SHARED_LIBS=ON'
- build: 'avx'
defines: '-DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_SERVER=OFF -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=OFF -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
- build: 'avx512'
defines: '-DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_SERVER=OFF -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=OFF -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
runs-on: windows-latest
steps:
- uses: actions/checkout@v3
@@ -118,7 +118,7 @@ jobs:
mkdir build
cd build
cmake .. -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_SERVER=OFF
cmake --build . --config Release -j4
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
ls -R
- name: Upload artifacts (Windows)
@@ -158,7 +158,7 @@ jobs:
mkdir build
cd build
cmake .. ${{ matrix.defines }}
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
- name: Upload artifacts
uses: actions/upload-artifact@v3
with:
@@ -189,13 +189,13 @@ jobs:
ls -R
mkdir deps
mkdir deps/avx
mkdir deps/avx2
mkdir deps/avx512
cp artifacts/llama-bin-linux-noavx-x64.so/libllama.so deps/libllama.so
mkdir deps/avx
cp artifacts/llama-bin-linux-avx-x64.so/libllama.so deps/avx/libllama.so
mkdir deps/avx2
cp artifacts/llama-bin-linux-avx2-x64.so/libllama.so deps/avx2/libllama.so
mkdir deps/avx512
cp artifacts/llama-bin-linux-avx512-x64.so/libllama.so deps/avx512/libllama.so
cp artifacts/llama-bin-win-noavx-x64.dll/llama.dll deps/libllama.dll
@@ -210,15 +210,15 @@ jobs:
cp artifacts/llama-bin-macos-metal.dylib/libllama.dylib deps/macos-metal/libllama.dylib
cp artifacts/ggml-metal.metal/ggml-metal.metal deps/macos-metal/ggml-metal.metal
- name: Rearrange CUDA files
if: ${{ github.event.inputs.cublas }}
run: |
mkdir cuda_deps
mkdir cuda_deps/cu11.7.1
mkdir cuda_deps/cu12.1.0
cp artifacts/llama-bin-win-cublas-cu11.7.1-x64.dll/llama.dll cuda_deps/cu11.7.1/libllama.dll
cp artifacts/llama-bin-linux-cublas-cu11.7.1-x64.so/libllama.so cuda_deps/cu11.7.1/libllama.so
mkdir cuda_deps/cu12.1.0
cp artifacts/llama-bin-win-cublas-cu12.1.0-x64.dll/llama.dll cuda_deps/cu12.1.0/libllama.dll
cp artifacts/llama-bin-linux-cublas-cu12.1.0-x64.so/libllama.so cuda_deps/cu12.1.0/libllama.so
15 changes: 6 additions & 9 deletions .github/workflows/main.yml
@@ -35,17 +35,14 @@ jobs:
key: "unit_test_models"
path: LLama.Unittest/Models
# workaround for actions/setup-dotnet#155
- name: Clear package cache
run: dotnet clean LLamaSharp.sln && dotnet nuget locals all --clear
- name: Restore packages
run: dotnet restore LLamaSharp.sln
- name: Build
run: dotnet build LLamaSharp.sln -c ${{ matrix.config }} --no-restore
- name: Test
run: dotnet test LLamaSharp.sln -c ${{ matrix.config }} -l "console;verbosity=detailed" --diag:logs/log.txt
- name: Upload artifacts
if: always()
run: dotnet test -c ${{ matrix.config }} --logger trx --results-directory "TestResults-${{ matrix.dotnet-version }}"
- name: Upload dotnet test results
uses: actions/upload-artifact@v3
with:
path: logs/
name: logs
name: dotnet-results-${{ matrix.dotnet-version }}
path: TestResults-${{ matrix.dotnet-version }}


1 change: 1 addition & 0 deletions LLama.Examples/LLama.Examples.csproj
@@ -8,6 +8,7 @@
<Platforms>AnyCPU;x64</Platforms>
<!-- Set IncludeBuiltInRuntimes to false to include your own runtime libraries and not link the defaults -->
<IncludeBuiltInRuntimes>true</IncludeBuiltInRuntimes>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
</PropertyGroup>

<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|AnyCPU'">
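The new AllowUnsafeBlocks property is presumably needed because the BatchedDecoding example added later in this diff writes to native logit buffers through raw pointers. A minimal sketch of the kind of code this setting enables, mirroring the unsafe block in BatchedDecoding.cs below:

    unsafe
    {
        // Writing to the native logits buffer through a raw pointer is what
        // requires <AllowUnsafeBlocks>true</AllowUnsafeBlocks> in the project file.
        batch.NativeBatch.logits[batch.NativeBatch.n_tokens - 1] = 1;
    }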
177 changes: 177 additions & 0 deletions LLama.Examples/NewVersion/BatchedDecoding.cs
@@ -0,0 +1,177 @@
using System.Diagnostics;
using System.Security.Cryptography;
using System.Text;
using LLama.Common;
using LLama.Native;

namespace LLama.Examples.NewVersion;

/// <summary>
/// This demonstrates generating multiple replies to the same prompt, with a shared cache
/// </summary>
/// <remarks>Note that this is currently using the low level API directly, future work will provide a safer C# wrapper over this!</remarks>
public class BatchedDecoding
{
private const int n_parallel = 8;
private const int n_len = 32;

private const int top_k = 80;
private const float top_p = 0.8f;
private const float temp = 0.5f;

public static async Task Run()
{
Console.Write("Please input your model path: ");
var modelPath = Console.ReadLine();

Console.WriteLine("Prompt (leave blank to select automatically):");
var prompt = Console.ReadLine();
if (string.IsNullOrWhiteSpace(prompt))
prompt = "Not many people know that";

// Load model
var parameters = new ModelParams(modelPath);
using var model = LLamaWeights.LoadFromFile(parameters);

// Tokenize prompt
var prompt_tokens = model.NativeHandle.Tokenize(prompt, true, false, Encoding.UTF8);
var n_kv_req = prompt_tokens.Length + (n_len - prompt_tokens.Length) * n_parallel;

// Create a context
parameters.ContextSize = (uint)model.ContextSize;
parameters.BatchSize = (uint)Math.Max(n_len, n_parallel);
using var context = model.CreateContext(parameters);

var n_ctx = context.ContextSize;

// make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx)
{
await Console.Error.WriteLineAsync($"error: n_kv_req ({n_kv_req}) > n_ctx, the required KV cache size is not big enough\n");
await Console.Error.WriteLineAsync(" either reduce n_parallel or increase n_ctx\n");
return;
}

using var batch = LLamaBatchSafeHandle.Create(Math.Max(prompt_tokens.Length, n_parallel), 0, 1);

// evaluate the initial prompt
for (var i = 0; i < prompt_tokens.Length; i++)
batch.LLamaBatchAdd(prompt_tokens[i], i, new[] { (LLamaSeqId)0 }, false);
Debug.Assert(batch.NativeBatch.n_tokens == prompt_tokens.Length);

// llama_decode will output logits only for the last token of the prompt
unsafe
{
batch.NativeBatch.logits[batch.NativeBatch.n_tokens - 1] = 1;
}

if (context.NativeHandle.Decode(batch) != 0)
{
await Console.Error.WriteLineAsync("llama_decode failed");
return;
}

// assign the system KV cache to all parallel sequences
// this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
for (var i = 1; i < n_parallel; ++i)
{
NativeApi.llama_kv_cache_seq_cp(context.NativeHandle, (LLamaSeqId)0, (LLamaSeqId)i, 0, batch.NativeBatch.n_tokens);
}

if (n_parallel > 1)
{
Console.WriteLine();
Console.WriteLine($"generating {n_parallel} sequences...");
}

// remember the batch index of the last token for each parallel sequence
// we need this to determine which logits to sample from
List<int> i_batch = new();
for (var i = 0; i < n_parallel; i++)
i_batch.Add(batch.NativeBatch.n_tokens - 1);

var n_cur = batch.NativeBatch.n_tokens;
var n_decode = 0;

var streams = new List<int>[n_parallel];
for (var i = 0; i < n_parallel; i++)
streams[i] = new();

var eos = model.EndOfSentenceToken;
var nl = model.NewlineToken;

var timer = new Stopwatch();
timer.Start();
while (n_cur <= n_len)
{
batch.LLamaBatchClear();

for (var i = 0; i < n_parallel; i++)
{
// Skip completed streams
if (i_batch[i] < 0)
continue;

var n_vocab = model.VocabCount;
LLamaTokenDataArray candidates;
unsafe
{
candidates = LLamaTokenDataArray.Create(new Span<float>(NativeApi.llama_get_logits_ith(context.NativeHandle, i_batch[i]), n_vocab));
}

candidates.TopK(context.NativeHandle, top_k);
candidates.TopP(context.NativeHandle, top_p);
candidates.Temperature(context.NativeHandle, temp);
var new_token_id = candidates.SampleToken(context.NativeHandle);

if (new_token_id == eos || new_token_id == nl)
{
i_batch[i] = -1;
Console.WriteLine($"Completed Stream {i} early");
continue;
}

streams[i].Add(new_token_id);

i_batch[i] = batch.NativeBatch.n_tokens;

// push this new token for next evaluation
batch.LLamaBatchAdd(new_token_id, n_cur, new[] { (LLamaSeqId)i }, true);

n_decode++;
}

// all streams are finished
if (batch.NativeBatch.n_tokens == 0)
{
break;
}

n_cur++;

// evaluate the current batch with the transformer model
if (context.NativeHandle.Decode(batch) != 0)
{
await Console.Error.WriteLineAsync("failed to eval");
return;
}
}

timer.Stop();
Console.ForegroundColor = ConsoleColor.Yellow;
Console.WriteLine();
Console.WriteLine($"Decoded {n_decode} tokens in {timer.ElapsedMilliseconds}ms");
Console.WriteLine($"Rate: {n_decode / timer.Elapsed.TotalSeconds:##.000} tokens/second");

var index = 0;
foreach (var stream in streams)
{
var text = context.DeTokenize(stream);

Console.ForegroundColor = ConsoleColor.Green;
Console.Write($"{index++}. {prompt}");
Console.ForegroundColor = ConsoleColor.Red;
Console.WriteLine(text);
}
}
}
5 changes: 1 addition & 4 deletions LLama.Examples/NewVersion/SemanticKernelChat.cs
@@ -14,10 +14,7 @@ public static async Task Run()
var modelPath = Console.ReadLine();

// Load weights into memory
var parameters = new ModelParams(modelPath)
{
Seed = unchecked((uint)RandomNumberGenerator.GetInt32(int.MaxValue)),
};
var parameters = new ModelParams(modelPath);
using var model = LLamaWeights.LoadFromFile(parameters);
using var context = model.CreateContext(parameters);
var ex = new InteractiveExecutor(context);
5 changes: 1 addition & 4 deletions LLama.Examples/NewVersion/SemanticKernelPrompt.cs
@@ -16,10 +16,7 @@ public static async Task Run()
var modelPath = Console.ReadLine();

// Load weights into memory
var parameters = new ModelParams(modelPath)
{
Seed = unchecked((uint)RandomNumberGenerator.GetInt32(int.MaxValue))
};
var parameters = new ModelParams(modelPath);
using var model = LLamaWeights.LoadFromFile(parameters);
var ex = new StatelessExecutor(model, parameters);

5 changes: 1 addition & 4 deletions LLama.Examples/NewVersion/TalkToYourself.cs
@@ -13,10 +13,7 @@ public static async Task Run()
var modelPath = Console.ReadLine();

// Load weights into memory
var @params = new ModelParams(modelPath)
{
Seed = unchecked((uint)RandomNumberGenerator.GetInt32(int.MaxValue))
};
var @params = new ModelParams(modelPath);
using var weights = LLamaWeights.LoadFromFile(@params);

// Create 2 contexts sharing the same weights
5 changes: 5 additions & 0 deletions LLama.Examples/NewVersion/TestRunner.cs
@@ -22,6 +22,7 @@ public static async Task Run()
Console.WriteLine("12: Semantic Kernel Chat.");
Console.WriteLine("13: Semantic Kernel Memory.");
Console.WriteLine("14: Coding Assistant.");
Console.WriteLine("15: Batch Decoding.");

while (true)
{
@@ -88,6 +89,10 @@ public static async Task Run()
{
await CodingAssistant.Run();
}
else if (choice == 15)
{
await BatchedDecoding.Run();
}
else
{
Console.WriteLine("Cannot parse your choice. Please select again.");
8 changes: 8 additions & 0 deletions LLama.Unittest/StatelessExecutorTest.cs
@@ -1,3 +1,4 @@
using System.Diagnostics;
using LLama.Common;
using Xunit.Abstractions;

@@ -34,10 +35,17 @@ public async Task Stateless()
const string question = "Question. what is a cat?\nAnswer: ";
var @params = new InferenceParams { MaxTokens = 32, AntiPrompts = new[] { "." } };

var timer = new Stopwatch();
timer.Start();

var result1 = string.Join("", await executor.InferAsync(question, @params).ToListAsync());
var result2 = string.Join("", await executor.InferAsync(question, @params).ToListAsync());

timer.Stop();
_testOutputHelper.WriteLine($"{timer.ElapsedMilliseconds}ms");

_testOutputHelper.WriteLine(result1);
_testOutputHelper.WriteLine(result2);

// Check that it produced the exact same result both times
Assert.Equal(result1, result2);
2 changes: 1 addition & 1 deletion LLama.Web/Common/InferenceOptions.cs
@@ -23,7 +23,7 @@ public class InferenceOptions : IInferenceParams
/// <summary>
/// Sequences where the model will stop generating further tokens.
/// </summary>
public IEnumerable<string> AntiPrompts { get; set; } = Array.Empty<string>();
public IReadOnlyList<string> AntiPrompts { get; set; } = Array.Empty<string>();
/// <summary>
/// path to file for saving/loading model eval state
/// </summary>
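The AntiPrompts property narrows from IEnumerable<string> to IReadOnlyList<string>; arrays and List<string> implement both interfaces, so typical call sites keep compiling. A hypothetical usage sketch, not part of the diff:

    var options = new InferenceOptions
    {
        // string[] implements IReadOnlyList<string>, so this assignment still works
        AntiPrompts = new[] { "User:", "\n\n" }
    };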
4 changes: 2 additions & 2 deletions LLama.Web/Common/ModelOptions.cs
@@ -111,12 +111,12 @@ public class ModelOptions
/// <summary>
/// RoPE base frequency
/// </summary>
public float RopeFrequencyBase { get; set; } = 10000.0f;
public float? RopeFrequencyBase { get; set; }

/// <summary>
/// RoPE frequency scaling factor
/// </summary>
public float RopeFrequencyScale { get; set; } = 1.0f;
public float? RopeFrequencyScale { get; set; }

/// <summary>
/// Use experimental mul_mat_q kernels
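With RopeFrequencyBase and RopeFrequencyScale now nullable, leaving them unset no longer forces the old hard-coded defaults (10000.0f and 1.0f); a null value presumably defers to the RoPE parameters carried by the model file. A hypothetical sketch using only the properties shown in this hunk:

    var options = new ModelOptions();
    // null = let the backend pick up RoPE settings from the model itself,
    // instead of the previous fixed defaults of 10000.0f (base) and 1.0f (scale).
    options.RopeFrequencyBase = null;
    options.RopeFrequencyScale = null;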