From ce4de7d607b6c3b52996083035bcb6680c5c5d4e Mon Sep 17 00:00:00 2001
From: Martin Evans
Date: Wed, 13 Mar 2024 00:33:16 +0000
Subject: [PATCH] llama_decode lock (#595)

* Added a lock object into `SafeLlamaModelHandle` which all calls to
  `llama_decode` (in the `SafeLLamaContextHandle`) lock first. This prevents
  two contexts from running inference on the same model at the same time,
  which seems to be unsafe in llama.cpp.

* Modified the lock to be global over _all_ inferences. This seems to be
  necessary (at least with the CUDA backend).
---
 LLama/Native/SafeLLamaContextHandle.cs | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs
index 6d244998e..2f881fa5d 100644
--- a/LLama/Native/SafeLLamaContextHandle.cs
+++ b/LLama/Native/SafeLLamaContextHandle.cs
@@ -192,6 +192,18 @@ public uint TokenToSpan(LLamaToken token, Span<byte> dest)
     #endregion
 
     #region infer
+    /// <summary>
+    /// This object exists to ensure there is only ever 1 inference running at a time. This is a workaround for thread safety issues in llama.cpp itself.
+    /// Most notably CUDA, which seems to use some global singleton resources and will crash if multiple inferences are run (even against different models).
+    ///
+    /// For more information see these issues:
+    ///  - https://github.com/SciSharp/LLamaSharp/issues/596
+    ///  - https://github.com/ggerganov/llama.cpp/issues/3960
+    ///
+    /// If these are ever resolved this lock can probably be removed.
+    /// </summary>
+    private static readonly object GlobalInferenceLock = new();
+
     /// <summary>
     /// </summary>
     /// <param name="batch"></param>
@@ -202,8 +214,9 @@ public uint TokenToSpan(LLamaToken token, Span<byte> dest)
     /// </returns>
     public DecodeResult Decode(LLamaBatch batch)
     {
-        using (batch.ToNativeBatch(out var nb))
-            return (DecodeResult)NativeApi.llama_decode(this, nb);
+        lock (GlobalInferenceLock)
+            using (batch.ToNativeBatch(out var nb))
+                return (DecodeResult)NativeApi.llama_decode(this, nb);
     }
 
     ///
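
For reference, below is a minimal, self-contained sketch of the pattern this patch applies: every call into a native routine that is not safe to run concurrently is funnelled through a single process-wide lock object. This is not LLamaSharp code; `GlobalInference`, `FakeNativeDecode` and the `Program` harness are illustrative names, and only the shape of the `lock` around the native call mirrors the patch.

```csharp
// Sketch of serialising all "inference" calls through one static lock,
// assuming the native routine (here FakeNativeDecode, standing in for
// NativeApi.llama_decode) must never be entered by two threads at once.
using System;
using System.Threading;
using System.Threading.Tasks;

internal static class GlobalInference
{
    // One lock shared by every context, mirroring GlobalInferenceLock in the patch.
    private static readonly object InferenceLock = new();

    private static int _active; // number of callers currently inside the native call

    public static int Decode(int contextId, int batchSize)
    {
        // Only one Decode call (from any context) may reach the native routine at a time.
        lock (InferenceLock)
        {
            return FakeNativeDecode(contextId, batchSize);
        }
    }

    // Stand-in for the native call; throws if it is ever entered concurrently.
    private static int FakeNativeDecode(int contextId, int batchSize)
    {
        if (Interlocked.Increment(ref _active) != 1)
            throw new InvalidOperationException("Concurrent native decode detected!");

        Thread.Sleep(10); // simulate work inside the native library

        Interlocked.Decrement(ref _active);
        return 0; // 0 == success, matching the llama_decode convention
    }
}

internal static class Program
{
    private static void Main()
    {
        // Several "contexts" decoding in parallel; the lock keeps the native calls from overlapping.
        var tasks = new Task[8];
        for (var i = 0; i < tasks.Length; i++)
        {
            var contextId = i;
            tasks[i] = Task.Run(() =>
            {
                for (var j = 0; j < 20; j++)
                    GlobalInference.Decode(contextId, batchSize: 32);
            });
        }

        Task.WaitAll(tasks);
        Console.WriteLine("All decodes completed without overlapping native calls.");
    }
}
```

A static lock like this trades throughput for safety: all contexts are serialised, even ones backed by different models, which matches the second bullet of the commit message (a per-model lock was not sufficient, at least with the CUDA backend, per the issues referenced in the doc comment).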