Commit
fix: conditionally enable LlamaRAMCache (#83)
lsorber authored Jan 6, 2025
1 parent b19963d commit 8f4bd5f
Showing 1 changed file with 3 additions and 1 deletion.
src/raglite/_litellm.py (4 changes: 3 additions & 1 deletion)
@@ -29,6 +29,7 @@
     CreateChatCompletionStreamResponse,
     Llama,
     LlamaRAMCache,
+    llama_supports_gpu_offload,
 )

 from raglite._chatml_function_calling import chatml_function_calling_with_streaming
@@ -126,7 +127,8 @@ def llm(model: str, **kwargs: Any) -> Llama:
         **kwargs,
     )
     # Enable caching.
-    llm.set_cache(LlamaRAMCache())
+    if llama_supports_gpu_offload() or (os.cpu_count() or 1) >= 8:  # noqa: PLR2004
+        llm.set_cache(LlamaRAMCache())
     # Register the model info with LiteLLM.
     model_info = {
         repo_id_filename: {
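For context, a minimal standalone sketch of the same gating logic, assuming llama-cpp-python is installed and a local GGUF model is available; MODEL_PATH and n_ctx below are illustrative placeholders, not values from the patch:

import os

from llama_cpp import Llama, LlamaRAMCache, llama_supports_gpu_offload

# Hypothetical path for illustration; point this at any local GGUF model file.
MODEL_PATH = "model.gguf"

llm = Llama(model_path=MODEL_PATH, n_ctx=2048)

# Only enable the RAM cache when the machine is likely fast enough to benefit:
# either GPU offload is available, or the CPU has at least 8 logical cores.
if llama_supports_gpu_offload() or (os.cpu_count() or 1) >= 8:
    llm.set_cache(LlamaRAMCache())

The `or 1` guards against `os.cpu_count()` returning `None`, and the `# noqa: PLR2004` in the patch silences Ruff's magic-value warning for the hard-coded core count.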
