Fix the attention bug caused by upgrading vLLM (#555)
ajtejankar authored Jul 26, 2024
1 parent 5cefe6e commit 2e81331
Showing 1 changed file with 3 additions and 1 deletion.
server/lorax_server/utils/paged_attention.py (4 changes: 3 additions & 1 deletion)
@@ -34,7 +34,7 @@ def reshape_and_cache(
     if SYSTEM == "xpu":
         ipex.llm.modules.PagedAttention.reshape_and_cache(key, value, key_cache, value_cache, slots)
     else:
-        torch.ops._C_cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slots, "fp8" if fp8_supported else "auto", 1.0)
+        torch.ops._C_cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slots, "fp8" if fp8_supported else "auto", 1.0, 1.0)


 def attention(
@@ -108,6 +108,7 @@ def attention(
             None,
             "fp8" if fp8_supported else "auto",
             1.0,
+            1.0
         )
     else:
         # Run PagedAttention V2.
@@ -141,4 +141,5 @@ def attention(
             None,
             "fp8" if fp8_supported else "auto",
             1.0,
+            1.0
         )
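
The change itself is small: each call into vLLM's custom ops now passes one more 1.0 after the KV-cache dtype argument. A plausible reading (not stated in the commit message) is that the upgraded vLLM expects separate per-tensor scales for the key cache and the value cache instead of a single scale, with 1.0 meaning no FP8 rescaling. The sketch below mirrors the first hunk under that assumption; the helper name store_kv_cache is hypothetical, while the op and argument order follow the diff above.

# Minimal sketch of the updated cache-write call, assuming the upgraded vLLM
# expects separate key/value scales after the cache dtype argument.
import torch

def store_kv_cache(key, value, key_cache, value_cache, slots, fp8_supported: bool):
    kv_cache_dtype = "fp8" if fp8_supported else "auto"
    k_scale = 1.0  # assumed: per-tensor scale for the key cache (identity when not fp8)
    v_scale = 1.0  # assumed: the extra trailing argument this commit adds
    torch.ops._C_cache_ops.reshape_and_cache(
        key, value, key_cache, value_cache, slots, kv_cache_dtype, k_scale, v_scale
    )

Because both scales are 1.0, the kernels behave exactly as before the upgrade; only the call signature changes, so no caller of attention() or reshape_and_cache() needs to be touched.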
