Prefix Caching - fix T4 Triton error (vllm-project#2517)
caoshiyi authored and jimpang committed Feb 20, 2024
1 parent b475897 commit 120b2fd
Showing 1 changed file with 3 additions and 1 deletion.
vllm/model_executor/layers/triton_kernel/prefix_prefill.py (3 additions, 1 deletion)
@@ -618,7 +618,9 @@ def context_attention_fwd(q,
                           b_ctx_len,
                           max_input_len,
                           alibi_slopes=None):
-    BLOCK = 128
+
+    cap = torch.cuda.get_device_capability()
+    BLOCK = 128 if cap[0] >= 8 else 64
     # shape constraints
     Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
     assert Lq == Lk and Lk == Lv
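
For readers unfamiliar with the check, here is a minimal standalone sketch of the same logic (pick_block_size is a hypothetical helper for illustration, not part of vLLM). torch.cuda.get_device_capability() returns a (major, minor) tuple: (7, 5) on a T4 (Turing), (8, 0) and up on Ampere and newer. A plausible reason for the fix is that pre-Ampere GPUs have less shared memory per SM, so the Triton kernel's 128-wide block fails to launch on a T4; halving the block size on such devices avoids the error.

import torch

def pick_block_size(default: int = 128, fallback: int = 64) -> int:
    # Hypothetical helper mirroring the commit's logic: use the full 128-wide
    # block on compute capability >= 8 (Ampere and newer), fall back to 64 on
    # older GPUs such as the T4 (compute capability 7.5).
    major, _minor = torch.cuda.get_device_capability()
    return default if major >= 8 else fallback

if torch.cuda.is_available():
    print(torch.cuda.get_device_capability())  # e.g. (7, 5) on a T4
    print(pick_block_size())                   # 64 on a T4, 128 on an A100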
