[Kernel] Turn off cutlass scaled_mm for Lovelace

vllm-project · Jul 12, 2024 · b9eaa67
1 parent 55f692b
commit b9eaa67
Showing 1 changed file with 8 additions and 2 deletions.
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
@@ -38,7 +38,13 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) {
   if (cuda_device_capability >= 90) {
     return CUDA_VERSION >= 12000;
   } else if (cuda_device_capability >= 89) {
-    return CUDA_VERSION >= 12040;
+    // The CUTLASS kernels have not been tuned for Ada Lovelace (capability 89)
+    // and are slower than torch.mm there, so report FP8 as unsupported.
+    return false;
+
+    // TODO: Once the CUTLASS kernels have been optimized for Lovelace systems,
+    // replace the unconditional `return false` above with the following check:
+    // return CUDA_VERSION >= 12040;
   }
 #endif
 
@@ -98,4 +104,4 @@ void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
     TORCH_CHECK(version_num >= 75);
     cutlass_scaled_mm_sm75(c, a, b, a_scales, b_scales, bias);
   }
-}
+}