From f2def6755e966ed0c0d28f2d3b8b57c7277b311f Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith
Date: Sun, 14 Jul 2024 09:37:19 -0400
Subject: [PATCH] [Kernel] Turn off CUTLASS scaled_mm for Ada Lovelace (#6384)

---
 ...eta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml |  8 ++++----
 .../configs/Meta-Llama-3-8B-Instruct-FP8.yaml          |  6 +++---
 csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu      | 10 ++++++++--
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
index e40f42a17c18f..374171f1f915b 100644
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
@@ -1,11 +1,11 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 250 -f 5 -t 1
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
 tasks:
 - name: "gsm8k"
   metrics:
   - name: "exact_match,strict-match"
-    value: 0.752
+    value: 0.755
   - name: "exact_match,flexible-extract"
-    value: 0.752
-limit: 250
+    value: 0.755
+limit: 1000
 num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
index 7a89e8e0c76f2..dc36b705634f9 100644
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
@@ -4,8 +4,8 @@ tasks:
 - name: "gsm8k"
   metrics:
   - name: "exact_match,strict-match"
-    value: 0.756
+    value: 0.753
   - name: "exact_match,flexible-extract"
-    value: 0.752
-limit: 250
+    value: 0.753
+limit: 1000
 num_fewshot: 5
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
index 81bf2d62d8f42..605166930ccc6 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
@@ -38,7 +38,13 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) {
   if (cuda_device_capability >= 90) {
     return CUDA_VERSION >= 12000;
   } else if (cuda_device_capability >= 89) {
-    return CUDA_VERSION >= 12040;
+    // CUTLASS Kernels have not been tuned for Ada Lovelace systems
+    // and are slower than torch.mm. Return false unconditionally in this case.
+    return false;
+
+    // Once the CUTLASS kernels have been optimized for Lovelace systems,
+    // use the following check:
+    // return CUDA_VERSION >= 12040;
   }
 #endif

@@ -98,4 +104,4 @@ void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
     TORCH_CHECK(version_num >= 75);
     cutlass_scaled_mm_sm75(c, a, b, a_scales, b_scales, bias);
   }
-}
\ No newline at end of file
+}
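
Below is a minimal, standalone sketch (not part of the patch) of the capability gate as it behaves after this change: Hopper (SM 9.0+) still requires CUDA 12.0+, while Ada Lovelace (SM 8.9) now reports unsupported, so vLLM falls back to the torch.mm-based FP8 path. The helper name supports_cutlass_fp8 and the main() driver are illustrative only; the rest uses the standard CUDA runtime API.

// Hypothetical standalone check mirroring cutlass_scaled_mm_supports_fp8
// after this patch; compile with nvcc, e.g. `nvcc -o check check.cu`.
#include <cuda.h>          // CUDA_VERSION macro
#include <cuda_runtime.h>  // cudaGetDeviceProperties
#include <cstdint>
#include <cstdio>

static bool supports_cutlass_fp8(int64_t capability) {
  if (capability >= 90) {
    return CUDA_VERSION >= 12000;  // Hopper: needs CUDA 12.0+
  } else if (capability >= 89) {
    // Ada Lovelace: disabled by this patch because the CUTLASS kernels
    // are currently slower than torch.mm on these GPUs.
    return false;
  }
  return false;  // older architectures never take the CUTLASS FP8 path
}

int main() {
  cudaDeviceProp prop;
  if (cudaGetDeviceProperties(&prop, /*device=*/0) != cudaSuccess) {
    std::printf("no CUDA device found\n");
    return 1;
  }
  int64_t capability = prop.major * 10 + prop.minor;  // e.g. 89 for SM 8.9
  std::printf("SM %d.%d -> CUTLASS FP8 scaled_mm supported: %s\n", prop.major,
              prop.minor, supports_cutlass_fp8(capability) ? "yes" : "no");
  return 0;
}

On an SM 8.9 GPU (e.g. L4 or RTX 4090) this reports "no" after the patch, whereas previously the answer depended on CUDA_VERSION >= 12040.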