@@ -15,17 +15,106 @@ namespace fbgemm_gpu {
 
 #if CUDART_VERSION >= 12000
 
-// FP8 Tensorwise grouped cutlass kernel dispatch.
+// BF16 grouped cutlass kernel dispatch.
 template <typename InputType>
 at::Tensor dispatch_bf16_grouped_kernel(
+    int G,
     int total_M,
+    int N,
+    int K,
     InputType X, // BF16
     InputType W, // BF16
     at::Tensor output,
     std::optional<at::Tensor> zero_start_index_M = std::nullopt,
     std::optional<at::Tensor> M_sizes = std::nullopt) {
   // Use heuristics to pick best kernel implementation.
 
+  // Llama4 128E
+  if (G == 128) {
+    if (N == 5120 && K == 1024) {
+      if (total_M <= 128) {
+        return bf16bf16bf16_grouped_128_16_128_2_1_1_f(
+            X, W, output, zero_start_index_M, M_sizes);
+      } else if (total_M <= 256) {
+        return bf16bf16bf16_grouped_128_32_128_2_1_1_t(
+            X, W, output, zero_start_index_M, M_sizes);
+      } else if (total_M <= 2048) {
+        return bf16bf16bf16_grouped_128_16_128_2_1_1_f(
+            X, W, output, zero_start_index_M, M_sizes);
+      } else if (total_M <= 4096) {
+        return bf16bf16bf16_grouped_128_32_128_2_1_1_f(
+            X, W, output, zero_start_index_M, M_sizes);
+      } else if (total_M <= 8192) {
+        return bf16bf16bf16_grouped_128_64_128_1_1_1_f(
+            X, W, output, zero_start_index_M, M_sizes);
+      } else if (total_M <= 16384) {
+        return bf16bf16bf16_grouped_128_128_128_2_1_1_t(
+            X, W, output, zero_start_index_M, M_sizes);
+      } else {
+        return bf16bf16bf16_grouped_128_256_128_2_1_1_f(
+            X, W, output, zero_start_index_M, M_sizes);
+      }
+    }
+
+    if (N == 2048 && K == 5120) {
+      if (total_M <= 2048) {
+        return bf16bf16bf16_grouped_128_16_128_2_1_1_f(
+            X, W, output, zero_start_index_M, M_sizes);
+      } else {
+        return bf16bf16bf16_grouped_128_128_128_2_1_1_t(
+            X, W, output, zero_start_index_M, M_sizes);
+      }
+    }
+  }
+
+  // Llama4 64E
+  if (G == 16) {
+    if (N == 5120 && K == 1024) {
+      if (total_M <= 32) {
+        return bf16bf16bf16_grouped_128_16_128_2_1_1_f(
+            X, W, output, zero_start_index_M, M_sizes);
+      } else if (total_M <= 64) {
+        return bf16bf16bf16_grouped_128_32_128_2_1_1_t(
+            X, W, output, zero_start_index_M, M_sizes);
+      } else if (total_M <= 256) {
+        return bf16bf16bf16_grouped_128_16_128_2_1_1_f(
+            X, W, output, zero_start_index_M, M_sizes);
+      } else if (total_M <= 512) {
+        return bf16bf16bf16_grouped_128_32_128_2_1_1_t(
+            X, W, output, zero_start_index_M, M_sizes);
+      } else if (total_M <= 1024) {
+        return bf16bf16bf16_grouped_128_64_128_2_1_1_t(
+            X, W, output, zero_start_index_M, M_sizes);
+      } else {
+        return bf16bf16bf16_grouped_128_256_128_2_1_1_f(
+            X, W, output, zero_start_index_M, M_sizes);
+      }
+    }
+
+    if (N == 2048 && K == 5120) {
+      if (total_M <= 16) {
+        return bf16bf16bf16_grouped_128_16_128_2_1_1_f(
+            X, W, output, zero_start_index_M, M_sizes);
+      } else if (total_M <= 64) {
+        return bf16bf16bf16_grouped_128_32_128_2_1_1_f(
+            X, W, output, zero_start_index_M, M_sizes);
+      } else if (total_M <= 256) {
+        return bf16bf16bf16_grouped_128_16_128_2_1_1_f(
+            X, W, output, zero_start_index_M, M_sizes);
+      } else if (total_M <= 512) {
+        return bf16bf16bf16_grouped_128_32_128_2_1_1_f(
+            X, W, output, zero_start_index_M, M_sizes);
+      } else if (total_M <= 1024) {
+        return bf16bf16bf16_grouped_128_64_128_1_1_1_f(
+            X, W, output, zero_start_index_M, M_sizes);
+      } else {
+        return bf16bf16bf16_grouped_128_128_128_2_1_1_t(
+            X, W, output, zero_start_index_M, M_sizes);
+      }
+    }
+  }
+
+  // Fallback to legacy heuristic for now.
   if (total_M <= 16) {
     return bf16bf16bf16_grouped_128_16_128_1_1_1_f(
         X, W, output, zero_start_index_M, M_sizes);
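The new routing is pure shape arithmetic, so it is easy to sanity-check on the host. The standalone sketch below is not part of the patch: the helper name select_bf16_grouped_tile is made up for illustration, and it restates only the (G == 128, N == 5120, K == 1024) ladder from the hunk above, returning the name of the kernel instantiation the dispatcher would pick; every other shape falls through to the legacy total_M-only heuristic.

#include <iostream>
#include <string>

// Illustrative restatement of one branch of the new dispatch heuristic.
// The returned strings name the kernel instantiations called in the diff.
std::string select_bf16_grouped_tile(int G, int total_M, int N, int K) {
  if (G == 128 && N == 5120 && K == 1024) {
    if (total_M <= 128)   return "bf16bf16bf16_grouped_128_16_128_2_1_1_f";
    if (total_M <= 256)   return "bf16bf16bf16_grouped_128_32_128_2_1_1_t";
    if (total_M <= 2048)  return "bf16bf16bf16_grouped_128_16_128_2_1_1_f";
    if (total_M <= 4096)  return "bf16bf16bf16_grouped_128_32_128_2_1_1_f";
    if (total_M <= 8192)  return "bf16bf16bf16_grouped_128_64_128_1_1_1_f";
    if (total_M <= 16384) return "bf16bf16bf16_grouped_128_128_128_2_1_1_t";
    return "bf16bf16bf16_grouped_128_256_128_2_1_1_f";
  }
  // Any other (G, N, K) keeps the previous total_M-only selection.
  return "legacy heuristic";
}

int main() {
  // total_M = 64     -> bf16bf16bf16_grouped_128_16_128_2_1_1_f
  std::cout << select_bf16_grouped_tile(128, 64, 5120, 1024) << "\n";
  // total_M = 32768  -> bf16bf16bf16_grouped_128_256_128_2_1_1_f
  std::cout << select_bf16_grouped_tile(128, 32768, 5120, 1024) << "\n";
  return 0;
}

Keying the choice on (G, N, K) as well as total_M lets the Llama4 expert shapes use tuned tile configurations while every other workload keeps the previous behaviour, as the "Fallback to legacy heuristic" comment notes.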
@@ -52,13 +141,18 @@ OutputType _bf16bf16bf16_grouped(at::TensorList X, at::TensorList W) {
   at::Tensor Y;
   int64_t total_M = 0;
   int64_t G = X.size();
+  int64_t max_N = 0;
+  int64_t max_K = 0;
 
   // Allocate output tensor.
   std::vector<int64_t> output_sizes;
   int64_t total_output_size = 0;
   for (int i = 0; i < G; ++i) {
     int64_t M = X[i].size(0);
     int64_t N = W[i].size(0);
+    int64_t K = W[i].size(1);
+    max_N = std::max(max_N, N);
+    max_K = std::max(max_K, K);
     total_M += M;
     const int64_t output_size = M * N;
     total_output_size += output_size;
@@ -67,8 +161,8 @@ OutputType _bf16bf16bf16_grouped(at::TensorList X, at::TensorList W) {
   Y = at::empty(total_output_size, X[0].options().dtype(at::kBFloat16));
 
   // Run kernel.
-  at::Tensor g_out =
-      dispatch_bf16_grouped_kernel<at::TensorList>(total_M, X, W, Y);
+  at::Tensor g_out = dispatch_bf16_grouped_kernel<at::TensorList>(
+      G, total_M, max_N, max_K, X, W, Y);
 
   // Return appropriate output type.
   if constexpr (std::is_same_v<OutputType, at::Tensor>) {
@@ -98,6 +192,7 @@ at::Tensor
 bf16bf16bf16_grouped_stacked(at::Tensor X, at::Tensor W, at::Tensor M_sizes) {
   int64_t total_M = X.size(0);
   int64_t N = W.size(1);
+  int64_t K = W.size(2);
   int64_t G = M_sizes.size(0);
   TORCH_CHECK(
       M_sizes.device() == X.device(),
@@ -111,7 +206,7 @@ bf16bf16bf16_grouped_stacked(at::Tensor X, at::Tensor W, at::Tensor M_sizes) {
   }
   // Return continuous view of output.
   at::Tensor out = dispatch_bf16_grouped_kernel<at::Tensor>(
-      total_M, X, W, Y, std::nullopt, M_sizes);
+      G, total_M, N, K, X, W, Y, std::nullopt, M_sizes);
   return out.view({total_M, N});
 }
 
@@ -125,13 +220,14 @@ at::Tensor bf16bf16bf16_grouped_dynamic(
   int64_t G = X.size(0);
   int64_t M = X.size(1);
   int64_t N = W.size(1);
+  int64_t K = W.size(2);
   int64_t total_output_size = G * M * N;
   at::Tensor Y;
   Y = at::zeros(total_output_size, X.options().dtype(at::kBFloat16));
 
   // Return continuous view of output.
   at::Tensor output = dispatch_bf16_grouped_kernel<at::Tensor>(
-      G * M, X, W, Y, zero_start_index_M);
+      G, G * M, N, K, X, W, Y, zero_start_index_M);
   // View as proper shape.
   return output.view({G, M, N});
 }
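For completeness, here is a caller-side sketch of the stacked entry point under the shape conventions implied by this diff (X is [total_M, K], W is [G, N, K], M_sizes holds one row count per group). Only the function declaration is taken from the patch; the use of int64 group sizes on the same CUDA device as X and the specific Llama4-like sizes are assumptions for illustration.

#include <ATen/ATen.h>

namespace fbgemm_gpu {
// Declaration as it appears in this patch.
at::Tensor bf16bf16bf16_grouped_stacked(at::Tensor X, at::Tensor W, at::Tensor M_sizes);
} // namespace fbgemm_gpu

int main() {
  const int64_t G = 128, N = 2048, K = 5120;
  // Assumed: per-group row counts, int64, on the same device as X.
  at::Tensor M_sizes = at::full(
      {G}, /*fill_value=*/16, at::dtype(at::kLong).device(at::kCUDA));
  const int64_t total_M = M_sizes.sum().item<int64_t>(); // 128 * 16 = 2048

  at::Tensor X =
      at::randn({total_M, K}, at::dtype(at::kBFloat16).device(at::kCUDA));
  at::Tensor W =
      at::randn({G, N, K}, at::dtype(at::kBFloat16).device(at::kCUDA));

  // With these shapes the dispatcher sees G=128, N=2048, K=5120, total_M=2048
  // and should take the bf16bf16bf16_grouped_128_16_128_2_1_1_f branch.
  at::Tensor Y = fbgemm_gpu::bf16bf16bf16_grouped_stacked(X, W, M_sizes);
  return (Y.size(0) == total_M && Y.size(1) == N) ? 0 : 1;
}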