
Commit 9932686

Chris Thi authored and facebook-github-bot committed
Refactor Cutlass BF16 Grouped GEMM (#4124)
Summary: Pull Request resolved: #4124

X-link: facebookresearch/FBGEMM#1205

We plan to make some changes to the kernel heuristics to improve performance on this kernel. Do a quick refactor first to parallelize kernel compilation, similar to [cutlass FP8 rowwise](https://www.internalfb.com/code/fbsource/fbcode/deeplearning/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_rowwise/), to keep the next diffs smaller. No functional changes in this diff.

Reviewed By: jianyuh

Differential Revision: D74760416

fbshipit-source-id: 138fbc8b62e6d22ed60448e79050c4d1ebd470aa
1 parent 52f07e7 commit 9932686

10 files changed: +877, -478 lines

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/bf16bf16bf16_grouped.cu

Lines changed: 8 additions & 478 deletions
Large diffs are not rendered by default.
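The refactored dispatch logic in this file is not rendered above. As a rough illustration only, a heuristic that forwards to the per-configuration entry points added in this diff might look like the sketch below; the function name dispatch_bf16bf16bf16_grouped and the thresholds are hypothetical, and the real selection logic may differ:

// Purely illustrative dispatcher sketch; not the actual code in
// bf16bf16bf16_grouped.cu, whose diff is not rendered here.
static at::Tensor dispatch_bf16bf16bf16_grouped(
    int64_t total_M, // total rows across all groups (hypothetical heuristic input)
    at::Tensor X,
    at::Tensor W,
    at::Tensor output,
    std::optional<at::Tensor> zero_start_index_M,
    std::optional<at::Tensor> M_sizes) {
  // Smaller problems favor smaller tile configurations; larger ones favor
  // larger tiles. Thresholds here are placeholders, not the tuned heuristic.
  if (total_M <= 16) {
    return bf16bf16bf16_grouped_128_16_128_1_1_1_f(
        X, W, output, zero_start_index_M, M_sizes);
  } else if (total_M <= 64) {
    return bf16bf16bf16_grouped_128_64_128_1_1_1_f(
        X, W, output, zero_start_index_M, M_sizes);
  }
  return bf16bf16bf16_grouped_256_128_128_2_1_1_f(
      X, W, output, zero_start_index_M, M_sizes);
}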
@@ -0,0 +1,40 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "bf16bf16bf16_grouped_common.cuh"

namespace fbgemm_gpu {

at::Tensor bf16bf16bf16_grouped_128_128_128_1_1_1_f(
    at::Tensor X, // BF16
    at::Tensor W, // BF16
    at::Tensor output,
    std::optional<at::Tensor> zero_start_index_M,
    std::optional<at::Tensor> M_sizes) {
  return bf16bf16bf16_grouped_impl<at::Tensor, 128, 128, 128, 1, 1, 1, false>(
      X, W, output, zero_start_index_M, M_sizes);
}

at::Tensor bf16bf16bf16_grouped_128_128_128_1_1_1_f(
    at::TensorList X, // BF16
    at::TensorList W, // BF16
    at::Tensor output,
    std::optional<at::Tensor> zero_start_index_M,
    std::optional<at::Tensor> M_sizes) {
  return bf16bf16bf16_grouped_impl<
      at::TensorList,
      128,
      128,
      128,
      1,
      1,
      1,
      false>(X, W, output, zero_start_index_M, M_sizes);
}

} // namespace fbgemm_gpu
@@ -0,0 +1,40 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "bf16bf16bf16_grouped_common.cuh"

namespace fbgemm_gpu {

at::Tensor bf16bf16bf16_grouped_128_16_128_1_1_1_f(
    at::Tensor X, // BF16
    at::Tensor W, // BF16
    at::Tensor output,
    std::optional<at::Tensor> zero_start_index_M,
    std::optional<at::Tensor> M_sizes) {
  return bf16bf16bf16_grouped_impl<at::Tensor, 128, 16, 128, 1, 1, 1, false>(
      X, W, output, zero_start_index_M, M_sizes);
}

at::Tensor bf16bf16bf16_grouped_128_16_128_1_1_1_f(
    at::TensorList X, // BF16
    at::TensorList W, // BF16
    at::Tensor output,
    std::optional<at::Tensor> zero_start_index_M,
    std::optional<at::Tensor> M_sizes) {
  return bf16bf16bf16_grouped_impl<
      at::TensorList,
      128,
      16,
      128,
      1,
      1,
      1,
      false>(X, W, output, zero_start_index_M, M_sizes);
}

} // namespace fbgemm_gpu
@@ -0,0 +1,40 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "bf16bf16bf16_grouped_common.cuh"

namespace fbgemm_gpu {

at::Tensor bf16bf16bf16_grouped_128_256_128_1_1_1_f(
    at::Tensor X, // BF16
    at::Tensor W, // BF16
    at::Tensor output,
    std::optional<at::Tensor> zero_start_index_M,
    std::optional<at::Tensor> M_sizes) {
  return bf16bf16bf16_grouped_impl<at::Tensor, 128, 256, 128, 1, 1, 1, false>(
      X, W, output, zero_start_index_M, M_sizes);
}

at::Tensor bf16bf16bf16_grouped_128_256_128_1_1_1_f(
    at::TensorList X, // BF16
    at::TensorList W, // BF16
    at::Tensor output,
    std::optional<at::Tensor> zero_start_index_M,
    std::optional<at::Tensor> M_sizes) {
  return bf16bf16bf16_grouped_impl<
      at::TensorList,
      128,
      256,
      128,
      1,
      1,
      1,
      false>(X, W, output, zero_start_index_M, M_sizes);
}

} // namespace fbgemm_gpu
@@ -0,0 +1,40 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "bf16bf16bf16_grouped_common.cuh"

namespace fbgemm_gpu {

at::Tensor bf16bf16bf16_grouped_128_256_128_2_1_1_f(
    at::Tensor X, // BF16
    at::Tensor W, // BF16
    at::Tensor output,
    std::optional<at::Tensor> zero_start_index_M,
    std::optional<at::Tensor> M_sizes) {
  return bf16bf16bf16_grouped_impl<at::Tensor, 128, 256, 128, 2, 1, 1, false>(
      X, W, output, zero_start_index_M, M_sizes);
}

at::Tensor bf16bf16bf16_grouped_128_256_128_2_1_1_f(
    at::TensorList X, // BF16
    at::TensorList W, // BF16
    at::Tensor output,
    std::optional<at::Tensor> zero_start_index_M,
    std::optional<at::Tensor> M_sizes) {
  return bf16bf16bf16_grouped_impl<
      at::TensorList,
      128,
      256,
      128,
      2,
      1,
      1,
      false>(X, W, output, zero_start_index_M, M_sizes);
}

} // namespace fbgemm_gpu
@@ -0,0 +1,40 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "bf16bf16bf16_grouped_common.cuh"

namespace fbgemm_gpu {

at::Tensor bf16bf16bf16_grouped_128_32_128_1_1_1_f(
    at::Tensor X, // BF16
    at::Tensor W, // BF16
    at::Tensor output,
    std::optional<at::Tensor> zero_start_index_M,
    std::optional<at::Tensor> M_sizes) {
  return bf16bf16bf16_grouped_impl<at::Tensor, 128, 32, 128, 1, 1, 1, false>(
      X, W, output, zero_start_index_M, M_sizes);
}

at::Tensor bf16bf16bf16_grouped_128_32_128_1_1_1_f(
    at::TensorList X, // BF16
    at::TensorList W, // BF16
    at::Tensor output,
    std::optional<at::Tensor> zero_start_index_M,
    std::optional<at::Tensor> M_sizes) {
  return bf16bf16bf16_grouped_impl<
      at::TensorList,
      128,
      32,
      128,
      1,
      1,
      1,
      false>(X, W, output, zero_start_index_M, M_sizes);
}

} // namespace fbgemm_gpu
@@ -0,0 +1,40 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "bf16bf16bf16_grouped_common.cuh"

namespace fbgemm_gpu {

at::Tensor bf16bf16bf16_grouped_128_64_128_1_1_1_f(
    at::Tensor X, // BF16
    at::Tensor W, // BF16
    at::Tensor output,
    std::optional<at::Tensor> zero_start_index_M,
    std::optional<at::Tensor> M_sizes) {
  return bf16bf16bf16_grouped_impl<at::Tensor, 128, 64, 128, 1, 1, 1, false>(
      X, W, output, zero_start_index_M, M_sizes);
}

at::Tensor bf16bf16bf16_grouped_128_64_128_1_1_1_f(
    at::TensorList X, // BF16
    at::TensorList W, // BF16
    at::Tensor output,
    std::optional<at::Tensor> zero_start_index_M,
    std::optional<at::Tensor> M_sizes) {
  return bf16bf16bf16_grouped_impl<
      at::TensorList,
      128,
      64,
      128,
      1,
      1,
      1,
      false>(X, W, output, zero_start_index_M, M_sizes);
}

} // namespace fbgemm_gpu
@@ -0,0 +1,40 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "bf16bf16bf16_grouped_common.cuh"

namespace fbgemm_gpu {

at::Tensor bf16bf16bf16_grouped_256_128_128_2_1_1_f(
    at::Tensor X, // BF16
    at::Tensor W, // BF16
    at::Tensor output,
    std::optional<at::Tensor> zero_start_index_M,
    std::optional<at::Tensor> M_sizes) {
  return bf16bf16bf16_grouped_impl<at::Tensor, 256, 128, 128, 2, 1, 1, false>(
      X, W, output, zero_start_index_M, M_sizes);
}

at::Tensor bf16bf16bf16_grouped_256_128_128_2_1_1_f(
    at::TensorList X, // BF16
    at::TensorList W, // BF16
    at::Tensor output,
    std::optional<at::Tensor> zero_start_index_M,
    std::optional<at::Tensor> M_sizes) {
  return bf16bf16bf16_grouped_impl<
      at::TensorList,
      256,
      128,
      128,
      2,
      1,
      1,
      false>(X, W, output, zero_start_index_M, M_sizes);
}

} // namespace fbgemm_gpu
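For the per-configuration entry points above to be callable from the dispatcher, their declarations presumably get collected in a shared header, analogous to the f8f8bf16_rowwise layout. A hypothetical sketch of such a declarations header follows; the file name and layout are assumptions, and the signatures are copied from the definitions above:

// Hypothetical declarations header; its name and layout are assumptions and it
// is not part of the rendered diff. Signatures match the definitions above.
#pragma once

#include <ATen/ATen.h>
#include <optional>

namespace fbgemm_gpu {

at::Tensor bf16bf16bf16_grouped_128_128_128_1_1_1_f(
    at::Tensor X,
    at::Tensor W,
    at::Tensor output,
    std::optional<at::Tensor> zero_start_index_M,
    std::optional<at::Tensor> M_sizes);

at::Tensor bf16bf16bf16_grouped_128_128_128_1_1_1_f(
    at::TensorList X,
    at::TensorList W,
    at::Tensor output,
    std::optional<at::Tensor> zero_start_index_M,
    std::optional<at::Tensor> M_sizes);

// ...one pair of declarations (at::Tensor and at::TensorList overloads) for
// each of the other tile configurations added in this diff.

} // namespace fbgemm_gpu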
