Add Ampere bfloat-float example

codeplaysoftware · May 31, 2024 · 74cbb6d · 74cbb6d
1 parent 3b8108c
commit 74cbb6d
Show file tree

Hide file tree

Showing 10 changed files with 95 additions and 332 deletions.
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
@@ -66,11 +66,6 @@ endfunction()
 
 if(SYCL_INTEL_TARGET)
   add_subdirectory(pvc)
-else(SYCL_NVIDIA_TARGET)
-  add_subdirectory(ampere)
-endif()
-if (SYCL_NVIDIA_TARGET)
-  add_subdirectory(ampere)
 endif()
 if (SYCL_NVIDIA_TARGET)
   add_subdirectory(ampere)

diff --git a/benchmarks/ampere/CMakeLists.txt b/benchmarks/ampere/CMakeLists.txt
@@ -31,3 +31,8 @@ cutlass_benchmark_add_executable(
   bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32
   bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp
 )
+
+cutlass_benchmark_add_executable(
+  bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32
+  bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp
+)
diff --git a/...re_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp → ...re_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp b/...re_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp → ...re_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp
@@ -29,7 +29,7 @@
  *
  **************************************************************************************************/
 
-#include "../common/example_runner.hpp"
+#include "../common/benchmark_runner.hpp"
 #include "gemm_configuration.hpp"
 
 int main(int argc, const char** argv)
@@ -53,7 +53,7 @@ int main(int argc, const char** argv)
   }
 
   //
-  // Run examples
+  // Run benchmark
   //
 
   // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This
@@ -68,19 +68,19 @@ int main(int argc, const char** argv)
 // elements in input matrices.
   using ElementAccumulator = float;                   // <- data type of accumulator
   using ElementComputeEpilogue = float;  // <- data type of epilogue operations
-  using ElementInputA = half_t;                        // <- data type of elements in input matrix A
-  using ElementInputB = half_t;                        // <- data type of elements in input matrix B
+  using ElementInputA = bfloat16_t;                        // <- data type of elements in input matrix A
+  using ElementInputB = bfloat16_t;                        // <- data type of elements in input matrix B
   using ElementOutput = float;                        // <- data type of elements in output matrix D
 
   using LayoutA = cutlass::layout::ColumnMajor;
-  using LayoutB = cutlass::layout::RowMajor;
+  using LayoutB = cutlass::layout::ColumnMajor;
   using LayoutC = cutlass::layout::ColumnMajor;
   using LayoutD = cutlass::layout::ColumnMajor;
 
   using TileShape = Shape<_128, _128, _32>;
 
   using TiledMma = TiledMMA<
-          MMA_Atom<SM80_16x8x16_F32F16F16F32_TN>,
+          MMA_Atom<SM80_16x8x16_F32BF16BF16F32_TN>,
           Layout<Shape<_2,_2,_1>>, // 2x2x1 thread group
           Tile<_32,_32,_16>>;                           // 32x32x8 MMA for LDSM, 1x2x1 value group
 
@@ -145,7 +145,7 @@ int main(int argc, const char** argv)
 
   using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
 
-  ExampleRunner<Gemm> runner;
+  BenchmarkRunner<Gemm> runner;
 
   runner.run(options, hw_info);
 

diff --git a/benchmarks/ampere/gemm_configuration.hpp b/benchmarks/ampere/gemm_configuration.hpp
@@ -58,14 +58,14 @@ struct DefaultGemm_TensorOpSm80_OperandA<cutlass::half_t, cutlass::layout::RowMa
     using SmemLayoutAtom = decltype(
     composition(Swizzle<3,3,3>{},
                 Layout<Shape < _8,_64>,
-                Stride<_64, _1>>{}));
+                        Stride<_64, _1>>{}));
     using SmemCopyAtom = Copy_Atom<SM75_U32x4_LDSM_N, half_t>;
 
     // Gmem
     using GmemTiledCopy = decltype(
     make_tiled_copy(Copy_Atom<SM80_CP_ASYNC_CACHEGLOBAL<cute::uint128_t>, half_t>{},
                     Layout<Shape <_16,_8>,
-                    Stride< _8,_1>>{},
+                            Stride< _8,_1>>{},
                     Layout<Shape < _1,_8>>{}));
 };
 
@@ -77,14 +77,14 @@ struct DefaultGemm_TensorOpSm80_OperandA<half_t, cutlass::layout::ColumnMajor, 8
     using SmemLayoutAtom = decltype(
     composition(Swizzle<3,3,3>{},
                 Layout<Shape <_64, _8>,
-                Stride< _1,_64>>{}));
+                        Stride< _1,_64>>{}));
     using SmemCopyAtom = Copy_Atom<SM75_U16x8_LDSM_T, half_t>;
 
     // Gmem
     using GmemTiledCopy = decltype(
     make_tiled_copy(Copy_Atom<SM80_CP_ASYNC_CACHEGLOBAL<cute::uint128_t>, half_t>{},
                     Layout<Shape <_16, _8>,
-                    Stride< _1,_16>>{},
+                            Stride< _1,_16>>{},
                     Layout<Shape < _8, _1>>{}));
 };
 
@@ -96,14 +96,14 @@ struct DefaultGemm_TensorOpSm80_OperandA<half_t, cutlass::layout::RowMajor, 8, 3
     using SmemLayoutAtom = decltype(
     composition(Swizzle<2,3,3>{},
                 Layout<Shape < _8,_32>,
-                Stride<_32, _1>>{}));
+                        Stride<_32, _1>>{}));
     using SmemCopyAtom = Copy_Atom<SM75_U32x4_LDSM_N, half_t>;
 
     // Gmem
     using GmemTiledCopy = decltype(
     make_tiled_copy(Copy_Atom<SM80_CP_ASYNC_CACHEGLOBAL<cute::uint128_t>, half_t>{},
                     Layout<Shape <_32,_4>,
-                    Stride< _4,_1>>{},
+                            Stride< _4,_1>>{},
                     Layout<Shape < _1,_8>>{}));
 };
 
@@ -120,3 +120,78 @@ template <int Alignment, int SizeK>
 struct DefaultGemm_TensorOpSm80_OperandB<half_t, cutlass::layout::RowMajor, Alignment, SizeK>
         : DefaultGemm_TensorOpSm80_OperandA<half_t, cutlass::layout::ColumnMajor, Alignment, SizeK>
 {};
+
+/////////////////////////////////////////////////////////////////////////
+
+// bfloat16 (bfloat16_t) operand specializations
+
+/// Operand A, bfloat16, row-major (K-major): full specialization for Alignment = 8, SizeK = 64.
+template <>
+struct DefaultGemm_TensorOpSm80_OperandA<cutlass::bfloat16_t, cutlass::layout::RowMajor, 8, 64>
+{
+    // Smem (shared memory): 8x64 layout atom with strides (64,1), composed with Swizzle<3,3,3>.
+    using SmemLayoutAtom = decltype(
+    composition(Swizzle<3,3,3>{},
+                Layout<Shape < _8,_64>,
+                        Stride<_64, _1>>{}));
+    using SmemCopyAtom = Copy_Atom<SM75_U32x4_LDSM_N, bfloat16_t>;
+
+    // Gmem (global memory): tiled copy built from a 128-bit SM80_CP_ASYNC_CACHEGLOBAL atom, a 16x8 layout with strides (8,1), and a 1x8 layout.
+    using GmemTiledCopy = decltype(
+    make_tiled_copy(Copy_Atom<SM80_CP_ASYNC_CACHEGLOBAL<cute::uint128_t>, bfloat16_t>{},
+                    Layout<Shape <_16,_8>,
+                            Stride< _8,_1>>{},
+                    Layout<Shape < _1,_8>>{}));
+};
+
+/// Operand A, bfloat16, column-major (M-major): partial specialization for Alignment = 8 and any SizeK.
+template <int SizeK>
+struct DefaultGemm_TensorOpSm80_OperandA<bfloat16_t, cutlass::layout::ColumnMajor, 8, SizeK>
+{
+    // Smem (shared memory): 64x8 layout atom with strides (1,64), composed with Swizzle<3,3,3>.
+    using SmemLayoutAtom = decltype(
+    composition(Swizzle<3,3,3>{},
+                Layout<Shape <_64, _8>,
+                        Stride< _1,_64>>{}));
+    using SmemCopyAtom = Copy_Atom<SM75_U16x8_LDSM_T, bfloat16_t>;
+
+    // Gmem (global memory): tiled copy built from a 128-bit SM80_CP_ASYNC_CACHEGLOBAL atom, a 16x8 layout with strides (1,16), and an 8x1 layout.
+    using GmemTiledCopy = decltype(
+    make_tiled_copy(Copy_Atom<SM80_CP_ASYNC_CACHEGLOBAL<cute::uint128_t>, bfloat16_t>{},
+                    Layout<Shape <_16, _8>,
+                            Stride< _1,_16>>{},
+                    Layout<Shape < _8, _1>>{}));
+};
+
+/// Operand A, bfloat16, row-major (K-major): full specialization for Alignment = 8, SizeK = 32.
+template <>
+struct DefaultGemm_TensorOpSm80_OperandA<bfloat16_t, cutlass::layout::RowMajor, 8, 32>
+{
+    // Smem (shared memory): 8x32 layout atom with strides (32,1), composed with Swizzle<2,3,3>.
+    using SmemLayoutAtom = decltype(
+    composition(Swizzle<2,3,3>{},
+                Layout<Shape < _8,_32>,
+                        Stride<_32, _1>>{}));
+    using SmemCopyAtom = Copy_Atom<SM75_U32x4_LDSM_N, bfloat16_t>;
+
+    // Gmem (global memory): tiled copy built from a 128-bit SM80_CP_ASYNC_CACHEGLOBAL atom, a 32x4 layout with strides (4,1), and a 1x8 layout.
+    using GmemTiledCopy = decltype(
+    make_tiled_copy(Copy_Atom<SM80_CP_ASYNC_CACHEGLOBAL<cute::uint128_t>, bfloat16_t>{},
+                    Layout<Shape <_32,_4>,
+                            Stride< _4,_1>>{},
+                    Layout<Shape < _1,_8>>{}));
+};
+
+// Because the F32BF16 TiledMMA is A-B symmetric, we can reuse the DefaultOperands
+
+// Operand B, bfloat16, column-major (K-major): inherits the row-major (K-major) Operand A definition for the same Alignment and SizeK.
+template <int Alignment, int SizeK>
+struct DefaultGemm_TensorOpSm80_OperandB<bfloat16_t, cutlass::layout::ColumnMajor, Alignment, SizeK>
+        : DefaultGemm_TensorOpSm80_OperandA<bfloat16_t, cutlass::layout::RowMajor, Alignment, SizeK>
+{};
+
+// Operand B, bfloat16, row-major (N-major): inherits the column-major (M-major) Operand A definition for the same Alignment and SizeK.
+template <int Alignment, int SizeK>
+struct DefaultGemm_TensorOpSm80_OperandB<bfloat16_t, cutlass::layout::RowMajor, Alignment, SizeK>
+        : DefaultGemm_TensorOpSm80_OperandA<bfloat16_t, cutlass::layout::ColumnMajor, Alignment, SizeK>
+{};
diff --git a/benchmarks/common/benchmark_runner.hpp b/benchmarks/common/benchmark_runner.hpp
@@ -97,7 +97,7 @@ struct Options {
     /// Prints the usage statement.
     std::ostream & print_usage(std::ostream &out) const {
 
-      out << "PVC GEMM Example\n\n"
+      out << "PVC GEMM Benchmark\n\n"
           << "Options:\n\n"
           << "  --help                      If specified, displays this usage statement\n\n"
           << "  --m=<int>                   Sets the M extent of the GEMM\n"

diff --git a/benchmarks/pvc/bench_pvc_gemm_bf16_bf16_fp32_dpas_fp32.cpp b/benchmarks/pvc/bench_pvc_gemm_bf16_bf16_fp32_dpas_fp32.cpp
@@ -56,7 +56,7 @@ int main(int argc, const char** argv)
   }
 
   //
-  // Run examples
+  // Run benchmark
   //
 
   // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This

diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt
@@ -29,6 +29,4 @@
 
 if(SYCL_INTEL_TARGET)
   add_subdirectory(pvc)
-else(SYCL_NVIDIA_TARGET)
-  add_subdirectory(ampere)
 endif()
diff --git a/examples/sycl/ampere/CMakeLists.txt b/examples/sycl/ampere/CMakeLists.txt
-Original file line number
+Diff line change
@@ Expand Up / @@ -56,7 +56,7 @@ int main(int argc, const char** argv) @@
       }
       //
-      // Run examples
+      // Run benchmark
       //
       // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This
@@ Expand Down @@