diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 79d88583fa..d499f839a9 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -66,11 +66,6 @@ endfunction()
 if(SYCL_INTEL_TARGET)
   add_subdirectory(pvc)
-else(SYCL_NVIDIA_TARGET)
-  add_subdirectory(ampere)
-endif()
-
-if (SYCL_NVIDIA_TARGET)
-  add_subdirectory(ampere)
 endif()
 
 if (SYCL_NVIDIA_TARGET)
   add_subdirectory(ampere)
diff --git a/benchmarks/ampere/CMakeLists.txt b/benchmarks/ampere/CMakeLists.txt
index 70c2bdc990..666d9cac60 100644
--- a/benchmarks/ampere/CMakeLists.txt
+++ b/benchmarks/ampere/CMakeLists.txt
@@ -31,3 +31,8 @@ cutlass_benchmark_add_executable(
   bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32
   bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp
 )
+
+cutlass_benchmark_add_executable(
+  bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32
+  bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp
+)
diff --git a/examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp b/benchmarks/ampere/bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp
similarity index 93%
rename from examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp
rename to benchmarks/ampere/bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp
index a7e212fe80..8dad127417 100644
--- a/examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp
+++ b/benchmarks/ampere/bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp
@@ -29,7 +29,7 @@
  *
  **************************************************************************************************/
 
-#include "../common/example_runner.hpp"
+#include "../common/benchmark_runner.hpp"
 #include "gemm_configuration.hpp"
 
 int main(int argc, const char** argv)
@@ -53,7 +53,7 @@ int main(int argc, const char** argv)
   }
 
   //
-  // Run examples
+  // Run benchmark
   //
 
   // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This
@@ -68,19 +68,19 @@
 // elements in input matrices.
   using ElementAccumulator = float;                   // <- data type of accumulator
   using ElementComputeEpilogue = float;               // <- data type of epilogue operations
-  using ElementInputA = half_t;                       // <- data type of elements in input matrix A
-  using ElementInputB = half_t;                       // <- data type of elements in input matrix B
+  using ElementInputA = bfloat16_t;                   // <- data type of elements in input matrix A
+  using ElementInputB = bfloat16_t;                   // <- data type of elements in input matrix B
   using ElementOutput = float;                        // <- data type of elements in output matrix D
 
   using LayoutA = cutlass::layout::ColumnMajor;
-  using LayoutB = cutlass::layout::RowMajor;
+  using LayoutB = cutlass::layout::ColumnMajor;
   using LayoutC = cutlass::layout::ColumnMajor;
   using LayoutD = cutlass::layout::ColumnMajor;
 
   using TileShape = Shape<_128, _128, _32>;
 
   using TiledMma = TiledMMA<
-        MMA_Atom<SM80_16x8x16_F32F16F16F32_TN>,
+        MMA_Atom<SM80_16x8x16_F32BF16BF16F32_TN>,
         Layout<Shape<_2,_2,_1>>,   // 2x2x1 thread group
         Tile<_32,_32,_16>>;        // 32x32x8 MMA for LDSM, 1x2x1 value group
 
@@ -145,7 +145,7 @@ int main(int argc, const char** argv)
 
   using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
 
-  ExampleRunner<Gemm> runner;
+  BenchmarkRunner<Gemm> runner;
 
   runner.run(options, hw_info);
 
diff --git a/benchmarks/ampere/gemm_configuration.hpp b/benchmarks/ampere/gemm_configuration.hpp
index 8a32e77e7f..484786567f 100644
--- a/benchmarks/ampere/gemm_configuration.hpp
+++ b/benchmarks/ampere/gemm_configuration.hpp
@@ -58,14 +58,14 @@ struct DefaultGemm_TensorOpSm80_OperandA<half_t, cutlass::layout::RowMajor, 8, 64>
   using SmemLayoutAtom = decltype(
     composition(Swizzle<3,3,3>{},
                 Layout,
-                       Stride<_64, _1>>{}));
+                       Stride<_64, _1>>{}));
   using SmemCopyAtom = Copy_Atom;
 
   // Gmem
   using GmemTiledCopy = decltype(
     make_tiled_copy(Copy_Atom, half_t>{},
                     Layout,
-                           Stride< _8,_1>>{},
+                           Stride< _8,_1>>{},
                     Layout>{}));
 };
 
@@ -77,14 +77,14 @@ struct DefaultGemm_TensorOpSm80_OperandA<half_t, cutlass::layout::ColumnMajor, 8, SizeK>
   using SmemLayoutAtom = decltype(
     composition(Swizzle<3,3,3>{},
                 Layout,
-                       Stride< _1,_64>>{}));
+                       Stride< _1,_64>>{}));
   using SmemCopyAtom = Copy_Atom;
 
   // Gmem
   using GmemTiledCopy = decltype(
     make_tiled_copy(Copy_Atom, half_t>{},
                     Layout,
-                           Stride< _1,_16>>{},
+                           Stride< _1,_16>>{},
                     Layout>{}));
 };
 
@@ -96,14 +96,14 @@ struct DefaultGemm_TensorOpSm80_OperandA<half_t, cutlass::layout::RowMajor, 8, 32>
   using SmemLayoutAtom = decltype(
     composition(Swizzle<2,3,3>{},
                 Layout,
-                       Stride<_32, _1>>{}));
+                       Stride<_32, _1>>{}));
   using SmemCopyAtom = Copy_Atom;
 
   // Gmem
   using GmemTiledCopy = decltype(
     make_tiled_copy(Copy_Atom, half_t>{},
                     Layout,
-                           Stride< _4,_1>>{},
+                           Stride< _4,_1>>{},
                     Layout>{}));
 };
 
@@ -120,3 +120,78 @@ template <int Alignment, int SizeK>
 struct DefaultGemm_TensorOpSm80_OperandB<half_t, cutlass::layout::RowMajor, Alignment, SizeK>
      : DefaultGemm_TensorOpSm80_OperandA<half_t, cutlass::layout::ColumnMajor, Alignment, SizeK>
 {};
+
+/////////////////////////////////////////////////////////////////////////
+
+// Bfloat
+
+/// Operand A - Row-major  (K-Major)
+template <>
+struct DefaultGemm_TensorOpSm80_OperandA<bfloat16_t, cutlass::layout::RowMajor, 8, 64>
+{
+  // Smem
+  using SmemLayoutAtom = decltype(
+    composition(Swizzle<3,3,3>{},
+                Layout,
+                       Stride<_64, _1>>{}));
+  using SmemCopyAtom = Copy_Atom;
+
+  // Gmem
+  using GmemTiledCopy = decltype(
+    make_tiled_copy(Copy_Atom, bfloat16_t>{},
+                    Layout,
+                           Stride< _8,_1>>{},
+                    Layout>{}));
+};
+
+/// Operand A - Column-major (M-major)
+template <int SizeK>
+struct DefaultGemm_TensorOpSm80_OperandA<bfloat16_t, cutlass::layout::ColumnMajor, 8, SizeK>
+{
+  // Smem
+  using SmemLayoutAtom = decltype(
+    composition(Swizzle<3,3,3>{},
+                Layout,
+                       Stride< _1,_64>>{}));
+  using SmemCopyAtom = Copy_Atom;
+
+  // Gmem
+  using GmemTiledCopy = decltype(
+    make_tiled_copy(Copy_Atom, bfloat16_t>{},
+                    Layout,
+                           Stride< _1,_16>>{},
+                    Layout>{}));
+};
+
+/// Operand A - Row-major  (K-Major)
+template <>
+struct DefaultGemm_TensorOpSm80_OperandA<bfloat16_t, cutlass::layout::RowMajor, 8, 32>
+{
+  // Smem
+  using SmemLayoutAtom = decltype(
+    composition(Swizzle<2,3,3>{},
+                Layout,
+                       Stride<_32, _1>>{}));
+  using SmemCopyAtom = Copy_Atom;
+
+  // Gmem
+  using GmemTiledCopy = decltype(
+    make_tiled_copy(Copy_Atom, bfloat16_t>{},
+                    Layout,
+                           Stride< _4,_1>>{},
+                    Layout>{}));
+};
+
+// Because the F32F16 TiledMMA is A-B symmetric, we can reuse the DefaultOperands
+
+// Operand B - Column-Major  (K-major)
+template <int Alignment, int SizeK>
+struct DefaultGemm_TensorOpSm80_OperandB<bfloat16_t, cutlass::layout::ColumnMajor, Alignment, SizeK>
+     : DefaultGemm_TensorOpSm80_OperandA<bfloat16_t, cutlass::layout::RowMajor, Alignment, SizeK>
+{};
+
+// Operand B - Row-Major  (N-major)
+template <int Alignment, int SizeK>
+struct DefaultGemm_TensorOpSm80_OperandB<bfloat16_t, cutlass::layout::RowMajor, Alignment, SizeK>
+     : DefaultGemm_TensorOpSm80_OperandA<bfloat16_t, cutlass::layout::ColumnMajor, Alignment, SizeK>
+{};
diff --git a/benchmarks/common/benchmark_runner.hpp b/benchmarks/common/benchmark_runner.hpp
index e3d5d8f3a5..5eb2ade3eb 100644
--- a/benchmarks/common/benchmark_runner.hpp
+++ b/benchmarks/common/benchmark_runner.hpp
@@ -97,7 +97,7 @@ struct Options {
 
   /// Prints the usage statement.
   std::ostream & print_usage(std::ostream &out) const {
-    out << "PVC GEMM Example\n\n"
+    out << "PVC GEMM Benchmark\n\n"
         << "Options:\n\n"
         << "  --help                      If specified, displays this usage statement\n\n"
        << "  --m=<int>                   Sets the M extent of the GEMM\n"
diff --git a/benchmarks/pvc/bench_pvc_gemm_bf16_bf16_fp32_dpas_fp32.cpp b/benchmarks/pvc/bench_pvc_gemm_bf16_bf16_fp32_dpas_fp32.cpp
index 2ff30d7a79..67b76929db 100644
--- a/benchmarks/pvc/bench_pvc_gemm_bf16_bf16_fp32_dpas_fp32.cpp
+++ b/benchmarks/pvc/bench_pvc_gemm_bf16_bf16_fp32_dpas_fp32.cpp
@@ -56,7 +56,7 @@ int main(int argc, const char** argv)
   }
 
   //
-  // Run examples
+  // Run benchmark
   //
 
   // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This
diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt
index 8dac4f6c55..b736ce35e8 100644
--- a/examples/sycl/CMakeLists.txt
+++ b/examples/sycl/CMakeLists.txt
@@ -29,6 +29,4 @@
 
 if(SYCL_INTEL_TARGET)
   add_subdirectory(pvc)
-else(SYCL_NVIDIA_TARGET)
-  add_subdirectory(ampere)
 endif()
diff --git a/examples/sycl/ampere/CMakeLists.txt b/examples/sycl/ampere/CMakeLists.txt
deleted file mode 100644
index d25c28fc1d..0000000000
--- a/examples/sycl/ampere/CMakeLists.txt
+++ /dev/null
@@ -1,33 +0,0 @@
-# Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-#    list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-#    this list of conditions and the following disclaimer in the documentation
-#    and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-#    contributors may be used to endorse or promote products derived from
-#    this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-cutlass_example_add_executable(
-  ampere_gemm_fp16_fp16_fp32_tensor_op_fp32
-  ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp
-)
diff --git a/examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu b/examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu
deleted file mode 100644
index 67aabf76d3..0000000000
--- a/examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu
+++ /dev/null
@@ -1,155 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- *    contributors may be used to endorse or promote products derived from
- *    this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#include "../common/example_runner.hpp"
-#include "gemm_configuration.hpp"
-
-int main(int argc, const char** argv)
-{
-  //
-  // Parse options
-  //
-
-  Options options;
-
-  options.parse(argc, argv);
-
-  if (options.help) {
-    options.print_usage(std::cout) << std::endl;
-    return 0;
-  }
-
-  if (options.error) {
-    std::cerr << "Aborting execution." << std::endl;
-    return -1;
-  }
-
-  //
-  // Run examples
-  //
-
-  // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This
-  // information is used by the underlying kernel.
-  cutlass::KernelHardwareInfo hw_info;
-
-  // Change device_id to another value if you are running on a machine with multiple GPUs and wish
-  // to use a GPU other than that with device ID 0.
-  hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);
-
-  bool passed;
-
-// The code section below describes datatype for input, output matrices and computation between
-// elements in input matrices.
-  using ElementAccumulator = float;                   // <- data type of accumulator
-  using ElementComputeEpilogue = float;               // <- data type of epilogue operations
-  using ElementInputA = half_t;                       // <- data type of elements in input matrix A
-  using ElementInputB = half_t;                       // <- data type of elements in input matrix B
-  using ElementOutput = float;                        // <- data type of elements in output matrix D
-
-  using LayoutA = cutlass::layout::ColumnMajor;
-  using LayoutB = cutlass::layout::RowMajor;
-  using LayoutC = cutlass::layout::ColumnMajor;
-  using LayoutD = cutlass::layout::ColumnMajor;
-
-  using TileShape = Shape<_128, _128, _32>;
-
-  using TiledMma = TiledMMA<
-        MMA_Atom<SM80_16x8x16_F32F16F16F32_TN>,
-        Layout<Shape<_2,_2,_1>>,   // 2x2x1 thread group
-        Tile<_32,_32,_16>>;        // 32x32x8 MMA for LDSM, 1x2x1 value group
-
-  static constexpr int kAlignmentA = 8;
-  using DefaultOperandA = DefaultGemm_TensorOpSm80_OperandA<
-      ElementInputA, LayoutA, kAlignmentA, 32>;
-  using SmemLayoutAtomA = typename DefaultOperandA::SmemLayoutAtom; // M, K
-  using SmemCopyAtomA = typename DefaultOperandA::SmemCopyAtom;
-  using GmemTiledCopyA = typename DefaultOperandA::GmemTiledCopy;
-
-  static constexpr int kAlignmentB = 8;
-  using DefaultOperandB = DefaultGemm_TensorOpSm80_OperandB<
-      ElementInputB, LayoutB, kAlignmentB, 32>;
-  using SmemLayoutAtomB = typename DefaultOperandB::SmemLayoutAtom; // N, K
-  using SmemCopyAtomB = typename DefaultOperandB::SmemCopyAtom;
-  using GmemTiledCopyB = typename DefaultOperandB::GmemTiledCopy;
-
-  using Stages = Int<3>;
-
-  // This code section describes the epilogue part of the kernel
-  using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
-      ElementOutput,                                    // <- data type of output matrix
-      128 / cutlass::sizeof_bits<ElementOutput>::value, // <- the number of elements per vectorized
-                                                        //    memory access. For a byte, it's 16
-                                                        //    elements. This becomes the vector width of
-                                                        //    math instructions in the epilogue too
-      ElementAccumulator,                               // <- data type of accumulator
-      ElementComputeEpilogue>;                          // <- data type for alpha/beta in linear combination function
-
-  using DispatchPolicy = cutlass::gemm::MainloopSm80CpAsync<Stages{}>;
-
-  // Define strides (mixed)
-  using StrideA = cutlass::detail::TagToStrideA_t<LayoutA>;
-  using StrideB = cutlass::detail::TagToStrideB_t<LayoutB>;
-  using StrideC = cutlass::detail::TagToStrideC_t<LayoutC>;
-  using StrideD = cutlass::detail::TagToStrideC_t<LayoutD>;
-
-  using CollectiveEpilogue = cutlass::epilogue::collective::DefaultEpilogue<
-      StrideC,
-      StrideD,
-      EpilogueOp,
-      cutlass::gemm::EpilogueDefault>;
-
-  // Mainloop
-  using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma<
-      DispatchPolicy,
-      TileShape,
-      ElementInputA,
-      StrideA,
-      ElementInputB,
-      StrideB,
-      TiledMma,
-      GmemTiledCopyA, SmemLayoutAtomA, SmemCopyAtomA, cute::identity,  // A
-      GmemTiledCopyB, SmemLayoutAtomB, SmemCopyAtomB, cute::identity   // B
-  >;
-
-  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
-      Shape<int, int, int, int>,
-      CollectiveMainloop,
-      CollectiveEpilogue
-  >;
-
-  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
-
-  ExampleRunner<Gemm> runner;
-
-  runner.run(options, hw_info);
-
-  return 0;
-}
diff --git a/examples/sycl/ampere/gemm_configuration.hpp b/examples/sycl/ampere/gemm_configuration.hpp
deleted file mode 100644
index 8a32e77e7f..0000000000
--- a/examples/sycl/ampere/gemm_configuration.hpp
+++ /dev/null
@@ -1,122 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- *    contributors may be used to endorse or promote products derived from
- *    this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#include "cutlass/half.h"
-#include "cutlass/layout/layout.h"
-
-#include "cute/swizzle.hpp"
-#include "cute/layout.hpp"
-#include "cute/arch/copy_sm75.hpp"
-#include "cute/arch/copy_sm80.hpp"
-#include "cute/atom/copy_atom.hpp"
-
-using namespace cute;
-
-template <class Element, class Layout, int Alignment, int SizeK>
-struct DefaultGemm_TensorOpSm80_OperandA;
-
-template <class Element, class Layout, int Alignment, int SizeK>
-struct DefaultGemm_TensorOpSm80_OperandB;
-
-/////////////////////////////////////////////////////////////////////////
-
-// half
-
-/// Operand A - Row-major  (K-Major)
-template <>
-struct DefaultGemm_TensorOpSm80_OperandA<half_t, cutlass::layout::RowMajor, 8, 64>
-{
-  // Smem
-  using SmemLayoutAtom = decltype(
-    composition(Swizzle<3,3,3>{},
-                Layout,
-                       Stride<_64, _1>>{}));
-  using SmemCopyAtom = Copy_Atom;
-
-  // Gmem
-  using GmemTiledCopy = decltype(
-    make_tiled_copy(Copy_Atom, half_t>{},
-                    Layout,
-                           Stride< _8,_1>>{},
-                    Layout>{}));
-};
-
-/// Operand A - Column-major (M-major)
-template <int SizeK>
-struct DefaultGemm_TensorOpSm80_OperandA<half_t, cutlass::layout::ColumnMajor, 8, SizeK>
-{
-  // Smem
-  using SmemLayoutAtom = decltype(
-    composition(Swizzle<3,3,3>{},
-                Layout,
-                       Stride< _1,_64>>{}));
-  using SmemCopyAtom = Copy_Atom;
-
-  // Gmem
-  using GmemTiledCopy = decltype(
-    make_tiled_copy(Copy_Atom, half_t>{},
-                    Layout,
-                           Stride< _1,_16>>{},
-                    Layout>{}));
-};
-
-/// Operand A - Row-major  (K-Major)
-template <>
-struct DefaultGemm_TensorOpSm80_OperandA<half_t, cutlass::layout::RowMajor, 8, 32>
-{
-  // Smem
-  using SmemLayoutAtom = decltype(
-    composition(Swizzle<2,3,3>{},
-                Layout,
-                       Stride<_32, _1>>{}));
-  using SmemCopyAtom = Copy_Atom;
-
-  // Gmem
-  using GmemTiledCopy = decltype(
-    make_tiled_copy(Copy_Atom, half_t>{},
-                    Layout,
-                           Stride< _4,_1>>{},
-                    Layout>{}));
-};
-
-// Because the F32F16 TiledMMA is A-B symmetric, we can reuse the DefaultOperands
-
-// Operand B - Column-Major  (K-major)
-template <int Alignment, int SizeK>
-struct DefaultGemm_TensorOpSm80_OperandB<half_t, cutlass::layout::ColumnMajor, Alignment, SizeK>
-     : DefaultGemm_TensorOpSm80_OperandA<half_t, cutlass::layout::RowMajor, Alignment, SizeK>
-{};
-
-// Operand B - Row-Major  (N-major)
-template <int Alignment, int SizeK>
-struct DefaultGemm_TensorOpSm80_OperandB<half_t, cutlass::layout::RowMajor, Alignment, SizeK>
-     : DefaultGemm_TensorOpSm80_OperandA<half_t, cutlass::layout::ColumnMajor, Alignment, SizeK>
-{};
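
Note: the `BenchmarkRunner<Gemm>` that the new `bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32` target instantiates lives in `benchmarks/common/benchmark_runner.hpp` and is not shown in this patch; it parses the `--m/--n/--k` extents and times the configured GEMM. The snippet below is only a minimal, hypothetical sketch of that measurement pattern (plain `std::chrono` timing around an opaque `run_gemm` callable and a 2*M*N*K FLOP count). It is not the repository's runner, and the names `ProblemShape`, `benchmark_gemm`, and `run_gemm` are illustrative assumptions.

```cpp
// Illustrative sketch only -- not part of the patch and not the actual BenchmarkRunner.
#include <chrono>
#include <cstdio>
#include <functional>

// Hypothetical stand-in for the --m/--n/--k options parsed by the runner.
struct ProblemShape { int m, n, k; };

// Times `run_gemm` over `iterations` launches and reports the average throughput.
// `run_gemm` stands in for whatever launches (and synchronizes) the configured GEMM.
inline double benchmark_gemm(ProblemShape p,
                             const std::function<void()>& run_gemm,
                             int iterations = 100) {
  run_gemm();  // warm-up launch, excluded from timing
  auto start = std::chrono::steady_clock::now();
  for (int i = 0; i < iterations; ++i) {
    run_gemm();
  }
  auto stop = std::chrono::steady_clock::now();
  double avg_s = std::chrono::duration<double>(stop - start).count() / iterations;
  // A GEMM performs 2*M*N*K floating-point operations.
  double tflops = 2.0 * p.m * p.n * p.k / avg_s * 1e-12;
  std::printf("M=%d N=%d K=%d: %.3f ms, %.2f TFLOP/s\n", p.m, p.n, p.k, avg_s * 1e3, tflops);
  return tflops;
}
```

In the benchmarks themselves, the equivalent role is played by `runner.run(options, hw_info)`, with the problem extents taken from the command line.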