From dfc0c886a887916983f9ddb277d5cc02326a8dea Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Mon, 20 May 2024 13:58:18 +0100 Subject: [PATCH 01/10] Add generic example runner --- examples/sycl/common/example_runner.hpp | 387 ++++++++++++++++++++++++ 1 file changed, 387 insertions(+) create mode 100644 examples/sycl/common/example_runner.hpp diff --git a/examples/sycl/common/example_runner.hpp b/examples/sycl/common/example_runner.hpp new file mode 100644 index 0000000000..f300012caa --- /dev/null +++ b/examples/sycl/common/example_runner.hpp @@ -0,0 +1,387 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/gemm/device/gemm_universal.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/collective/collective_mma.hpp" +#include "cutlass/util/GPU_Clock.hpp" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cute/tensor.hpp" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/reference/device/gemm_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "cutlass/util/print_error.hpp" + +template +static void fill_matrix(std::vector &M) +{ + std::generate(std::begin(M), std::end(M), [&] + { return static_cast( 2*(rand() / double(RAND_MAX)) - 1 ); }); +} + +using namespace cute; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + bool error; + + int m, n, k, l, iterations; + float alpha, beta; + + Options(): + help(false), + error(false), + m(4096), n(4096), k(4096), l(1), iterations(100), + alpha(1.f), beta(0.f) + { } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("m", m, 4096); + cmd.get_cmd_line_argument("n", n, 4096); + cmd.get_cmd_line_argument("k", k, 4096); + cmd.get_cmd_line_argument("l", l, 1); + cmd.get_cmd_line_argument("alpha", alpha, 1.f); + cmd.get_cmd_line_argument("beta", beta, 0.f); + cmd.get_cmd_line_argument("iterations", iterations, 100); + } + + /// Prints the usage statement. 
+ std::ostream & print_usage(std::ostream &out) const { + + out << "PVC GEMM Example\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement\n\n" + << " --m= Sets the M extent of the GEMM\n" + << " --n= Sets the N extent of the GEMM\n" + << " --k= Sets the K extent of the GEMM\n" + << " --l= Sets the L extent (batch count) of the GEMM\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n" + << " --iterations= Iterations\n\n"; + + return out; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct ExampleRunner { + + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideC = typename Gemm::GemmKernel::StrideC; + using StrideD = typename Gemm::GemmKernel::StrideD; + + using LayoutA = typename Gemm::LayoutA; + using LayoutB = typename Gemm::LayoutB; + using LayoutC = typename Gemm::LayoutC; + using LayoutD = typename Gemm::LayoutD; + + using ElementA = typename Gemm::ElementA; + using ElementB = typename Gemm::ElementB; + using ElementAcc = typename Gemm::ElementAccumulator; + + using CollectiveEpilogue = typename Gemm::CollectiveEpilogue; + using ElementC = typename Gemm::ElementC; + using ElementOutput = typename CollectiveEpilogue::ElementOutput; + using ElementCompute = typename CollectiveEpilogue::ElementCompute; + using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator; + + using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape; + + // + // Data members + // + + /// Initialization + StrideA stride_A; + StrideB stride_B; + StrideC stride_C; + StrideD stride_D; + + cutlass::DeviceAllocation block_A; + cutlass::DeviceAllocation block_B; + cutlass::DeviceAllocation block_C; + cutlass::DeviceAllocation block_D; + cutlass::DeviceAllocation block_ref_D; + + // + // Methods + // + + bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) { + auto [M, N, K, L] = problem_size; + + cutlass::TensorRef ref_A(block_A.get(), LayoutA::packed({M, K})); + cutlass::TensorRef ref_B(block_B.get(), LayoutB::packed({K, N})); + cutlass::TensorRef ref_C(block_C.get(), LayoutC::packed({M, N})); + cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N})); + + cutlass::reference::device::GemmComplex( + {M, N, K}, + alpha, + ref_A, + cutlass::ComplexTransform::kNone, + ref_B, + cutlass::ComplexTransform::kNone, + beta, + ref_C, + ref_D, + ElementAccumulator(0), + L, // batch_count + M * K, // batch_stride_A + K * N, // batch_stride_B + M * N, // batch_stride_C + M * N // batch_stride_D + ); + +#if defined(CUTLASS_ENABLE_SYCL) + syclcompat::wait(); +#else + cudaDeviceSynchronize(); +#endif + + // Check if output from CUTLASS kernel and reference kernel are relatively equal or not + // need to set a larger error margin for comparison to succeed + auto epsilon = static_cast(0.1f); + auto nonzero_floor = static_cast(0.1f); + + bool passed = cutlass::reference::device::BlockCompareRelativelyEqual( + block_ref_D.get(), block_D.get(), block_D.size(), + epsilon, nonzero_floor); + + return passed; + } + + /// Initialize operands to be used in the GEMM and reference GEMM + virtual void initialize(const ProblemShapeType& problem_size) { + auto problem_shape_MNKL = cute::append<4>(problem_size, 1); + auto [M, N, K, L] = problem_shape_MNKL; + + stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); + stride_B = 
cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); + stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); + stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); + + block_A.reset(M * K * L); + block_B.reset(K * N * L); + block_C.reset(M * N * L); + block_D.reset(M * N * L); + block_ref_D.reset(M * N * L); + + // TODO: Enable initialization on device directly once RNG is + // available through SYCL. + std::vector a(K * M * L); + std::vector b(K * N * L); + std::vector c(M * N * L); + std::vector d(M * N * L, ElementC{0}); + + fill_matrix(a); + fill_matrix(b); + fill_matrix(c); + + block_A.copy_from_host(a.data(), a.size()); + block_B.copy_from_host(b.data(), b.size()); + block_C.copy_from_host(c.data(), c.size()); + block_D.copy_from_host(d.data(), d.size()); + block_ref_D.copy_from_host(d.data(), d.size()); + } + + virtual void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) { + ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; + + initialize(problem_size); + + typename Gemm::GemmKernel::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem_size, + {block_A.get(), stride_A, block_B.get(), stride_B}, + {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, + hw_info + }; + + Gemm gemm_op; + + size_t workspace_size = Gemm::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + gemm_op.can_implement(arguments); + + gemm_op.initialize(arguments, workspace.get()); + + // Run the GEMM + gemm_op.run(); + +#if defined(CUTLASS_ENABLE_SYCL) + syclcompat::wait(); +#else + cudaDeviceSynchronize(); +#endif + + // Verify that the result is correct + bool passed = verify(problem_size, options.alpha, options.beta); + std::cout << "Disposition: " << (passed ? 
"Passed" : "Failed") << std::endl; + + if (passed && options.iterations > 0) { + GPU_Clock timer; + timer.start(); + for (int i = 0; i < options.iterations; ++i) { + gemm_op.run(); + } + + float cute_time = timer.seconds() / options.iterations; + double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; + std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l + << std::endl; + printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time * 1000); + } + } +}; + +template +struct PvcExampleRunner : ExampleRunner { + using Base = ExampleRunner; + + using ElementB = typename Base::ElementB; + + using ProblemShapeType = typename Base::ProblemShapeType; + + cutlass::DeviceAllocation block_B_vnni; + + template + void vnni_matrix( + T* dst, const T* src, + int batch, int numRows, int numCols, int factor) + { + for (int b = 0; b < batch; b++) { + for (int r = 0; r < numRows / factor; r++) { + for (int c = 0; c < numCols; c++) { + for (int k = 0; k < factor; k++) { + dst[((b * (numRows / factor) + r) * numCols + c) * factor + k] = + src[((b * (numRows / factor) + r) * factor + k) * numCols + c]; + } + } + } + } + } + + void initialize(const ProblemShapeType& problem_size) override { + Base::initialize(problem_size); + + auto problem_shape_MNKL = cute::append<4>(problem_size, 1); + auto [M, N, K, L] = problem_shape_MNKL; + + block_B_vnni.reset(Base::block_B.size()); + + std::vector b(K * N * L); + std::vector b_vnni(b.size()); + + Base::block_B.copy_to_host(b.data()); + vnni_matrix(b_vnni.data(), b.data(), L, K, N, 2); + + block_B_vnni.copy_from_host(b_vnni.data()); + } + + void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) override { + ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; + + initialize(problem_size); + + typename Gemm::GemmKernel::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem_size, + {Base::block_A.get(), Base::stride_A, block_B_vnni.get(), Base::stride_B}, + { + {options.alpha, options.beta}, + Base::block_C.get(), Base::stride_C, Base::block_D.get(), Base::stride_D + }, + hw_info + }; + + Gemm gemm_op; + + size_t workspace_size = Gemm::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + gemm_op.can_implement(arguments); + + gemm_op.initialize(arguments, workspace.get()); + + // Run the GEMM + gemm_op.run(); + +#if defined(CUTLASS_ENABLE_SYCL) + syclcompat::wait(); +#else + cudaDeviceSynchronize(); +#endif + + // Verify that the result is correct + bool passed = Base::verify(problem_size, options.alpha, options.beta); + std::cout << "Disposition: " << (passed ? 
"Passed" : "Failed") << std::endl; + + if (passed && options.iterations > 0) { + GPU_Clock timer; + timer.start(); + for (int i = 0; i < options.iterations; ++i) { + gemm_op.run(); + } + + float cute_time = timer.seconds() / options.iterations; + double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; + std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; + printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000); + } + } +}; + From fd35e3ab71e14795602adf635e7409d0cbf346cc Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Mon, 20 May 2024 16:24:54 +0100 Subject: [PATCH 02/10] Init d and ref_d with different values --- examples/sycl/common/example_runner.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/sycl/common/example_runner.hpp b/examples/sycl/common/example_runner.hpp index f300012caa..2cc14556fe 100644 --- a/examples/sycl/common/example_runner.hpp +++ b/examples/sycl/common/example_runner.hpp @@ -224,7 +224,8 @@ struct ExampleRunner { std::vector a(K * M * L); std::vector b(K * N * L); std::vector c(M * N * L); - std::vector d(M * N * L, ElementC{0}); + std::vector d(M * N * L, ElementC{-1}); + std::vector ref_d(M * N * L, ElementC{-2}); fill_matrix(a); fill_matrix(b); @@ -234,7 +235,7 @@ struct ExampleRunner { block_B.copy_from_host(b.data(), b.size()); block_C.copy_from_host(c.data(), c.size()); block_D.copy_from_host(d.data(), d.size()); - block_ref_D.copy_from_host(d.data(), d.size()); + block_ref_D.copy_from_host(ref_d.data(), d.size()); } virtual void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) { From 45d47e4895766067d6aeb21c6fdc49f4c0c7d669 Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Mon, 27 May 2024 16:03:41 +0100 Subject: [PATCH 03/10] Move runner to benchmark folder --- examples/sycl/common/example_runner.hpp | 388 ------------------------ 1 file changed, 388 deletions(-) delete mode 100644 examples/sycl/common/example_runner.hpp diff --git a/examples/sycl/common/example_runner.hpp b/examples/sycl/common/example_runner.hpp deleted file mode 100644 index 2cc14556fe..0000000000 --- a/examples/sycl/common/example_runner.hpp +++ /dev/null @@ -1,388 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ - -#include "cutlass/gemm/device/gemm.h" -#include "cutlass/epilogue/collective/default_epilogue.hpp" -#include "cutlass/gemm/device/gemm_universal.h" -#include "cutlass/gemm/device/gemm_universal_adapter.h" -#include "cutlass/gemm/collective/collective_mma.hpp" -#include "cutlass/util/GPU_Clock.hpp" - -#include "cutlass/util/host_tensor.h" -#include "cutlass/util/reference/host/tensor_compare.h" -#include "cutlass/util/reference/host/tensor_copy.h" -#include "cutlass/util/reference/host/tensor_fill.h" -#include "cute/tensor.hpp" - -#include "cutlass/util/command_line.h" -#include "cutlass/util/device_memory.h" -#include "cutlass/util/packed_stride.hpp" -#include "cutlass/util/reference/device/gemm_complex.h" -#include "cutlass/util/reference/device/tensor_compare.h" -#include "cutlass/util/print_error.hpp" - -template -static void fill_matrix(std::vector &M) -{ - std::generate(std::begin(M), std::end(M), [&] - { return static_cast( 2*(rand() / double(RAND_MAX)) - 1 ); }); -} - -using namespace cute; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -// Command line options parsing -struct Options { - - bool help; - bool error; - - int m, n, k, l, iterations; - float alpha, beta; - - Options(): - help(false), - error(false), - m(4096), n(4096), k(4096), l(1), iterations(100), - alpha(1.f), beta(0.f) - { } - - // Parses the command line - void parse(int argc, char const **args) { - cutlass::CommandLine cmd(argc, args); - - if (cmd.check_cmd_line_flag("help")) { - help = true; - return; - } - - cmd.get_cmd_line_argument("m", m, 4096); - cmd.get_cmd_line_argument("n", n, 4096); - cmd.get_cmd_line_argument("k", k, 4096); - cmd.get_cmd_line_argument("l", l, 1); - cmd.get_cmd_line_argument("alpha", alpha, 1.f); - cmd.get_cmd_line_argument("beta", beta, 0.f); - cmd.get_cmd_line_argument("iterations", iterations, 100); - } - - /// Prints the usage statement. 
- std::ostream & print_usage(std::ostream &out) const { - - out << "PVC GEMM Example\n\n" - << "Options:\n\n" - << " --help If specified, displays this usage statement\n\n" - << " --m= Sets the M extent of the GEMM\n" - << " --n= Sets the N extent of the GEMM\n" - << " --k= Sets the K extent of the GEMM\n" - << " --l= Sets the L extent (batch count) of the GEMM\n" - << " --alpha= Epilogue scalar alpha\n" - << " --beta= Epilogue scalar beta\n\n" - << " --iterations= Iterations\n\n"; - - return out; - } -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -template -struct ExampleRunner { - - using StrideA = typename Gemm::GemmKernel::StrideA; - using StrideB = typename Gemm::GemmKernel::StrideB; - using StrideC = typename Gemm::GemmKernel::StrideC; - using StrideD = typename Gemm::GemmKernel::StrideD; - - using LayoutA = typename Gemm::LayoutA; - using LayoutB = typename Gemm::LayoutB; - using LayoutC = typename Gemm::LayoutC; - using LayoutD = typename Gemm::LayoutD; - - using ElementA = typename Gemm::ElementA; - using ElementB = typename Gemm::ElementB; - using ElementAcc = typename Gemm::ElementAccumulator; - - using CollectiveEpilogue = typename Gemm::CollectiveEpilogue; - using ElementC = typename Gemm::ElementC; - using ElementOutput = typename CollectiveEpilogue::ElementOutput; - using ElementCompute = typename CollectiveEpilogue::ElementCompute; - using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator; - - using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape; - - // - // Data members - // - - /// Initialization - StrideA stride_A; - StrideB stride_B; - StrideC stride_C; - StrideD stride_D; - - cutlass::DeviceAllocation block_A; - cutlass::DeviceAllocation block_B; - cutlass::DeviceAllocation block_C; - cutlass::DeviceAllocation block_D; - cutlass::DeviceAllocation block_ref_D; - - // - // Methods - // - - bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) { - auto [M, N, K, L] = problem_size; - - cutlass::TensorRef ref_A(block_A.get(), LayoutA::packed({M, K})); - cutlass::TensorRef ref_B(block_B.get(), LayoutB::packed({K, N})); - cutlass::TensorRef ref_C(block_C.get(), LayoutC::packed({M, N})); - cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N})); - - cutlass::reference::device::GemmComplex( - {M, N, K}, - alpha, - ref_A, - cutlass::ComplexTransform::kNone, - ref_B, - cutlass::ComplexTransform::kNone, - beta, - ref_C, - ref_D, - ElementAccumulator(0), - L, // batch_count - M * K, // batch_stride_A - K * N, // batch_stride_B - M * N, // batch_stride_C - M * N // batch_stride_D - ); - -#if defined(CUTLASS_ENABLE_SYCL) - syclcompat::wait(); -#else - cudaDeviceSynchronize(); -#endif - - // Check if output from CUTLASS kernel and reference kernel are relatively equal or not - // need to set a larger error margin for comparison to succeed - auto epsilon = static_cast(0.1f); - auto nonzero_floor = static_cast(0.1f); - - bool passed = cutlass::reference::device::BlockCompareRelativelyEqual( - block_ref_D.get(), block_D.get(), block_D.size(), - epsilon, nonzero_floor); - - return passed; - } - - /// Initialize operands to be used in the GEMM and reference GEMM - virtual void initialize(const ProblemShapeType& problem_size) { - auto problem_shape_MNKL = cute::append<4>(problem_size, 1); - auto [M, N, K, L] = problem_shape_MNKL; - - stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); - stride_B = 
cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); - stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); - stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - - block_A.reset(M * K * L); - block_B.reset(K * N * L); - block_C.reset(M * N * L); - block_D.reset(M * N * L); - block_ref_D.reset(M * N * L); - - // TODO: Enable initialization on device directly once RNG is - // available through SYCL. - std::vector a(K * M * L); - std::vector b(K * N * L); - std::vector c(M * N * L); - std::vector d(M * N * L, ElementC{-1}); - std::vector ref_d(M * N * L, ElementC{-2}); - - fill_matrix(a); - fill_matrix(b); - fill_matrix(c); - - block_A.copy_from_host(a.data(), a.size()); - block_B.copy_from_host(b.data(), b.size()); - block_C.copy_from_host(c.data(), c.size()); - block_D.copy_from_host(d.data(), d.size()); - block_ref_D.copy_from_host(ref_d.data(), d.size()); - } - - virtual void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) { - ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; - - initialize(problem_size); - - typename Gemm::GemmKernel::Arguments arguments{ - cutlass::gemm::GemmUniversalMode::kGemm, - problem_size, - {block_A.get(), stride_A, block_B.get(), stride_B}, - {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, - hw_info - }; - - Gemm gemm_op; - - size_t workspace_size = Gemm::get_workspace_size(arguments); - cutlass::device_memory::allocation workspace(workspace_size); - - gemm_op.can_implement(arguments); - - gemm_op.initialize(arguments, workspace.get()); - - // Run the GEMM - gemm_op.run(); - -#if defined(CUTLASS_ENABLE_SYCL) - syclcompat::wait(); -#else - cudaDeviceSynchronize(); -#endif - - // Verify that the result is correct - bool passed = verify(problem_size, options.alpha, options.beta); - std::cout << "Disposition: " << (passed ? 
"Passed" : "Failed") << std::endl; - - if (passed && options.iterations > 0) { - GPU_Clock timer; - timer.start(); - for (int i = 0; i < options.iterations; ++i) { - gemm_op.run(); - } - - float cute_time = timer.seconds() / options.iterations; - double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; - std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l - << std::endl; - printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time * 1000); - } - } -}; - -template -struct PvcExampleRunner : ExampleRunner { - using Base = ExampleRunner; - - using ElementB = typename Base::ElementB; - - using ProblemShapeType = typename Base::ProblemShapeType; - - cutlass::DeviceAllocation block_B_vnni; - - template - void vnni_matrix( - T* dst, const T* src, - int batch, int numRows, int numCols, int factor) - { - for (int b = 0; b < batch; b++) { - for (int r = 0; r < numRows / factor; r++) { - for (int c = 0; c < numCols; c++) { - for (int k = 0; k < factor; k++) { - dst[((b * (numRows / factor) + r) * numCols + c) * factor + k] = - src[((b * (numRows / factor) + r) * factor + k) * numCols + c]; - } - } - } - } - } - - void initialize(const ProblemShapeType& problem_size) override { - Base::initialize(problem_size); - - auto problem_shape_MNKL = cute::append<4>(problem_size, 1); - auto [M, N, K, L] = problem_shape_MNKL; - - block_B_vnni.reset(Base::block_B.size()); - - std::vector b(K * N * L); - std::vector b_vnni(b.size()); - - Base::block_B.copy_to_host(b.data()); - vnni_matrix(b_vnni.data(), b.data(), L, K, N, 2); - - block_B_vnni.copy_from_host(b_vnni.data()); - } - - void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) override { - ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; - - initialize(problem_size); - - typename Gemm::GemmKernel::Arguments arguments{ - cutlass::gemm::GemmUniversalMode::kGemm, - problem_size, - {Base::block_A.get(), Base::stride_A, block_B_vnni.get(), Base::stride_B}, - { - {options.alpha, options.beta}, - Base::block_C.get(), Base::stride_C, Base::block_D.get(), Base::stride_D - }, - hw_info - }; - - Gemm gemm_op; - - size_t workspace_size = Gemm::get_workspace_size(arguments); - cutlass::device_memory::allocation workspace(workspace_size); - - gemm_op.can_implement(arguments); - - gemm_op.initialize(arguments, workspace.get()); - - // Run the GEMM - gemm_op.run(); - -#if defined(CUTLASS_ENABLE_SYCL) - syclcompat::wait(); -#else - cudaDeviceSynchronize(); -#endif - - // Verify that the result is correct - bool passed = Base::verify(problem_size, options.alpha, options.beta); - std::cout << "Disposition: " << (passed ? 
"Passed" : "Failed") << std::endl; - - if (passed && options.iterations > 0) { - GPU_Clock timer; - timer.start(); - for (int i = 0; i < options.iterations; ++i) { - gemm_op.run(); - } - - float cute_time = timer.seconds() / options.iterations; - double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; - std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; - printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000); - } - } -}; - From c763cf04998e63546d3126bddcdcf5156b390af9 Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Mon, 20 May 2024 13:58:18 +0100 Subject: [PATCH 04/10] Add generic example runner --- examples/sycl/common/example_runner.hpp | 387 ++++++++++++++++++++++++ 1 file changed, 387 insertions(+) create mode 100644 examples/sycl/common/example_runner.hpp diff --git a/examples/sycl/common/example_runner.hpp b/examples/sycl/common/example_runner.hpp new file mode 100644 index 0000000000..f300012caa --- /dev/null +++ b/examples/sycl/common/example_runner.hpp @@ -0,0 +1,387 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/gemm/device/gemm_universal.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/collective/collective_mma.hpp" +#include "cutlass/util/GPU_Clock.hpp" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cute/tensor.hpp" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/reference/device/gemm_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "cutlass/util/print_error.hpp" + +template +static void fill_matrix(std::vector &M) +{ + std::generate(std::begin(M), std::end(M), [&] + { return static_cast( 2*(rand() / double(RAND_MAX)) - 1 ); }); +} + +using namespace cute; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + bool error; + + int m, n, k, l, iterations; + float alpha, beta; + + Options(): + help(false), + error(false), + m(4096), n(4096), k(4096), l(1), iterations(100), + alpha(1.f), beta(0.f) + { } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("m", m, 4096); + cmd.get_cmd_line_argument("n", n, 4096); + cmd.get_cmd_line_argument("k", k, 4096); + cmd.get_cmd_line_argument("l", l, 1); + cmd.get_cmd_line_argument("alpha", alpha, 1.f); + cmd.get_cmd_line_argument("beta", beta, 0.f); + cmd.get_cmd_line_argument("iterations", iterations, 100); + } + + /// Prints the usage statement. 
+ std::ostream & print_usage(std::ostream &out) const { + + out << "PVC GEMM Example\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement\n\n" + << " --m= Sets the M extent of the GEMM\n" + << " --n= Sets the N extent of the GEMM\n" + << " --k= Sets the K extent of the GEMM\n" + << " --l= Sets the L extent (batch count) of the GEMM\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n" + << " --iterations= Iterations\n\n"; + + return out; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct ExampleRunner { + + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideC = typename Gemm::GemmKernel::StrideC; + using StrideD = typename Gemm::GemmKernel::StrideD; + + using LayoutA = typename Gemm::LayoutA; + using LayoutB = typename Gemm::LayoutB; + using LayoutC = typename Gemm::LayoutC; + using LayoutD = typename Gemm::LayoutD; + + using ElementA = typename Gemm::ElementA; + using ElementB = typename Gemm::ElementB; + using ElementAcc = typename Gemm::ElementAccumulator; + + using CollectiveEpilogue = typename Gemm::CollectiveEpilogue; + using ElementC = typename Gemm::ElementC; + using ElementOutput = typename CollectiveEpilogue::ElementOutput; + using ElementCompute = typename CollectiveEpilogue::ElementCompute; + using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator; + + using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape; + + // + // Data members + // + + /// Initialization + StrideA stride_A; + StrideB stride_B; + StrideC stride_C; + StrideD stride_D; + + cutlass::DeviceAllocation block_A; + cutlass::DeviceAllocation block_B; + cutlass::DeviceAllocation block_C; + cutlass::DeviceAllocation block_D; + cutlass::DeviceAllocation block_ref_D; + + // + // Methods + // + + bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) { + auto [M, N, K, L] = problem_size; + + cutlass::TensorRef ref_A(block_A.get(), LayoutA::packed({M, K})); + cutlass::TensorRef ref_B(block_B.get(), LayoutB::packed({K, N})); + cutlass::TensorRef ref_C(block_C.get(), LayoutC::packed({M, N})); + cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N})); + + cutlass::reference::device::GemmComplex( + {M, N, K}, + alpha, + ref_A, + cutlass::ComplexTransform::kNone, + ref_B, + cutlass::ComplexTransform::kNone, + beta, + ref_C, + ref_D, + ElementAccumulator(0), + L, // batch_count + M * K, // batch_stride_A + K * N, // batch_stride_B + M * N, // batch_stride_C + M * N // batch_stride_D + ); + +#if defined(CUTLASS_ENABLE_SYCL) + syclcompat::wait(); +#else + cudaDeviceSynchronize(); +#endif + + // Check if output from CUTLASS kernel and reference kernel are relatively equal or not + // need to set a larger error margin for comparison to succeed + auto epsilon = static_cast(0.1f); + auto nonzero_floor = static_cast(0.1f); + + bool passed = cutlass::reference::device::BlockCompareRelativelyEqual( + block_ref_D.get(), block_D.get(), block_D.size(), + epsilon, nonzero_floor); + + return passed; + } + + /// Initialize operands to be used in the GEMM and reference GEMM + virtual void initialize(const ProblemShapeType& problem_size) { + auto problem_shape_MNKL = cute::append<4>(problem_size, 1); + auto [M, N, K, L] = problem_shape_MNKL; + + stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); + stride_B = 
cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); + stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); + stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); + + block_A.reset(M * K * L); + block_B.reset(K * N * L); + block_C.reset(M * N * L); + block_D.reset(M * N * L); + block_ref_D.reset(M * N * L); + + // TODO: Enable initialization on device directly once RNG is + // available through SYCL. + std::vector a(K * M * L); + std::vector b(K * N * L); + std::vector c(M * N * L); + std::vector d(M * N * L, ElementC{0}); + + fill_matrix(a); + fill_matrix(b); + fill_matrix(c); + + block_A.copy_from_host(a.data(), a.size()); + block_B.copy_from_host(b.data(), b.size()); + block_C.copy_from_host(c.data(), c.size()); + block_D.copy_from_host(d.data(), d.size()); + block_ref_D.copy_from_host(d.data(), d.size()); + } + + virtual void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) { + ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; + + initialize(problem_size); + + typename Gemm::GemmKernel::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem_size, + {block_A.get(), stride_A, block_B.get(), stride_B}, + {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, + hw_info + }; + + Gemm gemm_op; + + size_t workspace_size = Gemm::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + gemm_op.can_implement(arguments); + + gemm_op.initialize(arguments, workspace.get()); + + // Run the GEMM + gemm_op.run(); + +#if defined(CUTLASS_ENABLE_SYCL) + syclcompat::wait(); +#else + cudaDeviceSynchronize(); +#endif + + // Verify that the result is correct + bool passed = verify(problem_size, options.alpha, options.beta); + std::cout << "Disposition: " << (passed ? 
"Passed" : "Failed") << std::endl; + + if (passed && options.iterations > 0) { + GPU_Clock timer; + timer.start(); + for (int i = 0; i < options.iterations; ++i) { + gemm_op.run(); + } + + float cute_time = timer.seconds() / options.iterations; + double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; + std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l + << std::endl; + printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time * 1000); + } + } +}; + +template +struct PvcExampleRunner : ExampleRunner { + using Base = ExampleRunner; + + using ElementB = typename Base::ElementB; + + using ProblemShapeType = typename Base::ProblemShapeType; + + cutlass::DeviceAllocation block_B_vnni; + + template + void vnni_matrix( + T* dst, const T* src, + int batch, int numRows, int numCols, int factor) + { + for (int b = 0; b < batch; b++) { + for (int r = 0; r < numRows / factor; r++) { + for (int c = 0; c < numCols; c++) { + for (int k = 0; k < factor; k++) { + dst[((b * (numRows / factor) + r) * numCols + c) * factor + k] = + src[((b * (numRows / factor) + r) * factor + k) * numCols + c]; + } + } + } + } + } + + void initialize(const ProblemShapeType& problem_size) override { + Base::initialize(problem_size); + + auto problem_shape_MNKL = cute::append<4>(problem_size, 1); + auto [M, N, K, L] = problem_shape_MNKL; + + block_B_vnni.reset(Base::block_B.size()); + + std::vector b(K * N * L); + std::vector b_vnni(b.size()); + + Base::block_B.copy_to_host(b.data()); + vnni_matrix(b_vnni.data(), b.data(), L, K, N, 2); + + block_B_vnni.copy_from_host(b_vnni.data()); + } + + void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) override { + ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; + + initialize(problem_size); + + typename Gemm::GemmKernel::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem_size, + {Base::block_A.get(), Base::stride_A, block_B_vnni.get(), Base::stride_B}, + { + {options.alpha, options.beta}, + Base::block_C.get(), Base::stride_C, Base::block_D.get(), Base::stride_D + }, + hw_info + }; + + Gemm gemm_op; + + size_t workspace_size = Gemm::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + gemm_op.can_implement(arguments); + + gemm_op.initialize(arguments, workspace.get()); + + // Run the GEMM + gemm_op.run(); + +#if defined(CUTLASS_ENABLE_SYCL) + syclcompat::wait(); +#else + cudaDeviceSynchronize(); +#endif + + // Verify that the result is correct + bool passed = Base::verify(problem_size, options.alpha, options.beta); + std::cout << "Disposition: " << (passed ? 
"Passed" : "Failed") << std::endl; + + if (passed && options.iterations > 0) { + GPU_Clock timer; + timer.start(); + for (int i = 0; i < options.iterations; ++i) { + gemm_op.run(); + } + + float cute_time = timer.seconds() / options.iterations; + double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; + std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; + printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000); + } + } +}; + From a0dc044ba85a3db85c77ad9f3bb8c673b72e853f Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Mon, 20 May 2024 14:23:37 +0100 Subject: [PATCH 05/10] Add Ampere half-float example --- benchmarks/CMakeLists.txt | 2 + ...ere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp | 2 +- examples/sycl/common/example_runner.hpp | 387 ------------------ 3 files changed, 3 insertions(+), 388 deletions(-) delete mode 100644 examples/sycl/common/example_runner.hpp diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index d499f839a9..446f8fd8bb 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -66,6 +66,8 @@ endfunction() if(SYCL_INTEL_TARGET) add_subdirectory(pvc) +else(SYCL_NVIDIA_TARGET) + add_subdirectory(ampere) endif() if (SYCL_NVIDIA_TARGET) add_subdirectory(ampere) diff --git a/benchmarks/ampere/bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp b/benchmarks/ampere/bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp index 69b6159f76..69bc482f12 100644 --- a/benchmarks/ampere/bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp +++ b/benchmarks/ampere/bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp @@ -53,7 +53,7 @@ int main(int argc, const char** argv) } // - // Run Benchmark + // Run Benchmark // // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This diff --git a/examples/sycl/common/example_runner.hpp b/examples/sycl/common/example_runner.hpp deleted file mode 100644 index f300012caa..0000000000 --- a/examples/sycl/common/example_runner.hpp +++ /dev/null @@ -1,387 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ - -#include "cutlass/gemm/device/gemm.h" -#include "cutlass/epilogue/collective/default_epilogue.hpp" -#include "cutlass/gemm/device/gemm_universal.h" -#include "cutlass/gemm/device/gemm_universal_adapter.h" -#include "cutlass/gemm/collective/collective_mma.hpp" -#include "cutlass/util/GPU_Clock.hpp" - -#include "cutlass/util/host_tensor.h" -#include "cutlass/util/reference/host/tensor_compare.h" -#include "cutlass/util/reference/host/tensor_copy.h" -#include "cutlass/util/reference/host/tensor_fill.h" -#include "cute/tensor.hpp" - -#include "cutlass/util/command_line.h" -#include "cutlass/util/device_memory.h" -#include "cutlass/util/packed_stride.hpp" -#include "cutlass/util/reference/device/gemm_complex.h" -#include "cutlass/util/reference/device/tensor_compare.h" -#include "cutlass/util/print_error.hpp" - -template -static void fill_matrix(std::vector &M) -{ - std::generate(std::begin(M), std::end(M), [&] - { return static_cast( 2*(rand() / double(RAND_MAX)) - 1 ); }); -} - -using namespace cute; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -// Command line options parsing -struct Options { - - bool help; - bool error; - - int m, n, k, l, iterations; - float alpha, beta; - - Options(): - help(false), - error(false), - m(4096), n(4096), k(4096), l(1), iterations(100), - alpha(1.f), beta(0.f) - { } - - // Parses the command line - void parse(int argc, char const **args) { - cutlass::CommandLine cmd(argc, args); - - if (cmd.check_cmd_line_flag("help")) { - help = true; - return; - } - - cmd.get_cmd_line_argument("m", m, 4096); - cmd.get_cmd_line_argument("n", n, 4096); - cmd.get_cmd_line_argument("k", k, 4096); - cmd.get_cmd_line_argument("l", l, 1); - cmd.get_cmd_line_argument("alpha", alpha, 1.f); - cmd.get_cmd_line_argument("beta", beta, 0.f); - cmd.get_cmd_line_argument("iterations", iterations, 100); - } - - /// Prints the usage statement. 
- std::ostream & print_usage(std::ostream &out) const { - - out << "PVC GEMM Example\n\n" - << "Options:\n\n" - << " --help If specified, displays this usage statement\n\n" - << " --m= Sets the M extent of the GEMM\n" - << " --n= Sets the N extent of the GEMM\n" - << " --k= Sets the K extent of the GEMM\n" - << " --l= Sets the L extent (batch count) of the GEMM\n" - << " --alpha= Epilogue scalar alpha\n" - << " --beta= Epilogue scalar beta\n\n" - << " --iterations= Iterations\n\n"; - - return out; - } -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -template -struct ExampleRunner { - - using StrideA = typename Gemm::GemmKernel::StrideA; - using StrideB = typename Gemm::GemmKernel::StrideB; - using StrideC = typename Gemm::GemmKernel::StrideC; - using StrideD = typename Gemm::GemmKernel::StrideD; - - using LayoutA = typename Gemm::LayoutA; - using LayoutB = typename Gemm::LayoutB; - using LayoutC = typename Gemm::LayoutC; - using LayoutD = typename Gemm::LayoutD; - - using ElementA = typename Gemm::ElementA; - using ElementB = typename Gemm::ElementB; - using ElementAcc = typename Gemm::ElementAccumulator; - - using CollectiveEpilogue = typename Gemm::CollectiveEpilogue; - using ElementC = typename Gemm::ElementC; - using ElementOutput = typename CollectiveEpilogue::ElementOutput; - using ElementCompute = typename CollectiveEpilogue::ElementCompute; - using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator; - - using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape; - - // - // Data members - // - - /// Initialization - StrideA stride_A; - StrideB stride_B; - StrideC stride_C; - StrideD stride_D; - - cutlass::DeviceAllocation block_A; - cutlass::DeviceAllocation block_B; - cutlass::DeviceAllocation block_C; - cutlass::DeviceAllocation block_D; - cutlass::DeviceAllocation block_ref_D; - - // - // Methods - // - - bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) { - auto [M, N, K, L] = problem_size; - - cutlass::TensorRef ref_A(block_A.get(), LayoutA::packed({M, K})); - cutlass::TensorRef ref_B(block_B.get(), LayoutB::packed({K, N})); - cutlass::TensorRef ref_C(block_C.get(), LayoutC::packed({M, N})); - cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N})); - - cutlass::reference::device::GemmComplex( - {M, N, K}, - alpha, - ref_A, - cutlass::ComplexTransform::kNone, - ref_B, - cutlass::ComplexTransform::kNone, - beta, - ref_C, - ref_D, - ElementAccumulator(0), - L, // batch_count - M * K, // batch_stride_A - K * N, // batch_stride_B - M * N, // batch_stride_C - M * N // batch_stride_D - ); - -#if defined(CUTLASS_ENABLE_SYCL) - syclcompat::wait(); -#else - cudaDeviceSynchronize(); -#endif - - // Check if output from CUTLASS kernel and reference kernel are relatively equal or not - // need to set a larger error margin for comparison to succeed - auto epsilon = static_cast(0.1f); - auto nonzero_floor = static_cast(0.1f); - - bool passed = cutlass::reference::device::BlockCompareRelativelyEqual( - block_ref_D.get(), block_D.get(), block_D.size(), - epsilon, nonzero_floor); - - return passed; - } - - /// Initialize operands to be used in the GEMM and reference GEMM - virtual void initialize(const ProblemShapeType& problem_size) { - auto problem_shape_MNKL = cute::append<4>(problem_size, 1); - auto [M, N, K, L] = problem_shape_MNKL; - - stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); - stride_B = 
cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); - stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); - stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - - block_A.reset(M * K * L); - block_B.reset(K * N * L); - block_C.reset(M * N * L); - block_D.reset(M * N * L); - block_ref_D.reset(M * N * L); - - // TODO: Enable initialization on device directly once RNG is - // available through SYCL. - std::vector a(K * M * L); - std::vector b(K * N * L); - std::vector c(M * N * L); - std::vector d(M * N * L, ElementC{0}); - - fill_matrix(a); - fill_matrix(b); - fill_matrix(c); - - block_A.copy_from_host(a.data(), a.size()); - block_B.copy_from_host(b.data(), b.size()); - block_C.copy_from_host(c.data(), c.size()); - block_D.copy_from_host(d.data(), d.size()); - block_ref_D.copy_from_host(d.data(), d.size()); - } - - virtual void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) { - ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; - - initialize(problem_size); - - typename Gemm::GemmKernel::Arguments arguments{ - cutlass::gemm::GemmUniversalMode::kGemm, - problem_size, - {block_A.get(), stride_A, block_B.get(), stride_B}, - {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, - hw_info - }; - - Gemm gemm_op; - - size_t workspace_size = Gemm::get_workspace_size(arguments); - cutlass::device_memory::allocation workspace(workspace_size); - - gemm_op.can_implement(arguments); - - gemm_op.initialize(arguments, workspace.get()); - - // Run the GEMM - gemm_op.run(); - -#if defined(CUTLASS_ENABLE_SYCL) - syclcompat::wait(); -#else - cudaDeviceSynchronize(); -#endif - - // Verify that the result is correct - bool passed = verify(problem_size, options.alpha, options.beta); - std::cout << "Disposition: " << (passed ? 
"Passed" : "Failed") << std::endl; - - if (passed && options.iterations > 0) { - GPU_Clock timer; - timer.start(); - for (int i = 0; i < options.iterations; ++i) { - gemm_op.run(); - } - - float cute_time = timer.seconds() / options.iterations; - double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; - std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l - << std::endl; - printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time * 1000); - } - } -}; - -template -struct PvcExampleRunner : ExampleRunner { - using Base = ExampleRunner; - - using ElementB = typename Base::ElementB; - - using ProblemShapeType = typename Base::ProblemShapeType; - - cutlass::DeviceAllocation block_B_vnni; - - template - void vnni_matrix( - T* dst, const T* src, - int batch, int numRows, int numCols, int factor) - { - for (int b = 0; b < batch; b++) { - for (int r = 0; r < numRows / factor; r++) { - for (int c = 0; c < numCols; c++) { - for (int k = 0; k < factor; k++) { - dst[((b * (numRows / factor) + r) * numCols + c) * factor + k] = - src[((b * (numRows / factor) + r) * factor + k) * numCols + c]; - } - } - } - } - } - - void initialize(const ProblemShapeType& problem_size) override { - Base::initialize(problem_size); - - auto problem_shape_MNKL = cute::append<4>(problem_size, 1); - auto [M, N, K, L] = problem_shape_MNKL; - - block_B_vnni.reset(Base::block_B.size()); - - std::vector b(K * N * L); - std::vector b_vnni(b.size()); - - Base::block_B.copy_to_host(b.data()); - vnni_matrix(b_vnni.data(), b.data(), L, K, N, 2); - - block_B_vnni.copy_from_host(b_vnni.data()); - } - - void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) override { - ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; - - initialize(problem_size); - - typename Gemm::GemmKernel::Arguments arguments{ - cutlass::gemm::GemmUniversalMode::kGemm, - problem_size, - {Base::block_A.get(), Base::stride_A, block_B_vnni.get(), Base::stride_B}, - { - {options.alpha, options.beta}, - Base::block_C.get(), Base::stride_C, Base::block_D.get(), Base::stride_D - }, - hw_info - }; - - Gemm gemm_op; - - size_t workspace_size = Gemm::get_workspace_size(arguments); - cutlass::device_memory::allocation workspace(workspace_size); - - gemm_op.can_implement(arguments); - - gemm_op.initialize(arguments, workspace.get()); - - // Run the GEMM - gemm_op.run(); - -#if defined(CUTLASS_ENABLE_SYCL) - syclcompat::wait(); -#else - cudaDeviceSynchronize(); -#endif - - // Verify that the result is correct - bool passed = Base::verify(problem_size, options.alpha, options.beta); - std::cout << "Disposition: " << (passed ? 
"Passed" : "Failed") << std::endl; - - if (passed && options.iterations > 0) { - GPU_Clock timer; - timer.start(); - for (int i = 0; i < options.iterations; ++i) { - gemm_op.run(); - } - - float cute_time = timer.seconds() / options.iterations; - double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; - std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; - printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000); - } - } -}; - From cf4a32a809c8737a90d98b892f435bfba16bd51d Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Wed, 29 May 2024 13:35:05 +0100 Subject: [PATCH 06/10] Update benchmarks/CMakeLists.txt Co-authored-by: Mehdi Goli --- benchmarks/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 446f8fd8bb..8b3aed1b82 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -66,7 +66,8 @@ endfunction() if(SYCL_INTEL_TARGET) add_subdirectory(pvc) -else(SYCL_NVIDIA_TARGET) +endif() +if (SYCL_NVIDIA_TARGET) add_subdirectory(ampere) endif() if (SYCL_NVIDIA_TARGET) From 69d066a5e3821e1602ab9632f68b62fac1cc1fdf Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Mon, 20 May 2024 14:23:37 +0100 Subject: [PATCH 07/10] Add Ampere half-float example --- benchmarks/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 8b3aed1b82..79d88583fa 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -66,6 +66,8 @@ endfunction() if(SYCL_INTEL_TARGET) add_subdirectory(pvc) +else(SYCL_NVIDIA_TARGET) + add_subdirectory(ampere) endif() if (SYCL_NVIDIA_TARGET) add_subdirectory(ampere) From 1d190bc056367b15ba1b43f3348093aeddc88e25 Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Mon, 20 May 2024 14:23:37 +0100 Subject: [PATCH 08/10] Add Ampere half-float example --- examples/sycl/CMakeLists.txt | 2 + examples/sycl/ampere/CMakeLists.txt | 33 ++++ ...ere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp | 153 ++++++++++++++++++ examples/sycl/ampere/gemm_configuration.hpp | 122 ++++++++++++++ 4 files changed, 310 insertions(+) create mode 100644 examples/sycl/ampere/CMakeLists.txt create mode 100644 examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp create mode 100644 examples/sycl/ampere/gemm_configuration.hpp diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt index b736ce35e8..8dac4f6c55 100644 --- a/examples/sycl/CMakeLists.txt +++ b/examples/sycl/CMakeLists.txt @@ -29,4 +29,6 @@ if(SYCL_INTEL_TARGET) add_subdirectory(pvc) +else(SYCL_NVIDIA_TARGET) + add_subdirectory(ampere) endif() diff --git a/examples/sycl/ampere/CMakeLists.txt b/examples/sycl/ampere/CMakeLists.txt new file mode 100644 index 0000000000..d25c28fc1d --- /dev/null +++ b/examples/sycl/ampere/CMakeLists.txt @@ -0,0 +1,33 @@ +# Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +cutlass_example_add_executable( + ampere_gemm_fp16_fp16_fp32_tensor_op_fp32 + ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp +) diff --git a/examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp b/examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp new file mode 100644 index 0000000000..a7e212fe80 --- /dev/null +++ b/examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp @@ -0,0 +1,153 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +#include "../common/example_runner.hpp" +#include "gemm_configuration.hpp" + +int main(int argc, const char** argv) +{ + // + // Parse options + // + + Options options; + + options.parse(argc, argv); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.error) { + std::cerr << "Aborting execution." << std::endl; + return -1; + } + + // + // Run examples + // + + // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This + // information is used by the underlying kernel. + cutlass::KernelHardwareInfo hw_info; + + // Change device_id to another value if you are running on a machine with multiple GPUs and wish + // to use a GPU other than that with device ID 0. + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); + +// The code section below describes datatype for input, output matrices and computation between +// elements in input matrices. + using ElementAccumulator = float; // <- data type of accumulator + using ElementComputeEpilogue = float; // <- data type of epilogue operations + using ElementInputA = half_t; // <- data type of elements in input matrix A + using ElementInputB = half_t; // <- data type of elements in input matrix B + using ElementOutput = float; // <- data type of elements in output matrix D + + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::RowMajor; + using LayoutC = cutlass::layout::ColumnMajor; + using LayoutD = cutlass::layout::ColumnMajor; + + using TileShape = Shape<_128, _128, _32>; + + using TiledMma = TiledMMA< + MMA_Atom, + Layout>, // 2x2x1 thread group + Tile<_32,_32,_16>>; // 32x32x8 MMA for LDSM, 1x2x1 value group + + static constexpr int kAlignmentA = 8; + using DefaultOperandA = DefaultGemm_TensorOpSm80_OperandA< + ElementInputA, LayoutA, kAlignmentA, 32>; + using SmemLayoutAtomA = typename DefaultOperandA::SmemLayoutAtom; // M, K + using SmemCopyAtomA = typename DefaultOperandA::SmemCopyAtom; + using GmemTiledCopyA = typename DefaultOperandA::GmemTiledCopy; + + static constexpr int kAlignmentB = 8; + using DefaultOperandB = DefaultGemm_TensorOpSm80_OperandB< + ElementInputB, LayoutB, kAlignmentB, 32>; + using SmemLayoutAtomB = typename DefaultOperandB::SmemLayoutAtom; // N, K + using SmemCopyAtomB = typename DefaultOperandB::SmemCopyAtom; + using GmemTiledCopyB = typename DefaultOperandB::GmemTiledCopy; + + using Stages = Int<3>; + + // This code section describes the epilogue part of the kernel + using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, // <- data type of output matrix + 128 / cutlass::sizeof_bits::value, // <- the number of elements per vectorized + // memory access. For a byte, it's 16 + // elements. 
This becomes the vector width of + // math instructions in the epilogue too + ElementAccumulator, // <- data type of accumulator + ElementComputeEpilogue>; // <- data type for alpha/beta in linear combination function + + using DispatchPolicy = cutlass::gemm::MainloopSm80CpAsync; + + // Define strides (mixed) + using StrideA = cutlass::detail::TagToStrideA_t; + using StrideB = cutlass::detail::TagToStrideB_t; + using StrideC = cutlass::detail::TagToStrideC_t; + using StrideD = cutlass::detail::TagToStrideC_t; + + using CollectiveEpilogue = cutlass::epilogue::collective::DefaultEpilogue< + StrideC, + StrideD, + EpilogueOp, + cutlass::gemm::EpilogueDefault>; + + // Mainloop + using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< + DispatchPolicy, + TileShape, + ElementInputA, + StrideA, + ElementInputB, + StrideB, + TiledMma, + GmemTiledCopyA, SmemLayoutAtomA, SmemCopyAtomA, cute::identity, // A + GmemTiledCopyB, SmemLayoutAtomB, SmemCopyAtomB, cute::identity // B + >; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, + CollectiveMainloop, + CollectiveEpilogue + >; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + ExampleRunner runner; + + runner.run(options, hw_info); + + return 0; +} diff --git a/examples/sycl/ampere/gemm_configuration.hpp b/examples/sycl/ampere/gemm_configuration.hpp new file mode 100644 index 0000000000..8a32e77e7f --- /dev/null +++ b/examples/sycl/ampere/gemm_configuration.hpp @@ -0,0 +1,122 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +#include "cutlass/half.h" +#include "cutlass/layout/layout.h" + +#include "cute/swizzle.hpp" +#include "cute/layout.hpp" +#include "cute/arch/copy_sm75.hpp" +#include "cute/arch/copy_sm80.hpp" +#include "cute/atom/copy_atom.hpp" + +using namespace cute; + +template +struct DefaultGemm_TensorOpSm80_OperandA; + +template +struct DefaultGemm_TensorOpSm80_OperandB; + +///////////////////////////////////////////////////////////////////////// + +// half + +/// Operand A - Row-major (K-Major) +template <> +struct DefaultGemm_TensorOpSm80_OperandA +{ + // Smem + using SmemLayoutAtom = decltype( + composition(Swizzle<3,3,3>{}, + Layout, + Stride<_64, _1>>{})); + using SmemCopyAtom = Copy_Atom; + + // Gmem + using GmemTiledCopy = decltype( + make_tiled_copy(Copy_Atom, half_t>{}, + Layout, + Stride< _8,_1>>{}, + Layout>{})); +}; + +/// Operand A - Column-major (M-major) +template +struct DefaultGemm_TensorOpSm80_OperandA +{ + // Smem + using SmemLayoutAtom = decltype( + composition(Swizzle<3,3,3>{}, + Layout, + Stride< _1,_64>>{})); + using SmemCopyAtom = Copy_Atom; + + // Gmem + using GmemTiledCopy = decltype( + make_tiled_copy(Copy_Atom, half_t>{}, + Layout, + Stride< _1,_16>>{}, + Layout>{})); +}; + +/// Operand A - Row-major (K-Major) +template <> +struct DefaultGemm_TensorOpSm80_OperandA +{ + // Smem + using SmemLayoutAtom = decltype( + composition(Swizzle<2,3,3>{}, + Layout, + Stride<_32, _1>>{})); + using SmemCopyAtom = Copy_Atom; + + // Gmem + using GmemTiledCopy = decltype( + make_tiled_copy(Copy_Atom, half_t>{}, + Layout, + Stride< _4,_1>>{}, + Layout>{})); +}; + +// Because the F32F16 TiledMMA is A-B symmetric, we can reuse the DefaultOperands + +// Operand B - Column-Major (K-major) +template +struct DefaultGemm_TensorOpSm80_OperandB + : DefaultGemm_TensorOpSm80_OperandA +{}; + +// Operand B - Row-Major (N-major) +template +struct DefaultGemm_TensorOpSm80_OperandB + : DefaultGemm_TensorOpSm80_OperandA +{}; From 3b8108c56b7c029eea72eba23b1938c038626bc8 Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Mon, 20 May 2024 14:23:37 +0100 Subject: [PATCH 09/10] Add Ampere half-float example --- ...pere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu | 155 ++++++++++++++++++ 1 file changed, 155 insertions(+) create mode 100644 examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu diff --git a/examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu b/examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu new file mode 100644 index 0000000000..67aabf76d3 --- /dev/null +++ b/examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu @@ -0,0 +1,155 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#include "../common/example_runner.hpp" +#include "gemm_configuration.hpp" + +int main(int argc, const char** argv) +{ + // + // Parse options + // + + Options options; + + options.parse(argc, argv); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.error) { + std::cerr << "Aborting execution." << std::endl; + return -1; + } + + // + // Run examples + // + + // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This + // information is used by the underlying kernel. + cutlass::KernelHardwareInfo hw_info; + + // Change device_id to another value if you are running on a machine with multiple GPUs and wish + // to use a GPU other than that with device ID 0. + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); + + bool passed; + +// The code section below describes datatype for input, output matrices and computation between +// elements in input matrices. 
+ using ElementAccumulator = float; // <- data type of accumulator + using ElementComputeEpilogue = float; // <- data type of epilogue operations + using ElementInputA = half_t; // <- data type of elements in input matrix A + using ElementInputB = half_t; // <- data type of elements in input matrix B + using ElementOutput = float; // <- data type of elements in output matrix D + + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::RowMajor; + using LayoutC = cutlass::layout::ColumnMajor; + using LayoutD = cutlass::layout::ColumnMajor; + + using TileShape = Shape<_128, _128, _32>; + + using TiledMma = TiledMMA< + MMA_Atom, + Layout>, // 2x2x1 thread group + Tile<_32,_32,_16>>; // 32x32x8 MMA for LDSM, 1x2x1 value group + + static constexpr int kAlignmentA = 8; + using DefaultOperandA = DefaultGemm_TensorOpSm80_OperandA< + ElementInputA, LayoutA, kAlignmentA, 32>; + using SmemLayoutAtomA = typename DefaultOperandA::SmemLayoutAtom; // M, K + using SmemCopyAtomA = typename DefaultOperandA::SmemCopyAtom; + using GmemTiledCopyA = typename DefaultOperandA::GmemTiledCopy; + + static constexpr int kAlignmentB = 8; + using DefaultOperandB = DefaultGemm_TensorOpSm80_OperandB< + ElementInputB, LayoutB, kAlignmentB, 32>; + using SmemLayoutAtomB = typename DefaultOperandB::SmemLayoutAtom; // N, K + using SmemCopyAtomB = typename DefaultOperandB::SmemCopyAtom; + using GmemTiledCopyB = typename DefaultOperandB::GmemTiledCopy; + + using Stages = Int<3>; + + // This code section describes the epilogue part of the kernel + using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, // <- data type of output matrix + 128 / cutlass::sizeof_bits::value, // <- the number of elements per vectorized + // memory access. For a byte, it's 16 + // elements. 
This becomes the vector width of + // math instructions in the epilogue too + ElementAccumulator, // <- data type of accumulator + ElementComputeEpilogue>; // <- data type for alpha/beta in linear combination function + + using DispatchPolicy = cutlass::gemm::MainloopSm80CpAsync; + + // Define strides (mixed) + using StrideA = cutlass::detail::TagToStrideA_t; + using StrideB = cutlass::detail::TagToStrideB_t; + using StrideC = cutlass::detail::TagToStrideC_t; + using StrideD = cutlass::detail::TagToStrideC_t; + + using CollectiveEpilogue = cutlass::epilogue::collective::DefaultEpilogue< + StrideC, + StrideD, + EpilogueOp, + cutlass::gemm::EpilogueDefault>; + + // Mainloop + using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< + DispatchPolicy, + TileShape, + ElementInputA, + StrideA, + ElementInputB, + StrideB, + TiledMma, + GmemTiledCopyA, SmemLayoutAtomA, SmemCopyAtomA, cute::identity, // A + GmemTiledCopyB, SmemLayoutAtomB, SmemCopyAtomB, cute::identity // B + >; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, + CollectiveMainloop, + CollectiveEpilogue + >; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + ExampleRunner runner; + + runner.run(options, hw_info); + + return 0; +} From 74cbb6d74dbf00f5464dfc9d5c2711d946ec7fd4 Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Mon, 20 May 2024 14:29:48 +0100 Subject: [PATCH 10/10] Add Ampere bfloat-float example --- benchmarks/CMakeLists.txt | 5 - benchmarks/ampere/CMakeLists.txt | 5 + ...ere_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp | 14 +- benchmarks/ampere/gemm_configuration.hpp | 87 +++++++++- benchmarks/common/benchmark_runner.hpp | 2 +- ...ench_pvc_gemm_bf16_bf16_fp32_dpas_fp32.cpp | 2 +- examples/sycl/CMakeLists.txt | 2 - examples/sycl/ampere/CMakeLists.txt | 33 ---- ...pere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu | 155 ------------------ examples/sycl/ampere/gemm_configuration.hpp | 122 -------------- 10 files changed, 95 insertions(+), 332 deletions(-) rename examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp => benchmarks/ampere/bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp (93%) delete mode 100644 examples/sycl/ampere/CMakeLists.txt delete mode 100644 examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu delete mode 100644 examples/sycl/ampere/gemm_configuration.hpp diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 79d88583fa..d499f839a9 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -66,11 +66,6 @@ endfunction() if(SYCL_INTEL_TARGET) add_subdirectory(pvc) -else(SYCL_NVIDIA_TARGET) - add_subdirectory(ampere) -endif() -if (SYCL_NVIDIA_TARGET) - add_subdirectory(ampere) endif() if (SYCL_NVIDIA_TARGET) add_subdirectory(ampere) diff --git a/benchmarks/ampere/CMakeLists.txt b/benchmarks/ampere/CMakeLists.txt index 70c2bdc990..666d9cac60 100644 --- a/benchmarks/ampere/CMakeLists.txt +++ b/benchmarks/ampere/CMakeLists.txt @@ -31,3 +31,8 @@ cutlass_benchmark_add_executable( bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32 bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp ) + +cutlass_benchmark_add_executable( + bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32 + bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp +) diff --git a/examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp b/benchmarks/ampere/bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp similarity index 93% rename from examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp rename to 
benchmarks/ampere/bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp index a7e212fe80..8dad127417 100644 --- a/examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp +++ b/benchmarks/ampere/bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp @@ -29,7 +29,7 @@ * **************************************************************************************************/ -#include "../common/example_runner.hpp" +#include "../common/benchmark_runner.hpp" #include "gemm_configuration.hpp" int main(int argc, const char** argv) @@ -53,7 +53,7 @@ int main(int argc, const char** argv) } // - // Run examples + // Run benchmark // // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This @@ -68,19 +68,19 @@ int main(int argc, const char** argv) // elements in input matrices. using ElementAccumulator = float; // <- data type of accumulator using ElementComputeEpilogue = float; // <- data type of epilogue operations - using ElementInputA = half_t; // <- data type of elements in input matrix A - using ElementInputB = half_t; // <- data type of elements in input matrix B + using ElementInputA = bfloat16_t; // <- data type of elements in input matrix A + using ElementInputB = bfloat16_t; // <- data type of elements in input matrix B using ElementOutput = float; // <- data type of elements in output matrix D using LayoutA = cutlass::layout::ColumnMajor; - using LayoutB = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::ColumnMajor; using LayoutC = cutlass::layout::ColumnMajor; using LayoutD = cutlass::layout::ColumnMajor; using TileShape = Shape<_128, _128, _32>; using TiledMma = TiledMMA< - MMA_Atom, + MMA_Atom, Layout>, // 2x2x1 thread group Tile<_32,_32,_16>>; // 32x32x8 MMA for LDSM, 1x2x1 value group @@ -145,7 +145,7 @@ int main(int argc, const char** argv) using Gemm = cutlass::gemm::device::GemmUniversalAdapter; - ExampleRunner runner; + BenchmarkRunner runner; runner.run(options, hw_info); diff --git a/benchmarks/ampere/gemm_configuration.hpp b/benchmarks/ampere/gemm_configuration.hpp index 8a32e77e7f..484786567f 100644 --- a/benchmarks/ampere/gemm_configuration.hpp +++ b/benchmarks/ampere/gemm_configuration.hpp @@ -58,14 +58,14 @@ struct DefaultGemm_TensorOpSm80_OperandA{}, Layout, - Stride<_64, _1>>{})); + Stride<_64, _1>>{})); using SmemCopyAtom = Copy_Atom; // Gmem using GmemTiledCopy = decltype( make_tiled_copy(Copy_Atom, half_t>{}, Layout, - Stride< _8,_1>>{}, + Stride< _8,_1>>{}, Layout>{})); }; @@ -77,14 +77,14 @@ struct DefaultGemm_TensorOpSm80_OperandA{}, Layout, - Stride< _1,_64>>{})); + Stride< _1,_64>>{})); using SmemCopyAtom = Copy_Atom; // Gmem using GmemTiledCopy = decltype( make_tiled_copy(Copy_Atom, half_t>{}, Layout, - Stride< _1,_16>>{}, + Stride< _1,_16>>{}, Layout>{})); }; @@ -96,14 +96,14 @@ struct DefaultGemm_TensorOpSm80_OperandA{}, Layout, - Stride<_32, _1>>{})); + Stride<_32, _1>>{})); using SmemCopyAtom = Copy_Atom; // Gmem using GmemTiledCopy = decltype( make_tiled_copy(Copy_Atom, half_t>{}, Layout, - Stride< _4,_1>>{}, + Stride< _4,_1>>{}, Layout>{})); }; @@ -120,3 +120,78 @@ template struct DefaultGemm_TensorOpSm80_OperandB : DefaultGemm_TensorOpSm80_OperandA {}; + +///////////////////////////////////////////////////////////////////////// + +// Bfloat + +/// Operand A - Row-major (K-Major) +template <> +struct DefaultGemm_TensorOpSm80_OperandA +{ + // Smem + using SmemLayoutAtom = decltype( + composition(Swizzle<3,3,3>{}, + Layout, + Stride<_64, _1>>{})); + using SmemCopyAtom = Copy_Atom; + + // Gmem + using 
GmemTiledCopy = decltype( + make_tiled_copy(Copy_Atom, bfloat16_t>{}, + Layout, + Stride< _8,_1>>{}, + Layout>{})); +}; + +/// Operand A - Column-major (M-major) +template +struct DefaultGemm_TensorOpSm80_OperandA +{ + // Smem + using SmemLayoutAtom = decltype( + composition(Swizzle<3,3,3>{}, + Layout, + Stride< _1,_64>>{})); + using SmemCopyAtom = Copy_Atom; + + // Gmem + using GmemTiledCopy = decltype( + make_tiled_copy(Copy_Atom, bfloat16_t>{}, + Layout, + Stride< _1,_16>>{}, + Layout>{})); +}; + +/// Operand A - Row-major (K-Major) +template <> +struct DefaultGemm_TensorOpSm80_OperandA +{ + // Smem + using SmemLayoutAtom = decltype( + composition(Swizzle<2,3,3>{}, + Layout, + Stride<_32, _1>>{})); + using SmemCopyAtom = Copy_Atom; + + // Gmem + using GmemTiledCopy = decltype( + make_tiled_copy(Copy_Atom, bfloat16_t>{}, + Layout, + Stride< _4,_1>>{}, + Layout>{})); +}; + +// Because the F32F16 TiledMMA is A-B symmetric, we can reuse the DefaultOperands + +// Operand B - Column-Major (K-major) +template +struct DefaultGemm_TensorOpSm80_OperandB + : DefaultGemm_TensorOpSm80_OperandA +{}; + +// Operand B - Row-Major (N-major) +template +struct DefaultGemm_TensorOpSm80_OperandB + : DefaultGemm_TensorOpSm80_OperandA +{}; diff --git a/benchmarks/common/benchmark_runner.hpp b/benchmarks/common/benchmark_runner.hpp index e3d5d8f3a5..5eb2ade3eb 100644 --- a/benchmarks/common/benchmark_runner.hpp +++ b/benchmarks/common/benchmark_runner.hpp @@ -97,7 +97,7 @@ struct Options { /// Prints the usage statement. std::ostream & print_usage(std::ostream &out) const { - out << "PVC GEMM Example\n\n" + out << "PVC GEMM Benchmark\n\n" << "Options:\n\n" << " --help If specified, displays this usage statement\n\n" << " --m= Sets the M extent of the GEMM\n" diff --git a/benchmarks/pvc/bench_pvc_gemm_bf16_bf16_fp32_dpas_fp32.cpp b/benchmarks/pvc/bench_pvc_gemm_bf16_bf16_fp32_dpas_fp32.cpp index 2ff30d7a79..67b76929db 100644 --- a/benchmarks/pvc/bench_pvc_gemm_bf16_bf16_fp32_dpas_fp32.cpp +++ b/benchmarks/pvc/bench_pvc_gemm_bf16_bf16_fp32_dpas_fp32.cpp @@ -56,7 +56,7 @@ int main(int argc, const char** argv) } // - // Run examples + // Run benchmark // // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt index 8dac4f6c55..b736ce35e8 100644 --- a/examples/sycl/CMakeLists.txt +++ b/examples/sycl/CMakeLists.txt @@ -29,6 +29,4 @@ if(SYCL_INTEL_TARGET) add_subdirectory(pvc) -else(SYCL_NVIDIA_TARGET) - add_subdirectory(ampere) endif() diff --git a/examples/sycl/ampere/CMakeLists.txt b/examples/sycl/ampere/CMakeLists.txt deleted file mode 100644 index d25c28fc1d..0000000000 --- a/examples/sycl/ampere/CMakeLists.txt +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. -# SPDX-License-Identifier: BSD-3-Clause -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# 3. 
Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -cutlass_example_add_executable( - ampere_gemm_fp16_fp16_fp32_tensor_op_fp32 - ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp -) diff --git a/examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu b/examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu deleted file mode 100644 index 67aabf76d3..0000000000 --- a/examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu +++ /dev/null @@ -1,155 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - **************************************************************************************************/ - -#include "../common/example_runner.hpp" -#include "gemm_configuration.hpp" - -int main(int argc, const char** argv) -{ - // - // Parse options - // - - Options options; - - options.parse(argc, argv); - - if (options.help) { - options.print_usage(std::cout) << std::endl; - return 0; - } - - if (options.error) { - std::cerr << "Aborting execution." << std::endl; - return -1; - } - - // - // Run examples - // - - // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This - // information is used by the underlying kernel. - cutlass::KernelHardwareInfo hw_info; - - // Change device_id to another value if you are running on a machine with multiple GPUs and wish - // to use a GPU other than that with device ID 0. - hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); - - bool passed; - -// The code section below describes datatype for input, output matrices and computation between -// elements in input matrices. - using ElementAccumulator = float; // <- data type of accumulator - using ElementComputeEpilogue = float; // <- data type of epilogue operations - using ElementInputA = half_t; // <- data type of elements in input matrix A - using ElementInputB = half_t; // <- data type of elements in input matrix B - using ElementOutput = float; // <- data type of elements in output matrix D - - using LayoutA = cutlass::layout::ColumnMajor; - using LayoutB = cutlass::layout::RowMajor; - using LayoutC = cutlass::layout::ColumnMajor; - using LayoutD = cutlass::layout::ColumnMajor; - - using TileShape = Shape<_128, _128, _32>; - - using TiledMma = TiledMMA< - MMA_Atom, - Layout>, // 2x2x1 thread group - Tile<_32,_32,_16>>; // 32x32x8 MMA for LDSM, 1x2x1 value group - - static constexpr int kAlignmentA = 8; - using DefaultOperandA = DefaultGemm_TensorOpSm80_OperandA< - ElementInputA, LayoutA, kAlignmentA, 32>; - using SmemLayoutAtomA = typename DefaultOperandA::SmemLayoutAtom; // M, K - using SmemCopyAtomA = typename DefaultOperandA::SmemCopyAtom; - using GmemTiledCopyA = typename DefaultOperandA::GmemTiledCopy; - - static constexpr int kAlignmentB = 8; - using DefaultOperandB = DefaultGemm_TensorOpSm80_OperandB< - ElementInputB, LayoutB, kAlignmentB, 32>; - using SmemLayoutAtomB = typename DefaultOperandB::SmemLayoutAtom; // N, K - using SmemCopyAtomB = typename DefaultOperandB::SmemCopyAtom; - using GmemTiledCopyB = typename DefaultOperandB::GmemTiledCopy; - - using Stages = Int<3>; - - // This code section describes the epilogue part of the kernel - using EpilogueOp = cutlass::epilogue::thread::LinearCombination< - ElementOutput, // <- data type of output matrix - 128 / cutlass::sizeof_bits::value, // <- the number of elements per vectorized - // memory access. For a byte, it's 16 - // elements. 
This becomes the vector width of - // math instructions in the epilogue too - ElementAccumulator, // <- data type of accumulator - ElementComputeEpilogue>; // <- data type for alpha/beta in linear combination function - - using DispatchPolicy = cutlass::gemm::MainloopSm80CpAsync; - - // Define strides (mixed) - using StrideA = cutlass::detail::TagToStrideA_t; - using StrideB = cutlass::detail::TagToStrideB_t; - using StrideC = cutlass::detail::TagToStrideC_t; - using StrideD = cutlass::detail::TagToStrideC_t; - - using CollectiveEpilogue = cutlass::epilogue::collective::DefaultEpilogue< - StrideC, - StrideD, - EpilogueOp, - cutlass::gemm::EpilogueDefault>; - - // Mainloop - using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< - DispatchPolicy, - TileShape, - ElementInputA, - StrideA, - ElementInputB, - StrideB, - TiledMma, - GmemTiledCopyA, SmemLayoutAtomA, SmemCopyAtomA, cute::identity, // A - GmemTiledCopyB, SmemLayoutAtomB, SmemCopyAtomB, cute::identity // B - >; - - using GemmKernel = cutlass::gemm::kernel::GemmUniversal< - Shape, - CollectiveMainloop, - CollectiveEpilogue - >; - - using Gemm = cutlass::gemm::device::GemmUniversalAdapter; - - ExampleRunner runner; - - runner.run(options, hw_info); - - return 0; -} diff --git a/examples/sycl/ampere/gemm_configuration.hpp b/examples/sycl/ampere/gemm_configuration.hpp deleted file mode 100644 index 8a32e77e7f..0000000000 --- a/examples/sycl/ampere/gemm_configuration.hpp +++ /dev/null @@ -1,122 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - **************************************************************************************************/ - -#include "cutlass/half.h" -#include "cutlass/layout/layout.h" - -#include "cute/swizzle.hpp" -#include "cute/layout.hpp" -#include "cute/arch/copy_sm75.hpp" -#include "cute/arch/copy_sm80.hpp" -#include "cute/atom/copy_atom.hpp" - -using namespace cute; - -template -struct DefaultGemm_TensorOpSm80_OperandA; - -template -struct DefaultGemm_TensorOpSm80_OperandB; - -///////////////////////////////////////////////////////////////////////// - -// half - -/// Operand A - Row-major (K-Major) -template <> -struct DefaultGemm_TensorOpSm80_OperandA -{ - // Smem - using SmemLayoutAtom = decltype( - composition(Swizzle<3,3,3>{}, - Layout, - Stride<_64, _1>>{})); - using SmemCopyAtom = Copy_Atom; - - // Gmem - using GmemTiledCopy = decltype( - make_tiled_copy(Copy_Atom, half_t>{}, - Layout, - Stride< _8,_1>>{}, - Layout>{})); -}; - -/// Operand A - Column-major (M-major) -template -struct DefaultGemm_TensorOpSm80_OperandA -{ - // Smem - using SmemLayoutAtom = decltype( - composition(Swizzle<3,3,3>{}, - Layout, - Stride< _1,_64>>{})); - using SmemCopyAtom = Copy_Atom; - - // Gmem - using GmemTiledCopy = decltype( - make_tiled_copy(Copy_Atom, half_t>{}, - Layout, - Stride< _1,_16>>{}, - Layout>{})); -}; - -/// Operand A - Row-major (K-Major) -template <> -struct DefaultGemm_TensorOpSm80_OperandA -{ - // Smem - using SmemLayoutAtom = decltype( - composition(Swizzle<2,3,3>{}, - Layout, - Stride<_32, _1>>{})); - using SmemCopyAtom = Copy_Atom; - - // Gmem - using GmemTiledCopy = decltype( - make_tiled_copy(Copy_Atom, half_t>{}, - Layout, - Stride< _4,_1>>{}, - Layout>{})); -}; - -// Because the F32F16 TiledMMA is A-B symmetric, we can reuse the DefaultOperands - -// Operand B - Column-Major (K-major) -template -struct DefaultGemm_TensorOpSm80_OperandB - : DefaultGemm_TensorOpSm80_OperandA -{}; - -// Operand B - Row-Major (N-major) -template -struct DefaultGemm_TensorOpSm80_OperandB - : DefaultGemm_TensorOpSm80_OperandA -{};
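
Note on the VNNI packing used by the PVC runner: the vnni_matrix helper added to PvcExampleRunner earlier in this series packs each group of `factor` consecutive K-rows of operand B so that the `factor` values belonging to one column become adjacent in memory. The following is a minimal standalone sketch of that same loop nest; the tiny problem size, the function name vnni_interleave, and main() are illustrative assumptions only and are not part of any patch above.

#include <cstdio>
#include <vector>

// Standalone illustration of the VNNI interleave performed by
// PvcExampleRunner::vnni_matrix (single batch, row-major [rows x cols]):
// every group of `factor` consecutive rows is packed so that the `factor`
// values of one column become adjacent in the destination.
template <typename T>
void vnni_interleave(T* dst, const T* src, int rows, int cols, int factor) {
  for (int r = 0; r < rows / factor; ++r)
    for (int c = 0; c < cols; ++c)
      for (int k = 0; k < factor; ++k)
        dst[(r * cols + c) * factor + k] = src[(r * factor + k) * cols + c];
}

int main() {
  constexpr int rows = 4, cols = 3, factor = 2;
  std::vector<int> src(rows * cols), dst(rows * cols);
  for (int i = 0; i < rows * cols; ++i) src[i] = i;   // 0..11, row-major

  vnni_interleave(dst.data(), src.data(), rows, cols, factor);

  // src rows: {0 1 2} {3 4 5} {6 7 8} {9 10 11}
  // dst:       0 3 1 4 2 5   6 9 7 10 8 11
  for (int v : dst) std::printf("%d ", v);
  std::printf("\n");
  return 0;
}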