diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 395af2dc2c..0831067283 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -43,20 +43,22 @@ function(cutlass_benchmark_add_executable NAME)
 
   add_dependencies(cutlass_benchmarks ${NAME})
 
+  if (NOT CUTLASS_ENABLE_SYCL)
+    SET(ADD_CUDA ON)
+  endif()
+
   target_link_libraries(
     ${NAME}
     PRIVATE
     CUTLASS
     cutlass_tools_util_includes
-  )
-
-  target_include_directories(
-    ${NAME}
-    PRIVATE
-    ${CUTLASS_BENCHMARKS_COMMON_SOURCE_DIR}
-  )
+    $<$<BOOL:${ADD_CUDA}>:nvidia::cublas>
+    $<$<BOOL:${ADD_CUDA}>:cuda>
+  )
 
-  add_sycl_to_target(TARGET ${NAME})
+  if (CUTLASS_ENABLE_SYCL)
+    add_sycl_to_target(TARGET ${NAME})
+  endif()
 
   install(
     TARGETS ${NAME}
@@ -66,6 +68,6 @@ endfunction()
 
 if(SYCL_INTEL_TARGET)
   add_subdirectory(pvc)
-else(SYCL_NVIDIA_TARGET)
+else(SYCL_NVIDIA_TARGET OR NOT CUTLASS_ENABLE_SYCL)
   add_subdirectory(ampere)
 endif()
diff --git a/benchmarks/ampere/CMakeLists.txt b/benchmarks/ampere/CMakeLists.txt
index 666d9cac60..a77901594b 100644
--- a/benchmarks/ampere/CMakeLists.txt
+++ b/benchmarks/ampere/CMakeLists.txt
@@ -29,10 +29,10 @@
 
 cutlass_benchmark_add_executable(
   bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32
-  bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp
+  bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu
 )
 
 cutlass_benchmark_add_executable(
   bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32
-  bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp
+  bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cu
 )
diff --git a/benchmarks/ampere/bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp b/benchmarks/ampere/bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cu
similarity index 100%
rename from benchmarks/ampere/bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp
rename to benchmarks/ampere/bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cu
diff --git a/benchmarks/ampere/bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp b/benchmarks/ampere/bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu
similarity index 100%
rename from benchmarks/ampere/bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp
rename to benchmarks/ampere/bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu
diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt
index 8dac4f6c55..b736ce35e8 100644
--- a/examples/sycl/CMakeLists.txt
+++ b/examples/sycl/CMakeLists.txt
@@ -29,6 +29,4 @@
 
 if(SYCL_INTEL_TARGET)
   add_subdirectory(pvc)
-else(SYCL_NVIDIA_TARGET)
-  add_subdirectory(ampere)
 endif()
diff --git a/examples/sycl/ampere/CMakeLists.txt b/examples/sycl/ampere/CMakeLists.txt
deleted file mode 100644
index 81c8545ec7..0000000000
--- a/examples/sycl/ampere/CMakeLists.txt
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-cutlass_example_add_executable(
-  ampere_gemm_fp16_fp16_fp32_tensor_op_fp32
-  ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp
-)
-
-cutlass_example_add_executable(
-  ampere_gemm_bf16_bf16_fp32_tensor_op_fp32
-  ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp
-)
diff --git a/examples/sycl/ampere/ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp b/examples/sycl/ampere/ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cpp
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp b/examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp
deleted file mode 100644
index a7e212fe80..0000000000
--- a/examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cpp
+++ /dev/null
@@ -1,153 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#include "../common/example_runner.hpp"
-#include "gemm_configuration.hpp"
-
-int main(int argc, const char** argv)
-{
-  //
-  // Parse options
-  //
-
-  Options options;
-
-  options.parse(argc, argv);
-
-  if (options.help) {
-    options.print_usage(std::cout) << std::endl;
-    return 0;
-  }
-
-  if (options.error) {
-    std::cerr << "Aborting execution." << std::endl;
-    return -1;
-  }
-
-  //
-  // Run examples
-  //
-
-  // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This
-  // information is used by the underlying kernel.
-  cutlass::KernelHardwareInfo hw_info;
-
-  // Change device_id to another value if you are running on a machine with multiple GPUs and wish
-  // to use a GPU other than that with device ID 0.
-  hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);
-
-// The code section below describes datatype for input, output matrices and computation between
-// elements in input matrices.
-  using ElementAccumulator = float;      // <- data type of accumulator
-  using ElementComputeEpilogue = float;  // <- data type of epilogue operations
-  using ElementInputA = half_t;          // <- data type of elements in input matrix A
-  using ElementInputB = half_t;          // <- data type of elements in input matrix B
-  using ElementOutput = float;           // <- data type of elements in output matrix D
-
-  using LayoutA = cutlass::layout::ColumnMajor;
-  using LayoutB = cutlass::layout::RowMajor;
-  using LayoutC = cutlass::layout::ColumnMajor;
-  using LayoutD = cutlass::layout::ColumnMajor;
-
-  using TileShape = Shape<_128, _128, _32>;
-
-  using TiledMma = TiledMMA<
-      MMA_Atom<SM80_16x8x16_F32F16F16F32_TN>,
-      Layout<Shape<_2,_2,_1>>,  // 2x2x1 thread group
-      Tile<_32,_32,_16>>;       // 32x32x8 MMA for LDSM, 1x2x1 value group
-
-  static constexpr int kAlignmentA = 8;
-  using DefaultOperandA = DefaultGemm_TensorOpSm80_OperandA<
-      ElementInputA, LayoutA, kAlignmentA, 32>;
-  using SmemLayoutAtomA = typename DefaultOperandA::SmemLayoutAtom; // M, K
-  using SmemCopyAtomA = typename DefaultOperandA::SmemCopyAtom;
-  using GmemTiledCopyA = typename DefaultOperandA::GmemTiledCopy;
-
-  static constexpr int kAlignmentB = 8;
-  using DefaultOperandB = DefaultGemm_TensorOpSm80_OperandB<
-      ElementInputB, LayoutB, kAlignmentB, 32>;
-  using SmemLayoutAtomB = typename DefaultOperandB::SmemLayoutAtom; // N, K
-  using SmemCopyAtomB = typename DefaultOperandB::SmemCopyAtom;
-  using GmemTiledCopyB = typename DefaultOperandB::GmemTiledCopy;
-
-  using Stages = Int<3>;
-
-  // This code section describes the epilogue part of the kernel
-  using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
-      ElementOutput,                                     // <- data type of output matrix
-      128 / cutlass::sizeof_bits<ElementOutput>::value,  // <- the number of elements per vectorized
-                                                         //    memory access. For a byte, it's 16
-                                                         //    elements. This becomes the vector width of
-                                                         //    math instructions in the epilogue too
-      ElementAccumulator,                                // <- data type of accumulator
-      ElementComputeEpilogue>;                           // <- data type for alpha/beta in linear combination function
-
-  using DispatchPolicy = cutlass::gemm::MainloopSm80CpAsync<Stages{}>;
-
-  // Define strides (mixed)
-  using StrideA = cutlass::detail::TagToStrideA_t<LayoutA>;
-  using StrideB = cutlass::detail::TagToStrideB_t<LayoutB>;
-  using StrideC = cutlass::detail::TagToStrideC_t<LayoutC>;
-  using StrideD = cutlass::detail::TagToStrideC_t<LayoutD>;
-
-  using CollectiveEpilogue = cutlass::epilogue::collective::DefaultEpilogue<
-    StrideC,
-    StrideD,
-    EpilogueOp,
-    cutlass::gemm::EpilogueDefault>;
-
-  // Mainloop
-  using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma<
-    DispatchPolicy,
-    TileShape,
-    ElementInputA,
-    StrideA,
-    ElementInputB,
-    StrideB,
-    TiledMma,
-    GmemTiledCopyA, SmemLayoutAtomA, SmemCopyAtomA, cute::identity,  // A
-    GmemTiledCopyB, SmemLayoutAtomB, SmemCopyAtomB, cute::identity   // B
-  >;
-
-  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
-    Shape<int, int, int, int>,
-    CollectiveMainloop,
-    CollectiveEpilogue
-  >;
-
-  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
-
-  ExampleRunner<Gemm> runner;
-
-  runner.run(options, hw_info);
-
-  return 0;
-}
diff --git a/examples/sycl/ampere/gemm_configuration.hpp b/examples/sycl/ampere/gemm_configuration.hpp
deleted file mode 100644
index d0740af472..0000000000
--- a/examples/sycl/ampere/gemm_configuration.hpp
+++ /dev/null
@@ -1,197 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#include "cutlass/half.h"
-#include "cutlass/layout/layout.h"
-
-#include "cute/swizzle.hpp"
-#include "cute/layout.hpp"
-#include "cute/arch/copy_sm75.hpp"
-#include "cute/arch/copy_sm80.hpp"
-#include "cute/atom/copy_atom.hpp"
-
-using namespace cute;
-
-template <typename Element, typename Layout, int Alignment, int SizeK>
-struct DefaultGemm_TensorOpSm80_OperandA;
-
-template <typename Element, typename Layout, int Alignment, int SizeK>
-struct DefaultGemm_TensorOpSm80_OperandB;
-
-/////////////////////////////////////////////////////////////////////////
-
-// half
-
-/// Operand A - Row-major (K-Major)
-template <>
-struct DefaultGemm_TensorOpSm80_OperandA<half_t, cutlass::layout::RowMajor, 8, 64>
-{
-  // Smem
-  using SmemLayoutAtom = decltype(
-    composition(Swizzle<3,3,3>{},
-                Layout<Shape<_8,_64>,
-                       Stride<_64, _1>>{}));
-  using SmemCopyAtom = Copy_Atom<SM75_U32x4_LDSM_N, half_t>;
-
-  // Gmem
-  using GmemTiledCopy = decltype(
-    make_tiled_copy(Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<uint128_t>, half_t>{},
-                    Layout<Shape<_16,_8>,
                           Stride< _8,_1>>{},
-                    Layout<Shape<_1,_8>>{}));
-};
-
-/// Operand A - Column-major (M-major)
-template <int SizeK>
-struct DefaultGemm_TensorOpSm80_OperandA<half_t, cutlass::layout::ColumnMajor, 8, SizeK>
-{
-  // Smem
-  using SmemLayoutAtom = decltype(
-    composition(Swizzle<3,3,3>{},
-                Layout<Shape<_64,_8>,
-                       Stride< _1,_64>>{}));
-  using SmemCopyAtom = Copy_Atom<SM75_U16x8_LDSM_T, half_t>;
-
-  // Gmem
-  using GmemTiledCopy = decltype(
-    make_tiled_copy(Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<uint128_t>, half_t>{},
-                    Layout<Shape<_16,_8>,
-                           Stride< _1,_16>>{},
-                    Layout<Shape<_8,_1>>{}));
-};
-
-/// Operand A - Row-major (K-Major)
-template <>
-struct DefaultGemm_TensorOpSm80_OperandA<half_t, cutlass::layout::RowMajor, 4, 32>
-{
-  // Smem
-  using SmemLayoutAtom = decltype(
-    composition(Swizzle<2,3,3>{},
-                Layout<Shape<_8,_32>,
-                       Stride<_32, _1>>{}));
-  using SmemCopyAtom = Copy_Atom<SM75_U32x4_LDSM_N, half_t>;
-
-  // Gmem
-  using GmemTiledCopy = decltype(
-    make_tiled_copy(Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<uint64_t>, half_t>{},
-                    Layout<Shape<_32,_4>,
-                           Stride< _4,_1>>{},
-                    Layout<Shape<_1,_4>>{}));
-};
-
-// Because the F32F16 TiledMMA is A-B symmetric, we can reuse the DefaultOperands
-
-// Operand B - Column-Major (K-major)
-template <int Alignment, int SizeK>
-struct DefaultGemm_TensorOpSm80_OperandB<half_t, cutlass::layout::ColumnMajor, Alignment, SizeK>
-    : DefaultGemm_TensorOpSm80_OperandA<half_t, cutlass::layout::RowMajor, Alignment, SizeK>
-{};
-
-// Operand B - Row-Major (N-major)
-template <int Alignment, int SizeK>
-struct DefaultGemm_TensorOpSm80_OperandB<half_t, cutlass::layout::RowMajor, Alignment, SizeK>
-    : DefaultGemm_TensorOpSm80_OperandA<half_t, cutlass::layout::ColumnMajor, Alignment, SizeK>
-{};
-
-/////////////////////////////////////////////////////////////////////////
-
-// Bfloat
-
-/// Operand A - Row-major (K-Major)
-template <>
-struct DefaultGemm_TensorOpSm80_OperandA<bfloat16_t, cutlass::layout::RowMajor, 8, 64>
-{
-  // Smem
-  using SmemLayoutAtom = decltype(
-    composition(Swizzle<3,3,3>{},
-                Layout<Shape<_8,_64>,
-                       Stride<_64, _1>>{}));
-  using SmemCopyAtom = Copy_Atom<SM75_U32x4_LDSM_N, bfloat16_t>;
-
-  // Gmem
-  using GmemTiledCopy = decltype(
-    make_tiled_copy(Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<uint128_t>, bfloat16_t>{},
-                    Layout<Shape<_16,_8>,
-                           Stride< _8,_1>>{},
-                    Layout<Shape<_1,_8>>{}));
-};
-
-/// Operand A - Column-major (M-major)
-template <int SizeK>
-struct DefaultGemm_TensorOpSm80_OperandA<bfloat16_t, cutlass::layout::ColumnMajor, 8, SizeK>
-{
-  // Smem
-  using SmemLayoutAtom = decltype(
-    composition(Swizzle<3,3,3>{},
-                Layout<Shape<_64,_8>,
-                       Stride< _1,_64>>{}));
-  using SmemCopyAtom = Copy_Atom<SM75_U16x8_LDSM_T, bfloat16_t>;
-
-  // Gmem
-  using GmemTiledCopy = decltype(
-    make_tiled_copy(Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<uint128_t>, bfloat16_t>{},
-                    Layout<Shape<_16,_8>,
-                           Stride< _1,_16>>{},
-                    Layout<Shape<_8,_1>>{}));
-};
-
-/// Operand A - Row-major (K-Major)
-template <>
-struct DefaultGemm_TensorOpSm80_OperandA<bfloat16_t, cutlass::layout::RowMajor, 4, 32>
-{
-  // Smem
-  using SmemLayoutAtom = decltype(
-    composition(Swizzle<2,3,3>{},
-                Layout<Shape<_8,_32>,
-                       Stride<_32, _1>>{}));
-  using SmemCopyAtom = Copy_Atom<SM75_U32x4_LDSM_N, bfloat16_t>;
-
-  // Gmem
-  using GmemTiledCopy = decltype(
-    make_tiled_copy(Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<uint64_t>, bfloat16_t>{},
-                    Layout<Shape<_32,_4>,
-                           Stride< _4,_1>>{},
-                    Layout<Shape<_1,_4>>{}));
-};
-
-// Because the F32F16 TiledMMA is A-B symmetric, we can reuse the DefaultOperands
-
-// Operand B - Column-Major (K-major)
-template <int Alignment, int SizeK>
-struct DefaultGemm_TensorOpSm80_OperandB<bfloat16_t, cutlass::layout::ColumnMajor, Alignment, SizeK>
-    : DefaultGemm_TensorOpSm80_OperandA<bfloat16_t, cutlass::layout::RowMajor, Alignment, SizeK>
-{};
-
-// Operand B - Row-Major (N-major)
-template <int Alignment, int SizeK>
-struct DefaultGemm_TensorOpSm80_OperandB<bfloat16_t, cutlass::layout::RowMajor, Alignment, SizeK>
-    : DefaultGemm_TensorOpSm80_OperandA<bfloat16_t, cutlass::layout::ColumnMajor, Alignment, SizeK>
-{};
diff --git a/examples/sycl/common/example_runner.hpp b/examples/sycl/common/example_runner.hpp
deleted file mode 100644
index f300012caa..0000000000
--- a/examples/sycl/common/example_runner.hpp
+++ /dev/null
@@ -1,387 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#include "cutlass/gemm/device/gemm.h"
-#include "cutlass/epilogue/collective/default_epilogue.hpp"
-#include "cutlass/gemm/device/gemm_universal.h"
-#include "cutlass/gemm/device/gemm_universal_adapter.h"
-#include "cutlass/gemm/collective/collective_mma.hpp"
-#include "cutlass/util/GPU_Clock.hpp"
-
-#include "cutlass/util/host_tensor.h"
-#include "cutlass/util/reference/host/tensor_compare.h"
-#include "cutlass/util/reference/host/tensor_copy.h"
-#include "cutlass/util/reference/host/tensor_fill.h"
-#include "cute/tensor.hpp"
-
-#include "cutlass/util/command_line.h"
-#include "cutlass/util/device_memory.h"
-#include "cutlass/util/packed_stride.hpp"
-#include "cutlass/util/reference/device/gemm_complex.h"
-#include "cutlass/util/reference/device/tensor_compare.h"
-#include "cutlass/util/print_error.hpp"
-
-template <typename T>
-static void fill_matrix(std::vector<T> &M)
-{
-  std::generate(std::begin(M), std::end(M), [&]
-        { return static_cast<T>( 2*(rand() / double(RAND_MAX)) - 1 ); });
-}
-
-using namespace cute;
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Command line options parsing
-struct Options {
-
-  bool help;
-  bool error;
-
-  int m, n, k, l, iterations;
-  float alpha, beta;
-
-  Options():
-    help(false),
-    error(false),
-    m(4096), n(4096), k(4096), l(1), iterations(100),
-    alpha(1.f), beta(0.f)
-  { }
-
-  // Parses the command line
-  void parse(int argc, char const **args) {
-    cutlass::CommandLine cmd(argc, args);
-
-    if (cmd.check_cmd_line_flag("help")) {
-      help = true;
-      return;
-    }
-
-    cmd.get_cmd_line_argument("m", m, 4096);
-    cmd.get_cmd_line_argument("n", n, 4096);
-    cmd.get_cmd_line_argument("k", k, 4096);
-    cmd.get_cmd_line_argument("l", l, 1);
-    cmd.get_cmd_line_argument("alpha", alpha, 1.f);
-    cmd.get_cmd_line_argument("beta", beta, 0.f);
-    cmd.get_cmd_line_argument("iterations", iterations, 100);
-  }
-
-  /// Prints the usage statement.
-  std::ostream & print_usage(std::ostream &out) const {
-
-    out << "PVC GEMM Example\n\n"
-      << "Options:\n\n"
-      << "  --help                      If specified, displays this usage statement\n\n"
-      << "  --m=<int>                   Sets the M extent of the GEMM\n"
-      << "  --n=<int>                   Sets the N extent of the GEMM\n"
-      << "  --k=<int>                   Sets the K extent of the GEMM\n"
-      << "  --l=<int>                   Sets the L extent (batch count) of the GEMM\n"
-      << "  --alpha=<f32>               Epilogue scalar alpha\n"
-      << "  --beta=<f32>                Epilogue scalar beta\n\n"
-      << "  --iterations=<int>          Iterations\n\n";
-
-    return out;
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <class Gemm>
-struct ExampleRunner {
-
-  using StrideA = typename Gemm::GemmKernel::StrideA;
-  using StrideB = typename Gemm::GemmKernel::StrideB;
-  using StrideC = typename Gemm::GemmKernel::StrideC;
-  using StrideD = typename Gemm::GemmKernel::StrideD;
-
-  using LayoutA = typename Gemm::LayoutA;
-  using LayoutB = typename Gemm::LayoutB;
-  using LayoutC = typename Gemm::LayoutC;
-  using LayoutD = typename Gemm::LayoutD;
-
-  using ElementA = typename Gemm::ElementA;
-  using ElementB = typename Gemm::ElementB;
-  using ElementAcc = typename Gemm::ElementAccumulator;
-
-  using CollectiveEpilogue = typename Gemm::CollectiveEpilogue;
-  using ElementC = typename Gemm::ElementC;
-  using ElementOutput = typename CollectiveEpilogue::ElementOutput;
-  using ElementCompute = typename CollectiveEpilogue::ElementCompute;
-  using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator;
-
-  using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape;
-
-  //
-  // Data members
-  //
-
-  /// Initialization
-  StrideA stride_A;
-  StrideB stride_B;
-  StrideC stride_C;
-  StrideD stride_D;
-
-  cutlass::DeviceAllocation<ElementA> block_A;
-  cutlass::DeviceAllocation<ElementB> block_B;
-  cutlass::DeviceAllocation<ElementC> block_C;
-  cutlass::DeviceAllocation<ElementC> block_D;
-  cutlass::DeviceAllocation<ElementC> block_ref_D;
-
-  //
-  // Methods
-  //
-
-  bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) {
-    auto [M, N, K, L] = problem_size;
-
-    cutlass::TensorRef ref_A(block_A.get(), LayoutA::packed({M, K}));
-    cutlass::TensorRef ref_B(block_B.get(), LayoutB::packed({K, N}));
-    cutlass::TensorRef ref_C(block_C.get(), LayoutC::packed({M, N}));
-    cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N}));
-
-    cutlass::reference::device::GemmComplex(
-          {M, N, K},
-          alpha,
-          ref_A,
-          cutlass::ComplexTransform::kNone,
-          ref_B,
-          cutlass::ComplexTransform::kNone,
-          beta,
-          ref_C,
-          ref_D,
-          ElementAccumulator(0),
-          L,     // batch_count
-          M * K, // batch_stride_A
-          K * N, // batch_stride_B
-          M * N, // batch_stride_C
-          M * N  // batch_stride_D
-        );
-
-#if defined(CUTLASS_ENABLE_SYCL)
-    syclcompat::wait();
-#else
-    cudaDeviceSynchronize();
-#endif
-
-    // Check if output from CUTLASS kernel and reference kernel are relatively equal or not
-    // need to set a larger error margin for comparison to succeed
-    auto epsilon = static_cast<ElementOutput>(0.1f);
-    auto nonzero_floor = static_cast<ElementOutput>(0.1f);
-
-    bool passed = cutlass::reference::device::BlockCompareRelativelyEqual(
-      block_ref_D.get(), block_D.get(), block_D.size(),
-      epsilon, nonzero_floor);
-
-    return passed;
-  }
-
-  /// Initialize operands to be used in the GEMM and reference GEMM
-  virtual void initialize(const ProblemShapeType& problem_size) {
-    auto problem_shape_MNKL = cute::append<4>(problem_size, 1);
-    auto [M, N, K, L] = problem_shape_MNKL;
-
-    stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L));
-    stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L));
-    stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
-    stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
-
-    block_A.reset(M * K * L);
-    block_B.reset(K * N * L);
-    block_C.reset(M * N * L);
-    block_D.reset(M * N * L);
-    block_ref_D.reset(M * N * L);
-
-    // TODO: Enable initialization on device directly once RNG is
-    // available through SYCL.
-    std::vector<ElementA> a(K * M * L);
-    std::vector<ElementB> b(K * N * L);
-    std::vector<ElementC> c(M * N * L);
-    std::vector<ElementC> d(M * N * L, ElementC{0});
-
-    fill_matrix(a);
-    fill_matrix(b);
-    fill_matrix(c);
-
-    block_A.copy_from_host(a.data(), a.size());
-    block_B.copy_from_host(b.data(), b.size());
-    block_C.copy_from_host(c.data(), c.size());
-    block_D.copy_from_host(d.data(), d.size());
-    block_ref_D.copy_from_host(d.data(), d.size());
-  }
-
-  virtual void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) {
-    ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l};
-
-    initialize(problem_size);
-
-    typename Gemm::GemmKernel::Arguments arguments{
-      cutlass::gemm::GemmUniversalMode::kGemm,
-      problem_size,
-      {block_A.get(), stride_A, block_B.get(), stride_B},
-      {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D},
-      hw_info
-    };
-
-    Gemm gemm_op;
-
-    size_t workspace_size = Gemm::get_workspace_size(arguments);
-    cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
-
-    gemm_op.can_implement(arguments);
-
-    gemm_op.initialize(arguments, workspace.get());
-
-    // Run the GEMM
-    gemm_op.run();
-
-#if defined(CUTLASS_ENABLE_SYCL)
-    syclcompat::wait();
-#else
-    cudaDeviceSynchronize();
-#endif
-
-    // Verify that the result is correct
-    bool passed = verify(problem_size, options.alpha, options.beta);
-    std::cout << "Disposition: " << (passed ? "Passed" : "Failed") << std::endl;
-
-    if (passed && options.iterations > 0) {
-      GPU_Clock timer;
-      timer.start();
-      for (int i = 0; i < options.iterations; ++i) {
-        gemm_op.run();
-      }
-
-      float cute_time = timer.seconds() / options.iterations;
-      double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12;
-      std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l
-                << std::endl;
-      printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time * 1000);
-    }
-  }
-};
-
-template <class Gemm>
-struct PvcExampleRunner : ExampleRunner<Gemm> {
-  using Base = ExampleRunner<Gemm>;
-
-  using ElementB = typename Base::ElementB;
-
-  using ProblemShapeType = typename Base::ProblemShapeType;
-
-  cutlass::DeviceAllocation<ElementB> block_B_vnni;
-
-  template <typename T>
-  void vnni_matrix(
-      T* dst, const T* src,
-      int batch, int numRows, int numCols, int factor)
-  {
-    for (int b = 0; b < batch; b++) {
-      for (int r = 0; r < numRows / factor; r++) {
-        for (int c = 0; c < numCols; c++) {
-          for (int k = 0; k < factor; k++) {
-            dst[((b * (numRows / factor) + r) * numCols + c) * factor + k] =
-              src[((b * (numRows / factor) + r) * factor + k) * numCols + c];
-          }
-        }
-      }
-    }
-  }
-
-  void initialize(const ProblemShapeType& problem_size) override {
-    Base::initialize(problem_size);
-
-    auto problem_shape_MNKL = cute::append<4>(problem_size, 1);
-    auto [M, N, K, L] = problem_shape_MNKL;
-
-    block_B_vnni.reset(Base::block_B.size());
-
-    std::vector<ElementB> b(K * N * L);
-    std::vector<ElementB> b_vnni(b.size());
-
-    Base::block_B.copy_to_host(b.data());
-    vnni_matrix(b_vnni.data(), b.data(), L, K, N, 2);
-
-    block_B_vnni.copy_from_host(b_vnni.data());
-  }
-
-  void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) override {
-    ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l};
-
-    initialize(problem_size);
-
-    typename Gemm::GemmKernel::Arguments arguments{
-      cutlass::gemm::GemmUniversalMode::kGemm,
-      problem_size,
-      {Base::block_A.get(), Base::stride_A, block_B_vnni.get(), Base::stride_B},
-      {
-        {options.alpha, options.beta},
-        Base::block_C.get(), Base::stride_C, Base::block_D.get(), Base::stride_D
-      },
-      hw_info
-    };
-
-    Gemm gemm_op;
-
-    size_t workspace_size = Gemm::get_workspace_size(arguments);
-    cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
-
-    gemm_op.can_implement(arguments);
-
-    gemm_op.initialize(arguments, workspace.get());
-
-    // Run the GEMM
-    gemm_op.run();
-
-#if defined(CUTLASS_ENABLE_SYCL)
-    syclcompat::wait();
-#else
-    cudaDeviceSynchronize();
-#endif
-
-    // Verify that the result is correct
-    bool passed = Base::verify(problem_size, options.alpha, options.beta);
-    std::cout << "Disposition: " << (passed ? "Passed" : "Failed") << std::endl;
-
-    if (passed && options.iterations > 0) {
-      GPU_Clock timer;
-      timer.start();
-      for (int i = 0; i < options.iterations; ++i) {
-        gemm_op.run();
-      }
-
-      float cute_time = timer.seconds() / options.iterations;
-      double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12;
-      std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl;
-      printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000);
-    }
-  }
-};
-