diff --git a/.github/workflows/cuda_test.yml b/.github/workflows/cuda_test.yml
index 2b839dcfa7..c4908b943f 100644
--- a/.github/workflows/cuda_test.yml
+++ b/.github/workflows/cuda_test.yml
@@ -9,6 +9,10 @@ on:
 
 permissions: {}
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
 jobs:
   run-tests:
     name: Run cuda tests
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 33b255c942..77cfb712be 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -13,6 +13,10 @@ on:
 
 permissions: {}
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
 jobs:
   run-tests:
     name: Run tests
@@ -71,13 +75,4 @@ jobs:
         shell: bash
         run: |
           export LD_LIBRARY_PATH=~/dpcpp/lib/:$LD_LIBRARY_PATH
-          echo Run sgemm_1
-          ./examples/cute/tutorial/sgemm_1
-          echo Run sgemm_2
-          ./examples/cute/tutorial/sgemm_2
-          echo Run sgemm_sm70
-          ./examples/cute/tutorial/sgemm_sm70
-          echo Run sgemm_sm80
-          ./examples/cute/tutorial/sgemm_sm80
-          echo Run tiled_copy
-          ./examples/cute/tutorial/tiled_copy
+          cmake --build . --target test_examples -j 24
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9187927b13..4f67d0b123 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -94,6 +94,10 @@ endif()
 
 set(CUTLASS_ENABLE_SYCL OFF CACHE BOOL "Enable SYCL")
 set(CUTLASS_SYCL_PROFILING_ENABLED OFF CACHE BOOL "Use SYCL events to calculate device execution time")
+set(CUTLASS_SYCL_SWITCH_WG OFF CACHE BOOL "Enable SWITCH WG for GEMM on Intel PVC during benchmarking")
+if(CUTLASS_SYCL_SWITCH_WG)
+  add_compile_definitions(CUTLASS_SYCL_SWITCH_WG)
+endif()
 
 if (CUTLASS_ENABLE_SYCL)
   set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
diff --git a/benchmarks/pvc/benchmarks.hpp b/benchmarks/pvc/benchmarks.hpp
index a8ebc0b678..3745a01085 100644
--- a/benchmarks/pvc/benchmarks.hpp
+++ b/benchmarks/pvc/benchmarks.hpp
@@ -34,15 +34,62 @@
 #include "../benchmark_runner.hpp"
 #include "gemm_configuration.hpp"
 
-using PvcGemmBF16BF16FP32_RRR = cutlass::gemm::device::GemmConfiguration<
+using MMAAtom = MMA_Atom;
+using PvcGemmBF16BF16FP32_RRR_1 = cutlass::gemm::device::GemmConfiguration<
     cutlass::arch::IntelPVC,
     cutlass::bfloat16_t, cutlass::layout::RowMajor,
     cutlass::bfloat16_t, cutlass::layout::RowMajor,
     float, cutlass::layout::RowMajor,
-    float>;
+    float, Shape<_256, _256, _32>,
+    TiledMMA>>,
+    XE_2D_U16x32x32_LD_N, XE_2D_U16x32x32_LD_V>;
 
-CUTLASS_CREATE_GEMM_BENCHMARK(PvcGemmBF16BF16FP32_RRR);
+using PvcGemmBF16BF16FP32_RRR_2 = cutlass::gemm::device::GemmConfiguration<
+    cutlass::arch::IntelPVC,
+    cutlass::bfloat16_t, cutlass::layout::RowMajor,
+    cutlass::bfloat16_t, cutlass::layout::RowMajor,
+    float, cutlass::layout::RowMajor,
+    float, Shape<_128, _512, _32>,
+    TiledMMA>>,
+    XE_2D_U16x32x32_LD_N, XE_2D_U16x32x32_LD_V>;
+
+using PvcGemmBF16BF16FP32_RRR_3 = cutlass::gemm::device::GemmConfiguration<
+    cutlass::arch::IntelPVC,
+    cutlass::bfloat16_t, cutlass::layout::RowMajor,
+    cutlass::bfloat16_t, cutlass::layout::RowMajor,
+    float, cutlass::layout::RowMajor,
+    float, Shape<_256, _128, _32>,
+    TiledMMA>>,
+    XE_2D_U16x32x32_LD_N, XE_2D_U16x32x32_LD_V>;
+
+using PvcGemmBF16BF16FP32_RRR_4 = cutlass::gemm::device::GemmConfiguration<
+    cutlass::arch::IntelPVC,
+    cutlass::bfloat16_t, cutlass::layout::RowMajor,
+    cutlass::bfloat16_t, cutlass::layout::RowMajor,
+    float, cutlass::layout::RowMajor,
+    float, Shape<_128, _256, _16>,
+    TiledMMA>>,
+    XE_2D_U16x32x16_LD_N, XE_2D_U16x16x32_LD_V>;
+
+using PvcGemmBF16BF16FP32_RRR_5 = cutlass::gemm::device::GemmConfiguration<
+    cutlass::arch::IntelPVC,
+    cutlass::bfloat16_t, cutlass::layout::RowMajor,
+    cutlass::bfloat16_t, cutlass::layout::RowMajor,
+    float, cutlass::layout::RowMajor,
+    float, Shape<_8, _128, _32>,
+    TiledMMA>>,
+    XE_2D_U16x8x32_LD_N, XE_2D_U16x32x32_LD_V>;
+
+CUTLASS_CREATE_GEMM_BENCHMARK(PvcGemmBF16BF16FP32_RRR_1);
+CUTLASS_CREATE_GEMM_BENCHMARK(PvcGemmBF16BF16FP32_RRR_2);
+CUTLASS_CREATE_GEMM_BENCHMARK(PvcGemmBF16BF16FP32_RRR_3);
+CUTLASS_CREATE_GEMM_BENCHMARK(PvcGemmBF16BF16FP32_RRR_4);
+CUTLASS_CREATE_GEMM_BENCHMARK(PvcGemmBF16BF16FP32_RRR_5);
 
 static void register_benchmarks() {
-  CUTLASS_BENCHMARK(PvcGemmBF16BF16FP32_RRR);
+  CUTLASS_BENCHMARK(PvcGemmBF16BF16FP32_RRR_1);
+  CUTLASS_BENCHMARK(PvcGemmBF16BF16FP32_RRR_2);
+  CUTLASS_BENCHMARK(PvcGemmBF16BF16FP32_RRR_3);
+  CUTLASS_BENCHMARK(PvcGemmBF16BF16FP32_RRR_4);
+  CUTLASS_BENCHMARK(PvcGemmBF16BF16FP32_RRR_5);
 }
diff --git a/benchmarks/pvc/gemm_configuration.hpp b/benchmarks/pvc/gemm_configuration.hpp
index 3a07857f5e..c687c76afe 100644
--- a/benchmarks/pvc/gemm_configuration.hpp
+++ b/benchmarks/pvc/gemm_configuration.hpp
@@ -57,7 +57,9 @@ template<
   class ElementA, class LayoutA,
   class ElementB, class LayoutB,
   class ElementC, class LayoutC,
-  class ElementAccumulator>
+  class ElementAccumulator,
+  class TileShape, class TiledMma,
+  class GmemTiledCopyA, class GmemTiledCopyB>
 struct GemmConfiguration {
   static_assert(sizeof(ElementA) == 0, "No valid GemmConfiguration configuration exists.");
 };
@@ -66,47 +68,16 @@ struct GemmConfiguration {
 
 // bfloat16
 
-namespace detail {
-
-template
-struct Gemm_OperandA;
-
-template
-struct Gemm_OperandB;
-
-template<>
-struct Gemm_OperandA {
-  using GmemTiledCopy = XE_2D_U16x32x32_LD_N;
-};
-
-template<>
-struct Gemm_OperandB {
-  using GmemTiledCopy = XE_2D_U16x32x32_LD_V;
-};
-
-} // namespace details
-
-template
+template
 struct GemmConfiguration<
     arch::IntelPVC,
     bfloat16_t, LayoutA,
     bfloat16_t, LayoutB,
     float, LayoutC,
-    float> {
-  using TileShape = Shape<_256, _256, _32>;
-  using DispatchPolicy = MainloopIntelPVC<3>;;
-  using TiledMma = TiledMMA<
-      MMA_Atom,
-      Layout>,
-      Tile<_64,_64,_32>>;
-
-  // A
-  using OperandA = detail::Gemm_OperandA;
-  using GmemTiledCopyA = typename OperandA::GmemTiledCopy;
-
-  // B
-  using OperandB = detail::Gemm_OperandB;
-  using GmemTiledCopyB = typename OperandB::GmemTiledCopy;
+    float, TileShape, TiledMma,
+    GmemTiledCopyA, GmemTiledCopyB> {
+  using DispatchPolicy = MainloopIntelPVC<3>;
 
   // Mainloop
   using CollectiveMainloop = collective::CollectiveMma<
diff --git a/benchmarks/pvc/input.in b/benchmarks/pvc/input.in
index 8e68fcd566..4f5d47648c 100644
--- a/benchmarks/pvc/input.in
+++ b/benchmarks/pvc/input.in
@@ -1,17 +1,23 @@
 # BFloat16 benchmarks
-PvcGemmBF16BF16FP32_RRR --bm_name=bf16_bf16_fp32 --l=1 --m=512 --k=8192 --n=8192
-PvcGemmBF16BF16FP32_RRR --bm_name=bf16_bf16_fp32 --l=1 --m=512 --k=8192 --n=32768
-PvcGemmBF16BF16FP32_RRR --bm_name=bf16_bf16_fp32 --l=1 --m=512 --k=32768 --n=8192
-PvcGemmBF16BF16FP32_RRR --bm_name=bf16_bf16_fp32 --l=1 --m=16384 --k=8192 --n=1024
-PvcGemmBF16BF16FP32_RRR --bm_name=bf16_bf16_fp32 --l=1 --m=16384 --k=1024 --n=8192
-PvcGemmBF16BF16FP32_RRR --bm_name=bf16_bf16_fp32 --l=1 --m=16384 --k=8192 --n=4096
-PvcGemmBF16BF16FP32_RRR --bm_name=bf16_bf16_fp32 --l=1 --m=16384 --k=4096 --n=8192
-PvcGemmBF16BF16FP32_RRR --bm_name=bf16_bf16_fp32 --l=1 --m=4096 --k=16384 --n=8192
-PvcGemmBF16BF16FP32_RRR --bm_name=bf16_bf16_fp32 --l=1 --m=8192 --k=16384 --n=4096
-PvcGemmBF16BF16FP32_RRR --bm_name=bf16_bf16_fp32 --l=1 --m=1024 --k=16384 --n=8192
-PvcGemmBF16BF16FP32_RRR --bm_name=bf16_bf16_fp32 --l=1 --m=8192 --k=16384 --n=1024
-PvcGemmBF16BF16FP32_RRR --bm_name=bf16_bf16_fp32 --l=4096 --m=8 --k=128 --n=16384
-PvcGemmBF16BF16FP32_RRR --bm_name=bf16_bf16_fp32 --l=4096 --m=8 --k=16384 --n=128
-PvcGemmBF16BF16FP32_RRR --bm_name=bf16_bf16_fp32 --l=4 --m=32768 --k=128 --n=4096
-PvcGemmBF16BF16FP32_RRR --bm_name=bf16_bf16_fp32 --l=4 --m=32768 --k=4096 --n=128
-PvcGemmBF16BF16FP32_RRR --bm_name=bf16_bf16_fp32 --l=32 --m=4096 --k=4096 --n=128
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=1 --m=4096 --k=4096 --n=4096
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=1 --m=8192 --k=8192 --n=8192
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=1 --m=1 --k=5120 --n=13824
+PvcGemmBF16BF16FP32_RRR_2 --bm_name=bf16_bf16_fp32 --l=1 --m=1024 --k=28672 --n=8192
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=1 --m=3072 --k=4096 --n=3072
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=1 --m=4 --k=4096 --n=12288
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=1 --m=512 --k=8192 --n=8192
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=1 --m=512 --k=8192 --n=32768
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=1 --m=512 --k=32768 --n=8192
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=1 --m=16384 --k=8192 --n=1024
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=1 --m=16384 --k=1024 --n=8192
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=1 --m=16384 --k=8192 --n=4096
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=1 --m=16384 --k=4096 --n=8192
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=1 --m=4096 --k=16384 --n=8192
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=1 --m=8192 --k=16384 --n=4096
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=1 --m=1024 --k=16384 --n=8192
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=1 --m=8192 --k=16384 --n=1024
+PvcGemmBF16BF16FP32_RRR_4 --bm_name=bf16_bf16_fp32 --l=4096 --m=8 --k=128 --n=16384
+PvcGemmBF16BF16FP32_RRR_5 --bm_name=bf16_bf16_fp32 --l=4096 --m=8 --k=16384 --n=128
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=4 --m=32768 --k=128 --n=4096
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=4 --m=32768 --k=4096 --n=128
+PvcGemmBF16BF16FP32_RRR_3 --bm_name=bf16_bf16_fp32 --l=32 --m=4096 --k=4096 --n=128
diff --git a/examples/35_gemm_softmax/CMakeLists.txt b/examples/35_gemm_softmax/CMakeLists.txt
index b7ecd99fcc..824453a656 100644
--- a/examples/35_gemm_softmax/CMakeLists.txt
+++ b/examples/35_gemm_softmax/CMakeLists.txt
@@ -29,8 +29,14 @@
 
 
+if (NOT CUTLASS_ENABLE_SYCL)
 cutlass_example_add_executable(
   35_gemm_softmax
   gemm_softmax.cu
 )
-
+else()
+cutlass_example_add_executable(
+  35_gemm_online_softmax
+  gemm_online_softmax.cpp
+  )
+endif()
 
diff --git a/examples/35_gemm_softmax/gemm_online_softmax.cpp b/examples/35_gemm_softmax/gemm_online_softmax.cpp
new file mode 100644
index 0000000000..b8844286fb
--- /dev/null
+++ b/examples/35_gemm_softmax/gemm_online_softmax.cpp
@@ -0,0 +1,604 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief GEMM + Softmax example using Cute and CUTLASS 3.x APIs for NVIDIA Ampere architecture + + This example demonstrate how to instantiate and run a TF32 GEMM using the Cute and + CUTLASS 3.x APIs on NVIDIA Ampere architecture. Please check example 07 and 08 for + the basics of tensor op gemm kernels. 
+*/ + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/gemm/device/gemm_universal.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/collective/collective_mma.hpp" +#include "cutlass/epilogue/collective/default_epilogue.hpp" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/reference/device/gemm_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#if defined(CUTLASS_ENABLE_SYCL) +#include "cutlass/util/reference/device/sycl_tensor_fill.h" +#else +#include "cutlass/util/reference/device/tensor_fill.h" +#endif +#include "helper.h" +#include "softmax_epilogue.hpp" +#include "gemm_softmax_adapter.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +using namespace cute; + +/// Result structure +struct Result { + + double avg_runtime_ms; + double gflops; + bool passed; + + // + // Methods + // + + Result( + double avg_runtime_ms = 0, + double gflops = 0) + : + avg_runtime_ms(avg_runtime_ms), gflops(gflops), passed(false) + {} +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + + int m, n, k, l; + float alpha, beta; + int iterations; + float tolerance; + + Options(): + help(false), + m(5120), n(4096), k(4096), l(1), + alpha(1), beta(0), + iterations(100), + tolerance(1e-5f) + { } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("m", m, 5120); + cmd.get_cmd_line_argument("n", n, 4096); + cmd.get_cmd_line_argument("k", k, 4096); + cmd.get_cmd_line_argument("l", l, 1); + cmd.get_cmd_line_argument("alpha", alpha, 1.f); + cmd.get_cmd_line_argument("beta", beta, 0.f); + cmd.get_cmd_line_argument("iterations", iterations); + cmd.get_cmd_line_argument("tolerance", tolerance); + + } + + /// Prints the usage statement. 
+ std::ostream & print_usage(std::ostream &out) const { + + out << "35_gemm_online_softmax example\n\n" + << " This example uses the CUTLASS Library to execute TF32 tensorop GEMM computations.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement.\n\n" + << " --m= Sets the M extent of the GEMM\n" + << " --n= Sets the N extent of the GEMM\n" + << " --k= Sets the K extent of the GEMM\n" + << " --l= Sets the L extent (batch count) of the GEMM\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n" + << " --iterations= Number of profiling iterations to perform.\n\n" + << " --tolerance Error tolerance\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const + { + // Two flops per multiply-add + uint64_t flop = uint64_t(2) * m * n * k * l; + double gflop = double(flop) / double(1.0e9); + return gflop / runtime_s; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Helper to initialize a block of device data +template +bool initialize_block( + cutlass::DeviceAllocation& block, + uint64_t seed=2023) { + + Element scope_max, scope_min; + int bits_input = cutlass::sizeof_bits::value; + + if (bits_input == 1) { + scope_max = 2; + scope_min = 0; + } else if (bits_input <= 8) { + scope_max = 2; + scope_min = -2; + } else { + scope_max = 8; + scope_min = -8; + } + + cutlass::reference::device::BlockFillRandomUniform( + block.get(), block.size(), seed, scope_max, scope_min, 0); + return true; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Wrapper to run and verify a GEMM. +template < + class Gemm +> +struct ExampleRunner { + + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideC = typename Gemm::GemmKernel::StrideC; + using StrideD = typename Gemm::GemmKernel::StrideD; + using StridePartials = typename Gemm::CollectiveEpilogue::StrideD; + + using LayoutA = typename Gemm::LayoutA; + using LayoutB = typename Gemm::LayoutB; + using LayoutC = typename Gemm::LayoutC; + using LayoutD = typename Gemm::LayoutD; + using LayoutPartials = typename Gemm::LayoutPartials; + + using ElementA = typename Gemm::ElementA; + using ElementB = typename Gemm::ElementB; + using ElementC = typename Gemm::ElementC; + using ElementD = typename Gemm::ElementD; + using ElementAcc = typename Gemm::ElementAccumulator; + + using CollectiveEpilogue = typename Gemm::CollectiveEpilogue; + using ElementOutput = typename CollectiveEpilogue::ElementOutput; + using ElementCompute = typename CollectiveEpilogue::ElementCompute; + using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator; + + using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape; + + // + // Data members + // + + /// Initialization + StrideA stride_A; + StrideB stride_B; + StrideC stride_C; + StrideD stride_D; + StridePartials stride_partials; + uint64_t seed = 0; + + cutlass::DeviceAllocation block_A; + cutlass::DeviceAllocation block_B; + cutlass::DeviceAllocation block_C; + cutlass::DeviceAllocation block_max; + cutlass::DeviceAllocation block_sum; + cutlass::DeviceAllocation block_D; + cutlass::DeviceAllocation block_ref_D; + + // + // Methods + // + template + bool verify_tensor(std::vector vector_Input, + std::vector vector_Input_Ref, const Options& options) { + + auto size = int64_t((vector_Input.size() < vector_Input_Ref.size()) ? 
vector_Input.size() : vector_Input_Ref.size()); + float abs_tol = options.tolerance; + float rel_tol = options.tolerance; + + for (int64_t i = 0; i < size; ++i) { + float diff = (float)(vector_Input.at(i) - vector_Input_Ref.at(i)); + float abs_diff = fabs(diff); + float abs_ref = fabs((float)vector_Input_Ref.at(i)); + float relative_diff = abs_ref > abs_tol ? abs_diff / abs_ref : 0; + if ( (isnan(abs_diff) || isinf(abs_diff)) || (abs_diff > abs_tol && relative_diff > rel_tol)) { + printf("i = %d diff = %f, {%f, %f}.\n", i, abs_diff, (float)(vector_Input.at(i)), (float)(vector_Input_Ref.at(i))); + return false; + } + + } + + return true; + } + + /// Verifies the reference matches + bool verify(const Options& options) { + using ElementSoftmax = ElementD; + + cutlass::gemm::GemmCoord problem_size = cutlass::gemm::GemmCoord{options.m, options.n, options.k}; + + int64_t total_elements_A_per_batch = options.m * options.k; + int64_t total_elements_B_per_batch = options.k * options.n; + int64_t total_elements_C_per_batch = options.m * options.n; + int64_t total_elements_D_per_batch = total_elements_C_per_batch; + + int64_t lda = LayoutA::packed({options.m, options.k}).stride(0); + int64_t ldb = LayoutB::packed({options.k, options.n}).stride(0); + int64_t ldc = LayoutC::packed({options.m, options.n}).stride(0); + + int64_t ldn = options.m; + int64_t lds = ldn; + + LayoutA layout_A(lda); + LayoutB layout_B(ldb); + LayoutC layout_C(ldc); + LayoutPartials Layout_N(ldn); + LayoutPartials Layout_S(lds); + + cutlass::MatrixCoord extent_A{options.m, options.k}; + cutlass::MatrixCoord extent_B{options.k, options.n}; + cutlass::MatrixCoord extent_C{options.m, options.n}; + + cutlass::HostTensor reference_N; + reference_N.reset({options.m, 1}, false); + + for (int batch_idx = 0; batch_idx < options.l; batch_idx++) { + cutlass::TensorView view_A(block_A.get() + total_elements_A_per_batch * batch_idx, layout_A, extent_A); + cutlass::TensorView view_B(block_B.get() + total_elements_B_per_batch * batch_idx, layout_B, extent_B); + cutlass::TensorView view_C(block_C.get() + total_elements_C_per_batch * batch_idx, layout_C, extent_C); + cutlass::TensorView view_Ref_device(block_ref_D.get(), layout_C, extent_C); + + cutlass::reference::device::GemmComplex< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, ElementCompute, ElementD + >( + problem_size, + options.alpha, + view_A, + cutlass::ComplexTransform::kNone, + view_B, + cutlass::ComplexTransform::kNone, + options.beta, + view_C, + view_Ref_device, + ElementCompute(0) + ); + + // Copy reference results to host memory for verification + std::vector matrix_D_Ref(layout_C.capacity(extent_C)); + cutlass::device_memory::copy_to_host(matrix_D_Ref.data(), block_ref_D.get(), matrix_D_Ref.size()); + cutlass::TensorView view_D_Ref(matrix_D_Ref.data(), layout_C, extent_C); + + std::vector matrix_Softmax_Ref(layout_C.capacity(extent_C)); + cutlass::TensorView view_Softmax_Ref(matrix_Softmax_Ref.data(), layout_C, extent_C); + + // Copy computed results to host memory + std::vector matrix_D(layout_C.capacity(extent_C)); + cutlass::device_memory::copy_to_host(matrix_D.data(), block_D.get() + total_elements_D_per_batch * batch_idx, matrix_D.size()); + + // Compute the norm + for (int m = 0; m < options.m; ++m) { + reference_N.at({m, 0}) = view_D_Ref.ref().at({m, 0}); + for (int n = 1; n < options.n; ++n) { + reference_N.at({m, 0}) = std::max(reference_N.at({m, 0}), ElementSoftmax(view_D_Ref.ref().at({m, n}))); + } + } + + // Compute softmax + for 
(int m = 0; m < options.m; ++m) { + float sum = 0; + for (int n = 0; n < options.n; ++n) { + sum += std::exp( float(view_D_Ref.ref().at({m, n})) - float(reference_N.at({m, 0})) ); + } + float inv_sum = float(1.0f / sum); + + for (int n = 0; n < options.n; ++n) { + view_Softmax_Ref.ref().at({m, n}) = ElementSoftmax( + std::exp( float(view_D_Ref.ref().at({m, n})) - float(reference_N.at({m, 0})) ) * inv_sum + ); + } + } + + bool verified_Softmax = verify_tensor(matrix_D, matrix_Softmax_Ref, options); + if (!verified_Softmax) { + std::cerr << "Verification of Softmax tensor failed\n"; + return false; + } + } + return true; + } + + /// Initialize operands to be used in the GEMM and reference GEMM + void initialize(const ProblemShapeType& problem_size) { + auto problem_shape_MNKL = cute::append<4>(problem_size, 1); + auto [M, N, K, L] = problem_shape_MNKL; + + auto partials_N = cute::ceil_div(N, cute::shape<1>(typename Gemm::TileShape{})); + auto partials_size = M * partials_N * L; + + stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); + stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); + stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); + stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); + stride_partials = cutlass::make_cute_packed_stride(StridePartials{}, cute::make_shape(M, partials_N, L)); + + block_A.reset(M * K * L); + block_B.reset(K * N * L); + block_C.reset(M * N * L); + block_D.reset(M * N * L); + block_ref_D.reset(M * N * L); + block_sum.reset(partials_size); + block_max.reset(partials_size); + + initialize_block(block_A, seed + 2023); + initialize_block(block_B, seed + 2022); + initialize_block(block_C, seed + 2021); + } + + void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) { + ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; + + initialize(problem_size); + + typename Gemm::GemmKernel::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem_size, + {block_A.get(), stride_A, block_B.get(), stride_B}, + {{options.alpha, + options.beta}, + block_C.get(), stride_C, + block_D.get(), stride_D, + block_max.get(), block_sum.get(), stride_partials}, + hw_info + }; + + Gemm gemm_op; + + size_t workspace_size = Gemm::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + CUTLASS_CHECK(gemm_op.can_implement(arguments)); + + CUTLASS_CHECK(gemm_op.initialize(arguments, workspace.get())); + + // Run the GEMM + CUTLASS_CHECK(gemm_op.run()); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + Result result; + result.passed = verify(options); + + std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl; + + if (!result.passed) { + exit(-1); + } + + // Run profiling loop + if (options.iterations > 0) + { + GpuTimer timer; + timer.start(); + for (int iter = 0; iter < options.iterations; ++iter) { + CUTLASS_CHECK(gemm_op.run()); + } + timer.stop(); + + // Compute average runtime and GFLOPs. 
+ float elapsed_ms = timer.elapsed_millis(); + result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); + result.gflops = options.gflops(result.avg_runtime_ms / 1000.0); + + std::cout << " Problem Size: " << options.m << 'x' << options.n << 'x' + << options.k << 'x' << options.l << std::endl; + std::cout << " Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl; + std::cout << " GFLOPS: " << result.gflops << std::endl; + } + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + // Ampere Tensor Core operations exposed with mma.sync and ldmatrix are first available + // in CUDA 11.0. + // + // CUTLASS must be compiled with CUDA 11.0 Toolkit to run these examples. +#if !defined(CUTLASS_ENABLE_SYCL) + if (!(__CUDACC_VER_MAJOR__ >= 11)) { + std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl; + return 0; + } + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (!((props.major * 10 + props.minor) >= 80)) { + std::cerr << "Ampere Tensor Core operations must be run on a machine with compute capability at least 80." + << std::endl; + return 0; + } +#endif + + // + // Parse options + // + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + // + // Run examples + // + + // The KernelHardwareInfo struct holds the number of SMs on the GPU with a given device ID. + // This information is used by the underlying kernel. + cutlass::KernelHardwareInfo hw_info; + + // Change device_id to another value if you are running on a machine with multiple GPUs and wish + // to use a GPU other than that with device ID 0. + hw_info.device_id = 0; + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); + + // Problem configuration + using ElementA = float; + using ElementB = float; + using ElementAcc = float; + using ElementOutput = float; + + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::ColumnMajor; + using LayoutC = cutlass::layout::ColumnMajor; + using LayoutD = cutlass::layout::ColumnMajor; + using LayoutPartials = cutlass::layout::ColumnMajor; + + // Tiling configuration selection + using TileShape = Shape<_128,_128,_32>; + + // + // Assembling the CollectiveMainloop type + // + + // Number of pipelines you want to use + constexpr int PipelineStages = 4; + + using DispatchPolicy = cutlass::gemm::MainloopSm80CpAsync; + + // This code section describes the MMA op and the tile size a warp will compute + using TiledMma = TiledMMA< + MMA_Atom, + Layout, Stride<_2,_1,_1>>, // 2x2x1 thread group + Tile<_32,_32,_8>>; // 32x32x8 MMA for LDSM, 1x2x1 value group + + // Define the copy layout and atom for device memory copy. + using GmemTiledCopyA = decltype( + make_tiled_copy(Copy_Atom, float>{}, + Layout, Stride<_1,_16>>{}, + Layout>{})); + + using GmemTiledCopyB = decltype( + make_tiled_copy(Copy_Atom, float>{}, + Layout, Stride<_8,_1>>{}, + Layout>{})); + + // Define the copy layout and atom for shared memory copy. 
+ using SmemLayoutAtomA = decltype(composition(Swizzle<2,3,2>{}, Layout, Stride< _1,_32>>{})); + using SmemCopyAtomA = Copy_Atom, float>; + + using SmemLayoutAtomB = decltype(composition(Swizzle<3,2,3>{}, Layout, Stride<_32, _1>>{})); + using SmemCopyAtomB = Copy_Atom; + + // Mainloop + using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< + DispatchPolicy, + TileShape, + ElementA, + cutlass::detail::TagToStrideA_t, + ElementB, + cutlass::detail::TagToStrideB_t, + TiledMma, + GmemTiledCopyA, SmemLayoutAtomA, SmemCopyAtomA, cute::identity, // A + GmemTiledCopyB, SmemLayoutAtomB, SmemCopyAtomB, cute::identity // B + >; + + // + // Assembling the Collective Epilogue Type + // + + using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, // <- data type of output matrix + 128 / cutlass::sizeof_bits::value, // <- the number of elements per vectorized + // memory access. For a byte, it's 16 + // elements. This becomes the vector width of + // math instructions in the epilogue too + ElementAcc, // <- data type of accumulator + ElementAcc>; // <- data type for alpha/beta in linear combination function + + using CollectiveEpilogue = cutlass::epilogue::collective::SoftmaxEpilogue< + cutlass::detail::TagToStrideC_t, + cutlass::detail::TagToStrideC_t, + cutlass::detail::TagToStrideC_t, + TileShape, + EpilogueOp, + cutlass::gemm::EpilogueDefault>; + + // + // Assembling the GemmKernel + // + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, + CollectiveMainloop, + CollectiveEpilogue + >; + + using Gemm = cutlass::gemm::device::GemmSoftmaxAdapter; + + ExampleRunner runner; + runner.run(options, hw_info); + + return 0; +} diff --git a/examples/35_gemm_softmax/gemm_softmax_adapter.hpp b/examples/35_gemm_softmax/gemm_softmax_adapter.hpp new file mode 100644 index 0000000000..13e0369e4e --- /dev/null +++ b/examples/35_gemm_softmax/gemm_softmax_adapter.hpp @@ -0,0 +1,543 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! + \file + \brief The universal GEMM accommodates serial reductions, parallel reductions, batched strided, and + batched array variants. +*/ + +#pragma once + +// common +#include "cutlass/cutlass.h" +#include "cutlass/device_kernel.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/detail/layout.hpp" +#include "cutlass/detail/mma.hpp" +#include "cutlass/cuda_host_adapter.hpp" + +#if !defined(__CUDACC_RTC__) +#include "cutlass/cluster_launch.hpp" +#include "cutlass/trace.h" +#endif // !defined(__CUDACC_RTC__) + +// 3.x +#include "cutlass/gemm/kernel/gemm_universal.hpp" + +#if defined(CUTLASS_ENABLE_SYCL) +#include "cutlass/util/sycl_event_manager.hpp" +#endif + +#include "softmax_finalize.hpp" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::device { + +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +////////////////////////////// CUTLASS 3.x API ///////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// + +template +class GemmSoftmaxAdapter +{ +public: + using GemmKernel = GemmKernel_; + using TileShape = typename GemmKernel::TileShape; + using ElementA = typename GemmKernel::ElementA; + using ElementB = typename GemmKernel::ElementB; + using ElementC = typename GemmKernel::ElementC; + using ElementD = typename GemmKernel::ElementD; + using ElementAccumulator = typename GemmKernel::ElementAccumulator; + using DispatchPolicy = typename GemmKernel::DispatchPolicy; + using CollectiveMainloop = typename GemmKernel::CollectiveMainloop; + using CollectiveEpilogue = typename GemmKernel::CollectiveEpilogue; + + using SoftmaxFinalizeKernel = reduction::kernel::SoftmaxFinalize< + ElementD, typename GemmKernel::StrideD, + ElementAccumulator, typename GemmKernel::CollectiveEpilogue::StridePartials, + ElementD, typename GemmKernel::StrideD>; + + // Map back to 2.x type as best as possible + using LayoutA = gemm::detail::StrideToLayoutTagA_t; + using LayoutB = gemm::detail::StrideToLayoutTagB_t; + using LayoutC = gemm::detail::StrideToLayoutTagC_t; + using LayoutD = gemm::detail::StrideToLayoutTagC_t; + using LayoutPartials = gemm::detail::StrideToLayoutTagC_t; + + static bool const kEnableCudaHostAdapter = CUTLASS_ENABLE_CUDA_HOST_ADAPTER; + + static ComplexTransform const kTransformA = cute::is_same_v ? + ComplexTransform::kConjugate : ComplexTransform::kNone; + static ComplexTransform const kTransformB = cute::is_same_v ? 
+ ComplexTransform::kConjugate : ComplexTransform::kNone; + + // Legacy: Assume MultiplyAdd only since we do not use this tag type in 3.0 + using MathOperator = cutlass::arch::OpMultiplyAdd; + + using OperatorClass = cutlass::detail::get_operator_class_t; + + using ArchTag = typename GemmKernel::ArchTag; + + // NOTE: Assume identity swizzle for now + using ThreadblockSwizzle = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; + + // Assume TiledMma's ShapeMNK is the same as 2.x's ThreadblockShape + using ThreadblockShape = cutlass::gemm::GemmShape< + cute::size<0>(TileShape{}), + cute::size<1>(TileShape{}), + cute::size<2>(TileShape{})>; + + using ClusterShape = cutlass::gemm::GemmShape< + cute::size<0>(typename GemmKernel::DispatchPolicy::ClusterShape{}), + cute::size<1>(typename GemmKernel::DispatchPolicy::ClusterShape{}), + cute::size<2>(typename GemmKernel::DispatchPolicy::ClusterShape{})>; + + // Instruction shape is easy too, since we get that directly from our TiledMma's atom shape + using InstructionShape = cutlass::gemm::GemmShape< + cute::size<0>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{}), + cute::size<1>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{}), + cute::size<2>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{})>; + + // Legacy: provide a correct warp count, but no reliable warp shape + static int const kThreadCount = GemmKernel::MaxThreadsPerBlock; + + // Warp shape is not a primary API type in 3.x + // But we can best approximate it by inspecting the TiledMma + // For this, we make the assumption that we always have 4 warps along M, and rest along N, none along K + // We also always round up the warp count to 4 if the tiled mma is smaller than 128 threads + static constexpr int WarpsInMma = cute::max(4, CUTE_STATIC_V(cute::size(typename GemmKernel::TiledMma{})) / 32); + static constexpr int WarpsInMmaM = 4; + static constexpr int WarpsInMmaN = cute::ceil_div(WarpsInMma, WarpsInMmaM); + using WarpCount = cutlass::gemm::GemmShape; + using WarpShape = cutlass::gemm::GemmShape< + CUTE_STATIC_V(cute::tile_size<0>(typename CollectiveMainloop::TiledMma{})) / WarpsInMmaM, + CUTE_STATIC_V(cute::tile_size<1>(typename CollectiveMainloop::TiledMma{})) / WarpsInMmaN, + CUTE_STATIC_V(cute::tile_size<2>(typename CollectiveMainloop::TiledMma{}))>; + + static int constexpr kStages = CollectiveMainloop::DispatchPolicy::Stages; + + // Inspect TiledCopy for A and B to compute the alignment size + static int constexpr kAlignmentA = cutlass::detail::get_alignment_count_from_gmem_tiled_copy< + typename CollectiveMainloop::GmemTiledCopyA, ElementA, typename CollectiveMainloop::TiledMma::ValTypeA>(); + static int constexpr kAlignmentB = cutlass::detail::get_alignment_count_from_gmem_tiled_copy< + typename CollectiveMainloop::GmemTiledCopyB, ElementB, typename CollectiveMainloop::TiledMma::ValTypeB>(); + static int constexpr kAlignmentC = cutlass::detail::get_alignment_count_from_gmem_tiled_copy< + typename CollectiveEpilogue::GmemTiledCopyC, ElementC>(); + static int constexpr kAlignmentD = cutlass::detail::get_alignment_count_from_gmem_tiled_copy< + typename CollectiveEpilogue::GmemTiledCopyD, ElementD>(); + + using EpilogueOutputOp = typename CollectiveEpilogue::ThreadEpilogueOp; + + // Split-K preserves splits that are 128b aligned + static int constexpr kSplitKAlignment = cute::max( + 128 / sizeof_bits::value, 128 / sizeof_bits::value); + + /// Argument structure: User API + using Arguments = typename GemmKernel::Arguments; + + struct Params{ + typename 
GemmKernel::Params gemm_params; + typename SoftmaxFinalizeKernel::Params softmax_params; + }; + +private: + + /// Kernel API parameters object + Params params_; + +public: + + /// Access the Params structure + Params const& params() const { + return params_; + } + + /// Determines whether the GEMM can execute the given problem. + static Status + can_implement(Arguments const& args) { + if (GemmKernel::can_implement(args)) { + return Status::kSuccess; + } + else { + return Status::kInvalid; + } + } + + /// Gets the workspace size + static size_t + get_workspace_size(Arguments const& args) { + size_t workspace_bytes = 0; + if (args.mode == GemmUniversalMode::kGemmSplitKParallel) { + workspace_bytes += sizeof(int) * size_t(cute::size<0>(TileShape{})) * size_t(cute::size<1>(TileShape{})); + } + + CUTLASS_TRACE_HOST(" workspace_bytes: " << workspace_bytes); + + workspace_bytes += GemmKernel::get_workspace_size(args); + return workspace_bytes; + } + + /// Computes the grid shape + static dim3 + get_grid_shape(Arguments const& args, void* workspace = nullptr) { + auto tmp_params = GemmKernel::to_underlying_arguments(args, workspace); + return GemmKernel::get_grid_shape(tmp_params); + } + + /// Computes the grid shape + static dim3 + get_grid_shape(Params const& params) { + return GemmKernel::get_grid_shape(params.gemm_params); + } + + /// Computes the maximum number of active blocks per multiprocessor + static int maximum_active_blocks(int /* smem_capacity */ = -1) { + CUTLASS_TRACE_HOST("GemmUniversal::maximum_active_blocks()"); + int max_active_blocks = -1; + int smem_size = GemmKernel::SharedStorageSize; + + // first, account for dynamic smem capacity if needed + cudaError_t result; + if (smem_size >= (48 << 10)) { + CUTLASS_TRACE_HOST(" Setting smem size to " << smem_size); + result = cudaFuncSetAttribute( + device_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size); + if (cudaSuccess != result) { + result = cudaGetLastError(); // to clear the error bit + CUTLASS_TRACE_HOST( + " cudaFuncSetAttribute() returned error: " + << cudaGetErrorString(result)); + return -1; + } + } + + // query occupancy after setting smem size + result = cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks, + device_kernel, + GemmKernel::MaxThreadsPerBlock, + smem_size); + + if (cudaSuccess != result) { + result = cudaGetLastError(); // to clear the error bit + CUTLASS_TRACE_HOST( + " cudaOccupancyMaxActiveBlocksPerMultiprocessor() returned error: " + << cudaGetErrorString(result)); + return -1; + } + + CUTLASS_TRACE_HOST(" max_active_blocks: " << max_active_blocks); + return max_active_blocks; + } + + void initialize_softmax_params(Arguments const& args, typename SoftmaxFinalizeKernel::Arguments& softmax_args){ + softmax_args.M = get<0>(args.problem_shape); + softmax_args.dataN = get<1>(args.problem_shape); + softmax_args.partialN = cute::ceil_div(get<1>(args.problem_shape), cute::shape<1>(TileShape{})); + softmax_args.batch_count = get<3>(args.problem_shape); + softmax_args.dInput = args.epilogue.dD; + softmax_args.dPartial = args.epilogue.dPartials; + softmax_args.dOutput = args.epilogue.dD; + softmax_args.ptr_in = args.epilogue.ptr_D; + softmax_args.ptr_partial_max = args.epilogue.ptr_max; + softmax_args.ptr_partial_sum = args.epilogue.ptr_sum; + softmax_args.ptr_out = args.epilogue.ptr_D; + } + + /// Initializes GEMM state from arguments. 
+ Status + initialize( + Arguments const& args, + void* workspace = nullptr, + cudaStream_t stream = nullptr, + CudaHostAdapter* cuda_adapter = nullptr) { + + CUTLASS_TRACE_HOST("GemmUniversal::initialize() - workspace " + << workspace << ", stream: " << (stream ? "non-null" : "null")); + + // Initialize the workspace + Status status = GemmKernel::initialize_workspace(args, workspace, stream, cuda_adapter); + if (status != Status::kSuccess) { + return status; + } + // Initialize the Params structure + params_.gemm_params = GemmKernel::to_underlying_arguments(args, workspace); + initialize_softmax_params(args, params_.softmax_params.args); + + // Don't set the function attributes - require the CudaHostAdapter to set it. + if constexpr (kEnableCudaHostAdapter) { + CUTLASS_ASSERT(cuda_adapter); + return Status::kSuccess; + } + else { + // + // Account for dynamic smem capacity if needed + // + int smem_size = GemmKernel::SharedStorageSize; + + CUTLASS_ASSERT(cuda_adapter == nullptr); + +#if !defined(CUTLASS_ENABLE_SYCL) + if (smem_size >= (48 << 10)) { + CUTLASS_TRACE_HOST(" Setting smem size to " << smem_size); + cudaError_t result = cudaFuncSetAttribute( + device_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size); + if (cudaSuccess != result) { + result = cudaGetLastError(); // to clear the error bit + CUTLASS_TRACE_HOST(" cudaFuncSetAttribute() returned error: " << cudaGetErrorString(result)); + return Status::kErrorInternal; + } + } +#endif + } + return Status::kSuccess; + } + + /// Update API is preserved in 3.0, but does not guarantee a lightweight update of params. + Status + update(Arguments const& args, void* workspace = nullptr) { + CUTLASS_TRACE_HOST("GemmUniversal()::update() - workspace: " << workspace); + + size_t workspace_bytes = get_workspace_size(args); + if (workspace_bytes > 0 && nullptr == workspace) { + return Status::kErrorWorkspaceNull; + } + + params_.gemm_params = GemmKernel::to_underlying_arguments(args, workspace); + initialize_softmax_params(args, params_.softmax_params.args); + return Status::kSuccess; + } + + /// Primary run() entry point API that is static allowing users to create and manage their own params. 
+ /// Supplied params struct must be construct by calling GemmKernel::to_underling_arguments() + static Status + run(Params& params, + cudaStream_t stream = nullptr, + CudaHostAdapter *cuda_adapter = nullptr, + bool launch_with_pdl = false) { + CUTLASS_TRACE_HOST("GemmUniversal::run()"); + dim3 const block = GemmKernel::get_block_shape(); + dim3 const grid = get_grid_shape(params); + dim3 const block_finalize = syclcompat::dim3(NumThreadsPerWarp, + std::min(MaxNumThreadsPerBlock / NumThreadsPerWarp, + params.softmax_params.args.M), + 1); + dim3 const grid_finalize = syclcompat::dim3(cute::ceil_div(params.softmax_params.args.M, block_finalize.x), + params.softmax_params.args.batch_count, + 1); + + // configure smem size and carveout + int smem_size = GemmKernel::SharedStorageSize; + int smem_size_finalize = SoftmaxFinalizeKernel::SharedStorageSize; + + Status launch_result{ Status::kSuccess }; + // Use extended launch API only for mainloops that use it + if constexpr (GemmKernel::ArchTag::kMinComputeCapability >= 90) { +#if !defined(CUTLASS_ENABLE_SYCL) + constexpr bool is_static_1x1x1 = cute::is_static_v and + cute::size(typename GemmKernel::DispatchPolicy::ClusterShape{}) == 1; + dim3 cluster(cute::size<0>(typename GemmKernel::DispatchPolicy::ClusterShape{}), + cute::size<1>(typename GemmKernel::DispatchPolicy::ClusterShape{}), + cute::size<2>(typename GemmKernel::DispatchPolicy::ClusterShape{})); + dim3 cluster_finalize(1,1,1); + void* kernel_params[] = {¶ms.gemm_params}; + void* kernel_params_finalize[] = {¶ms.softmax_params}; + + if constexpr (kEnableCudaHostAdapter) { + // + // Use the cuda host adapter + // + CUTLASS_ASSERT(cuda_adapter); + if (cuda_adapter) { + + if (launch_with_pdl) { + CUTLASS_TRACE_HOST( + "GemmUniversal::run() does not support launching with PDL and a custom cuda adapter."); + return Status::kErrorInternal; + } + launch_result = cuda_adapter->launch(grid, + cluster, + block, + smem_size, + stream, + kernel_params, + 0); + launch_result = cuda_adapter->launch(grid_finalize, + cluster_finalize, + block_finalize, + smem_size_finalize, + stream, + kernel_params_finalize, + 1); + } + else { + return Status::kErrorInternal; + } + } + else { + CUTLASS_ASSERT(cuda_adapter == nullptr); + void const* kernel = (void const*) device_kernel; + void const* kernel_finalize = (void const*) device_kernel; + if constexpr (GemmKernel::ArchTag::kMinComputeCapability == 90) { + if (is_static_1x1x1 && not launch_with_pdl) { + device_kernel<<>>(params.gemm_params); + device_kernel<<>>(params.softmax_params); + } + else { + launch_result = ClusterLauncher::launch( + grid, cluster, block, smem_size, stream, kernel, kernel_params, launch_with_pdl); + launch_result = ClusterLauncher::launch( + grid_finalize, cluster_finalize, block_finalize, smem_size_finalize, stream, kernel_finalize, kernel_params, launch_with_pdl); + } + } + } +#endif + } + else { + launch_result = Status::kSuccess; + if constexpr (kEnableCudaHostAdapter) { + CUTLASS_ASSERT(cuda_adapter); + if (cuda_adapter) { + void* kernel_params[] = {¶ms.gemm_params}; + void* kernel_params_finalize[] = {¶ms.softmax_params}; + + launch_result = cuda_adapter->launch( + grid, block, smem_size, stream, kernel_params, 0 + ); + launch_result = cuda_adapter->launch( + grid_finalize, block_finalize, smem_size_finalize, stream, kernel_params_finalize, 1 + ); + + } + else { + return Status::kErrorInternal; + } + } + else { + CUTLASS_ASSERT(cuda_adapter == nullptr); +#if defined(CUTLASS_ENABLE_SYCL) + const auto sycl_block = 
syclcompat::dim3(block.x, block.y, block.z); + const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z); + + using namespace syclcompat::experimental; +#if defined (SYCL_INTEL_TARGET) + auto event = launch>(launch_policy{ + sycl_grid, sycl_block, local_mem_size{static_cast(smem_size)}, + kernel_properties{sycl_exp::sub_group_size} + }, params.gemm_params); +#else + auto event = launch>(launch_policy{ + sycl_grid, sycl_block, local_mem_size{static_cast(smem_size)}}, + params.gemm_params); +#endif + const auto sycl_block_finalize = syclcompat::dim3(block_finalize.x, block_finalize.y, block_finalize.z); + const auto sycl_grid_finalize = syclcompat::dim3(grid_finalize.x, grid_finalize.y, grid_finalize.z); + auto event2 = launch>(launch_policy{ + sycl_grid_finalize, sycl_block_finalize, local_mem_size{static_cast(smem_size_finalize)}}, + params.softmax_params); + EventManager::getInstance().addEvent(event2); +#else + device_kernel<<>>(params.gemm_params); + device_kernel<<>>(params.softmax_params); +#endif + } + } + + cudaError_t result = cudaGetLastError(); + if (cudaSuccess == result && Status::kSuccess == launch_result) { + return Status::kSuccess; + } + else { + CUTLASS_TRACE_HOST(" Kernel launch failed. Reason: " << result); + return Status::kErrorInternal; + } + } + + // + // Non-static launch overloads that first create and set the internal params struct of this kernel handle. + // + + /// Launches the kernel after first constructing Params internal state from supplied arguments. + Status + run( + Arguments const& args, + void* workspace = nullptr, + cudaStream_t stream = nullptr, + CudaHostAdapter *cuda_adapter = nullptr, + bool launch_with_pdl = false + ) { + Status status = initialize(args, workspace, stream, cuda_adapter); + + if (Status::kSuccess == status) { + status = run(params_, stream, cuda_adapter, launch_with_pdl); + } + return status; + } + + /// Launches the kernel after first constructing Params internal state from supplied arguments. + Status + operator()( + Arguments const& args, + void* workspace = nullptr, + cudaStream_t stream = nullptr, + CudaHostAdapter *cuda_adapter = nullptr, + bool launch_with_pdl = false) { + return run(args, workspace, stream, cuda_adapter, launch_with_pdl); + } + + /// Overload that allows a user to re-launch the same kernel without updating internal params struct. + Status + run( + cudaStream_t stream = nullptr, + CudaHostAdapter *cuda_adapter = nullptr, + bool launch_with_pdl = false) { + return run(params_, stream, cuda_adapter, launch_with_pdl); + } + + /// Overload that allows a user to re-launch the same kernel without updating internal params struct. + Status + operator()(cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr, bool launch_with_pdl = false) { + return run(params_, stream, cuda_adapter, launch_with_pdl); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm::device + +//////////////////////////////////////////////////////////////////////////////// diff --git a/examples/35_gemm_softmax/softmax_epilogue.hpp b/examples/35_gemm_softmax/softmax_epilogue.hpp new file mode 100644 index 0000000000..438552a7f2 --- /dev/null +++ b/examples/35_gemm_softmax/softmax_epilogue.hpp @@ -0,0 +1,300 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/epilogue/collective/detail.hpp" + +#include "cute/tensor.hpp" +#include "cute/numeric/numeric_types.hpp" +#include "cutlass/cuda_host_adapter.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Applies an element wise operation to all elements within the fragment +/// and writes them out to destination storage. 
+template < + class StrideC_, + class StrideD_, + class StridePartials_, + class BlockShapeMNK, + class ThreadEpilogueOp_, + class EpilogueSchedule_ +> +class SoftmaxEpilogue { +public: + // + // Type Aliases + // + using EpilogueSchedule = EpilogueSchedule_; + using DispatchPolicy = EpilogueSchedule_; + + // derived types of output thread level operator + using ThreadEpilogueOp = ThreadEpilogueOp_; + using ElementOutput = typename ThreadEpilogueOp::ElementOutput; + using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator; + using ElementCompute = typename ThreadEpilogueOp::ElementCompute; + using ElementScalar = ElementCompute; + using ElementC = typename ThreadEpilogueOp::ElementC; + using StrideC = StrideC_; + using ElementD = typename ThreadEpilogueOp::ElementD; + using StrideD = StrideD_; + using StridePartials = StridePartials_; + + using GmemTiledCopyC = void; + using GmemTiledCopyD = void; + + static const int kOutputAlignment = ThreadEpilogueOp::kCount; + using AlignmentType = typename cute::uint_bit::value * kOutputAlignment>::type; + + static_assert(cute::rank(StrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]"); + static_assert(cute::rank(StrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]"); + + struct SharedStorage { + cute::array_aligned(BlockShapeMNK{}) * get<1>(BlockShapeMNK{})> smem_c; + }; + + using TensorStorage = SharedStorage; + + // Host side epilogue arguments + struct Arguments { + typename ThreadEpilogueOp::Params thread{}; + ElementC const* ptr_C = nullptr; + StrideC dC{}; + ElementD* ptr_D = nullptr; + StrideD dD{}; + ElementAccumulator* ptr_max; + ElementAccumulator* ptr_sum; + StridePartials dPartials{}; + }; + + // Device side epilogue params + using Params = Arguments; + + // + // Methods + // + + template + static constexpr Params + to_underlying_arguments( + [[maybe_unused]] ProblemShape const& _, + Arguments const& args, + [[maybe_unused]] void* workspace) { + return args; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + template + static bool + can_implement( + [[maybe_unused]] ProblemShape const& problem_shape, + [[maybe_unused]] Arguments const& args) { + return true; + } + + CUTLASS_HOST_DEVICE + SoftmaxEpilogue(Params const& params_, SharedStorage const& shared_storage = SharedStorage()) + : params(params_), epilogue_op(params_.thread) { } + + CUTLASS_DEVICE + bool + is_source_needed() { + return epilogue_op.is_source_needed(); + } + + template< + class ProblemShapeMNKL, + class BlockCoordMNKL, + class FrgEngine, class FrgLayout, + class TiledMma, + class ResidueMNK + > + CUTLASS_HOST_DEVICE void + operator()( + ProblemShapeMNKL problem_shape_mnkl, + BlockShapeMNK blk_shape_MNK, + BlockCoordMNKL blk_coord_mnkl, + cute::Tensor & accumulators, + TiledMma tiled_mma, + ResidueMNK residue_mnk, + int thread_idx, + char* smem_buf) + { + using namespace cute; + using X = Underscore; + + static_assert(cute::rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4"); + static_assert(is_static::value, "ThreadBlock tile shape must be static"); + static_assert(cute::rank(BlockShapeMNK{}) == 3, "BlockShapeMNK must be rank 3"); + static_assert(cute::rank(BlockCoordMNKL{}) == 4, "BlockCoordMNKL must be rank 3"); + + // 
Separate out problem and tile shape for convenience + auto M = get<0>(problem_shape_mnkl); + auto N = get<1>(problem_shape_mnkl); + auto L = get<3>(problem_shape_mnkl); + + auto M_tile = get<0>(blk_shape_MNK); + auto N_tile = get<1>(blk_shape_MNK); + auto K_tile = get<2>(blk_shape_MNK); + + auto N_partials = cute::ceil_div(N, N_tile); + + cute::packed_tuple partial_block(M_tile, K_tile); + + auto stride_c = detail::get_epilogue_stride(params.dC); + auto stride_d = detail::get_epilogue_stride(params.dD); + + // Represent the full output tensors + Tensor mC_mnl = make_tensor(make_gmem_ptr(params.ptr_C), make_shape(M,N,L), stride_c); // (m,n,l) + Tensor mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D), make_shape(M,N,L), stride_d); // (m,n,l) + Tensor mMax_mnl = make_tensor(make_gmem_ptr(params.ptr_max), make_shape(M,N_partials,L), params.dPartials); + Tensor mSum_mnl = make_tensor(make_gmem_ptr(params.ptr_sum), make_shape(M,N_partials,L), params.dPartials); + Tensor gC_mnl = local_tile(mC_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + Tensor gD_mnl = local_tile(mD_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + Tensor gMax_mnl = local_tile(mMax_mnl, partial_block, make_coord(_,_), Step<_1, X>{}); + Tensor gSum_mnl = local_tile(mSum_mnl, partial_block, make_coord(_,_), Step<_1, X>{}); + + // Slice to get the tile this CTA is responsible for + auto [m_coord, n_coord, k_coord, l_coord] = blk_coord_mnkl; + Tensor gC = gC_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) + Tensor gD = gD_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) + Tensor gMax = gMax_mnl(_,m_coord,n_coord,l_coord); + Tensor gSum = gSum_mnl(_,m_coord,n_coord,l_coord); + + //Represent the shared tensor + Tensor sC = make_tensor(make_smem_ptr(reinterpret_cast(smem_buf)), make_layout(make_shape(M_tile, N_tile))); + + // Partition the tiles to match the accumulator partitioning + auto thr_mma = tiled_mma.get_thread_slice(thread_idx); + Tensor tCgD = thr_mma.partition_C(gD); // (VEC,THR_M,THR_N) + Tensor tCgC = thr_mma.partition_C(gC); // (VEC,THR_M,THR_N) + Tensor tCsC = thr_mma.partition_C(sC); // (VEC,THR_M,THR_N) + + static_assert(is_static::value, "Accumulator layout must be static"); + CUTE_STATIC_ASSERT_V(size(tCgC) == size(tCgD), + "Source and destination must have the same number of elements."); + CUTE_STATIC_ASSERT_V(size(tCgD) == size(accumulators), + "Accumulator count must have the same destination element count."); + + // Make an identity coordinate tensor for predicating our output MN tile + auto cD = make_identity_tensor(make_shape(unwrap(shape<0>(gD)), unwrap(shape<1>(gD)))); + Tensor tCcD = thr_mma.partition_C(cD); + + if(is_source_needed()){ + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size<0>(accumulators); ++i) { + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < size<1>(accumulators); ++j) { + CUTLASS_PRAGMA_UNROLL + for (int k = 0; k < size<2>(accumulators); ++k) { + if (elem_less(tCcD(i,j,k), make_coord(get<0>(residue_mnk), get<1>(residue_mnk)))) { + accumulators(i,j,k) = epilogue_op(accumulators(i,j,k), tCgC(i,j,k)); + tCgD(i,j,k) = accumulators(i,j,k); + tCsC(i,j,k) = accumulators(i,j,k); + } + } + } + } + } else{ + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size<0>(accumulators); ++i) { + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < size<1>(accumulators); ++j) { + CUTLASS_PRAGMA_UNROLL + for (int k = 0; k < size<2>(accumulators); ++k) { + if (elem_less(tCcD(i,j,k), make_coord(get<0>(residue_mnk), get<1>(residue_mnk)))) { + 
accumulators(i,j,k) = epilogue_op(accumulators(i,j,k)); + tCgD(i,j,k) = accumulators(i,j,k); + tCsC(i,j,k) = accumulators(i,j,k); + } + } + } + } + } + + syncthreads(); + + // assumption for reductions: size<0>(sC) == block size + assert(size<0>(sC) == BlockDimX() * BlockDimy() * BlockDimZ()); + + ElementAccumulator max = std::numeric_limits::lowest(); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size<1>(sC); ++i) { + if (elem_less(cD(thread_idx, i), make_coord(get<0>(residue_mnk), get<1>(residue_mnk)))) { + accumulators(i) = sC(thread_idx, i); + max = cutlass::fast_max(max, accumulators(i)); + } + } + gMax(thread_idx) = max; + + ElementAccumulator sum = 0; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size<1>(sC); ++i) { + if (elem_less(cD(thread_idx, i), make_coord(get<0>(residue_mnk), get<1>(residue_mnk)))) { + sum += cutlass::fast_exp(accumulators(i) - max); + } + } + gSum(thread_idx) = sum; + } + +private: + Params params; + ThreadEpilogueOp epilogue_op; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace collective +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/35_gemm_softmax/softmax_finalize.hpp b/examples/35_gemm_softmax/softmax_finalize.hpp new file mode 100644 index 0000000000..ca6e6ac93a --- /dev/null +++ b/examples/35_gemm_softmax/softmax_finalize.hpp @@ -0,0 +1,197 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Kernel performing a final calculation of softmax +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" +#include "cutlass/array.h" +#include "cutlass/functional.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/arch/memory.h" +#include "cutlass/arch/memory_sm75.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace reduction { +namespace kernel { + +template < + typename ElementInput_, + typename StrideInput_, + typename ElementPartial_, + typename StridePartial_, + typename ElementOutput_, + typename StrideOutput_ +> +class SoftmaxFinalize { +public: + + using ElementInput = ElementInput_; + using StrideInput = StrideInput_; + using ElementPartial = ElementPartial_; + using StridePartial = StridePartial_; + using ElementOutput = ElementOutput_; + using StrideOutput = StrideOutput_; + + // + // Arguments + // + + struct Arguments { + int M; // dimension M of input, output and partially reduced tensors + int dataN; // dimension N of the input and output + int partialN; // dimension N of the partially reduced tensors + int batch_count; // batch count + StrideInput dInput; // stride of the input + StridePartial dPartial; // stride of the partially reduced tensors + StrideOutput dOutput; // stride of the output + ElementInput* ptr_in; // pointer to start of input data + ElementPartial* ptr_partial_max; // pointer to start of partially reduced max data + ElementPartial* ptr_partial_sum; // pointer to start of partially reduced sum data + ElementOutput* ptr_out; // pointer to start of output data + }; + + struct SharedStorage { + cute::array_aligned s_mem; + }; + + static constexpr int SharedStorageSize = sizeof(SharedStorage); + + // + // Params struct + // + + struct Params { + Arguments args; + + // + // Methods + // + Params() { } + + Params(Arguments const &args_): args(args_) { } + }; + +public: + + CUTLASS_DEVICE + SoftmaxFinalize() { } + + CUTLASS_DEVICE + void operator()(Params const ¶ms, char* shared_storage) { + apply(params, shared_storage); + } + +private: + + CUTLASS_DEVICE + void apply(Params const ¶ms, char* shared_storage) { + using ConvertInput = cutlass::NumericConverter; + using ConvertNormOutput = cutlass::NumericConverter; + + const int idx_x = ThreadIdxX(); + const int m = idx_x + BlockDimX() * BlockIdxX(); + const int idx_y = ThreadIdxY(); + const int y_size = BlockDimY(); + const int batch_id = BlockIdxY(); + + if (m >= params.args.M) { + return; + } + + // Represent the full tensors + auto IOTensorShape = make_shape(params.args.M, params.args.dataN, params.args.batch_count); + auto PartialTensorShape = make_shape(params.args.M, params.args.partialN, params.args.batch_count); + Tensor mPartialMax = make_tensor(make_gmem_ptr(params.args.ptr_partial_max), PartialTensorShape, params.args.dPartial); + Tensor mPartialSum = make_tensor(make_gmem_ptr(params.args.ptr_partial_sum), PartialTensorShape, params.args.dPartial); + Tensor mOut = make_tensor(make_gmem_ptr(params.args.ptr_out), IOTensorShape, params.args.dOutput); + Tensor mIn = make_tensor(make_gmem_ptr(params.args.ptr_in), IOTensorShape, params.args.dInput); + + //Represent the shared tensor + Tensor sPartial = make_tensor(make_smem_ptr(reinterpret_cast(shared_storage)), + make_layout(make_shape(NumThreadsPerWarp, MaxNumThreadsPerBlock / NumThreadsPerWarp))); + + ElementPartial max_val = std::numeric_limits::lowest(); + for (int partial_n = 
idx_y; partial_n < params.args.partialN; partial_n += y_size){ + ElementPartial partial_max = mPartialMax(m, partial_n, batch_id); + max_val = cutlass::fast_max(max_val, partial_max); + } + sPartial(idx_x, idx_y) = max_val; + syncthreads(); + // tree-reduction could be better, although it does not seem to be a bottleneck + for (int idx_y2 = 0; idx_y2 < y_size; idx_y2++){ + ElementPartial partial_max = sPartial(idx_x, idx_y2); + max_val = cutlass::fast_max(max_val, partial_max); + } + + ElementPartial sum_val = 0; + for (int partial_n = idx_y; partial_n < params.args.partialN; partial_n += y_size){ + ElementPartial partial_max = mPartialMax(m, partial_n, batch_id); + ElementPartial partial_sum = mPartialSum(m, partial_n, batch_id); + sum_val += partial_sum * cutlass::fast_exp(partial_max - max_val); + } + syncthreads(); + sPartial(idx_x, idx_y) = sum_val; + syncthreads(); + sum_val = 0; + // tree-reduction could be better, although it does not seem to be a bottleneck + for(int idx_y2 = 0; idx_y2 < y_size; idx_y2++){ + ElementPartial partial_sum = sPartial(idx_x, idx_y2); + sum_val += partial_sum; + } + + ElementPartial norm = 1 / sum_val; + + for (int n = idx_y * 2; n < params.args.dataN; n += y_size * 2){ + auto inVal = mIn(m, n, batch_id); + auto inVal2 = mIn(m, n+1, batch_id); + mOut(m, n, batch_id) = cutlass::fast_exp(inVal - max_val) * norm; + mOut(m, n+1, batch_id) = cutlass::fast_exp(inVal2 - max_val) * norm; + } + if (params.args.dataN % 2 == 1){ + int n = params.args.dataN - 1; + auto inVal = mIn(m, n, batch_id); + mOut(m, n, batch_id) = cutlass::fast_exp(inVal - max_val) * norm; + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace reduction +} // namespace cutlass diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index b40a2ff59d..515a5617c4 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -161,6 +161,7 @@ if (NOT CUTLASS_ENABLE_SYCL) else() foreach(EXAMPLE 14_ampere_tf32_tensorop_gemm + 35_gemm_softmax cute sycl ) diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt index b736ce35e8..197ce58b2d 100644 --- a/examples/sycl/CMakeLists.txt +++ b/examples/sycl/CMakeLists.txt @@ -30,3 +30,7 @@ if(SYCL_INTEL_TARGET) add_subdirectory(pvc) endif() + +if (CUTLASS_ENABLE_SYCL) + add_subdirectory(device_agnostic) +endif() diff --git a/examples/sycl/device_agnostic/CMakeLists.txt b/examples/sycl/device_agnostic/CMakeLists.txt new file mode 100644 index 0000000000..4bd75ca951 --- /dev/null +++ b/examples/sycl/device_agnostic/CMakeLists.txt @@ -0,0 +1,37 @@ +# Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +cutlass_example_add_executable( + device_agnostic_gemm + device_agnostic_gemm.cpp +) + +cutlass_example_add_executable( + device_agnostic_collective_builder + device_agnostic_collective_builder.cpp +) diff --git a/examples/sycl/device_agnostic/device_agnostic_collective_builder.cpp b/examples/sycl/device_agnostic/device_agnostic_collective_builder.cpp new file mode 100644 index 0000000000..44cb129465 --- /dev/null +++ b/examples/sycl/device_agnostic/device_agnostic_collective_builder.cpp @@ -0,0 +1,372 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + + +#include "cutlass/gemm/device/gemm_universal.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" + +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/kernel_hardware_info.h" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/reference/device/gemm_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "cutlass/util/GPU_Clock.hpp" + +#include "cutlass/util/reference/device/sycl_tensor_fill.h" +#include "cutlass/tensor_view.h" +#include "cutlass/coord.h" + +using namespace cute; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + bool error; + + int m, n, k, l, iterations; + float alpha, beta; + + Options(): + help(false), + error(false), + m(128), n(128), k(128), l(1), iterations(100), + alpha(1.f), beta(0.f) + { } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("m", m, 128); + cmd.get_cmd_line_argument("n", n, 128); + cmd.get_cmd_line_argument("k", k, 128); + cmd.get_cmd_line_argument("l", l, 1); + cmd.get_cmd_line_argument("alpha", alpha, 1.f); + cmd.get_cmd_line_argument("beta", beta, 0.f); + cmd.get_cmd_line_argument("iterations", iterations, 100); + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "Device Agnostic GEMM Example\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement\n\n" + << " --m= Sets the M extent of the GEMM\n" + << " --n= Sets the N extent of the GEMM\n" + << " --k= Sets the K extent of the GEMM\n" + << " --l= Sets the L extent (batch count) of the GEMM\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n" + << " --iterations= Iterations\n\n"; + + return out; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + class Gemm +> +struct ExampleRunner { + + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideC = typename Gemm::GemmKernel::StrideC; + using StrideD = typename Gemm::GemmKernel::StrideD; + + using LayoutA = typename Gemm::LayoutA; + using LayoutB = typename Gemm::LayoutB; + using LayoutC = typename Gemm::LayoutC; + using LayoutD = typename Gemm::LayoutD; + + using ElementA = typename Gemm::ElementA; + using ElementB = typename Gemm::ElementB; + using ElementAcc = typename Gemm::ElementAccumulator; + + using CollectiveEpilogue = typename Gemm::CollectiveEpilogue; + using ElementC = typename Gemm::ElementC; + using ElementOutput = typename CollectiveEpilogue::ElementOutput; + using ElementCompute = typename CollectiveEpilogue::ElementCompute; + using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator; + + using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape; + + // + // Data members + // + + /// Initialization + StrideA stride_A; + StrideB stride_B; + StrideC stride_C; + StrideD stride_D; + uint64_t seed = 0; + + cutlass::DeviceAllocation block_A; + 
cutlass::DeviceAllocation block_B; + cutlass::DeviceAllocation block_C; + cutlass::DeviceAllocation block_D; + cutlass::DeviceAllocation block_ref_D; + + // + // Methods + // + + bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) { + auto [M, N, K, L] = problem_size; + + cutlass::TensorRef ref_A(block_A.get(), LayoutA::packed({M, K})); + cutlass::TensorRef ref_B(block_B.get(), LayoutB::packed({K, N})); + cutlass::TensorRef ref_C(block_C.get(), LayoutC::packed({M, N})); + cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N})); + + cutlass::reference::device::GemmComplex( + {M, N, K}, + alpha, + ref_A, + cutlass::ComplexTransform::kNone, + ref_B, + cutlass::ComplexTransform::kNone, + beta, + ref_C, + ref_D, + ElementAccumulator(0), + L, // batch_count + M * K, // batch_stride_A + K * N, // batch_stride_B + M * N, // batch_stride_C + M * N // batch_stride_D + ); + + syclcompat::wait(); + + using TensorView = cutlass::TensorView; + + // Check if output from CUTLASS kernel and reference kernel are equal or not + bool passed = cutlass::reference::device::BlockCompareEqual( + block_ref_D.get(), block_D.get(), block_D.size()); + + return passed; + } + + template + void initialize_block(cutlass::DeviceAllocation block_device, uint64_t seed) { + std::mt19937 rng(std::random_device{}()); + std::uniform_real_distribution<> dist(0.0f, 1.0f); + rng.seed(seed); + + auto block_host = std::vector(block_device.size()); + for (auto& element : block_host) { + element = static_cast(dist(rng)); + } + + block_device.copy_from_host(block_host.data()); + } + + /// Initialize operands to be used in the GEMM and reference GEMM + void initialize(const ProblemShapeType& problem_size) { + auto problem_shape_MNKL = cute::append<4>(problem_size, 1); + auto [M, N, K, L] = problem_shape_MNKL; + + stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); + stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); + stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); + stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); + + block_A.reset(M * K * L); + block_B.reset(K * N * L); + block_C.reset(M * N * L); + block_D.reset(M * N * L); + block_ref_D.reset(M * N * L); + + initialize_block(block_A, seed + 2023); + initialize_block(block_B, seed + 2022); + initialize_block(block_C, seed + 2021); + } + + void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) { + ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; + + initialize(problem_size); + + typename Gemm::GemmKernel::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem_size, + {block_A.get(), stride_A, block_B.get(), stride_B}, + {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, + hw_info + }; + + Gemm gemm_op; + + size_t workspace_size = Gemm::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + gemm_op.can_implement(arguments); + + gemm_op.initialize(arguments, workspace.get()); + + // Run the GEMM + gemm_op.run(); + + syclcompat::wait(); + + // Verify that the result is correct + bool passed = verify(problem_size, options.alpha, options.beta); + std::cout << "Disposition: " << (passed ? 
"Passed" : "Failed") << std::endl; + + if (passed && options.iterations > 0) { + GPU_Clock timer; + timer.start(); + for (int i = 0; i < options.iterations; ++i) { + gemm_op.run(); + } + syclcompat::wait(); + + float cute_time = timer.seconds() / options.iterations; + double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; + std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; + printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000); + } + + return; + } + +}; + +int main(int argc, const char** argv) +{ + // + // Parse options + // + + Options options; + + options.parse(argc, argv); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.error) { + std::cerr << "Aborting execution." << std::endl; + return -1; + } + + // + // Run examples + // + + // The KernelHardwareInfo struct holds the number of CUs on the GPU with a given device ID. This + // information is used by the underlying kernel. + cutlass::KernelHardwareInfo hw_info; + + // Change device_id to another value if you are running on a machine with multiple GPUs and wish + // to use a GPU other than that with device ID 0. + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); + + bool passed; + + // The code section below describes datatype for input, output matrices and computation between + // elements in input matrices. + using ElementAccumulator = float; // <- data type of accumulator + using ElementComputeEpilogue = float; // <- data type of epilogue operations + using ElementInputA = float; // <- data type of elements in input matrix A + using ElementInputB = float; // <- data type of elements in input matrix B + using ElementOutput = float; // <- data type of elements in output matrix D + + constexpr int AlignmentA = sizeof(ElementInputA); + constexpr int AlignmentB = sizeof(ElementInputB); + constexpr int AlignmentC = sizeof(ElementAccumulator); + constexpr int AlignmentD = sizeof(ElementOutput); + + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::ColumnMajor; + using LayoutC = cutlass::layout::ColumnMajor; + using LayoutD = cutlass::layout::ColumnMajor; + + // Workgroup-level tile + using TileShape = Shape<_16, _16, _8>; + + using CollectiveMainloop = cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Agnostic, cutlass::arch::OpMultiplyAdd, + ElementInputA, LayoutA, AlignmentA, + ElementInputB, LayoutB, AlignmentB, + ElementAccumulator, + TileShape, Shape<_1, _1, _1>, + cutlass::gemm::collective::StageCountAuto, + cutlass::gemm::collective::KernelScheduleAuto + >::CollectiveOp; + + using EpilogueOp = cutlass::epilogue::fusion::LinearCombination< + ElementOutput, ElementComputeEpilogue, ElementAccumulator, + ElementAccumulator>; + + using CollectiveEpilogue = cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Agnostic, cutlass::arch::OpMultiplyAdd, + TileShape, Shape<_1, _1, _1>, + cutlass::epilogue::collective::EpilogueTileAuto, ElementComputeEpilogue, + ElementAccumulator, + ElementAccumulator, LayoutC, AlignmentC, + ElementOutput, LayoutD, AlignmentD, + cutlass::epilogue::collective::EpilogueScheduleAuto, + EpilogueOp + >::CollectiveOp; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, + CollectiveMainloop, + CollectiveEpilogue + >; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + ExampleRunner runner; 
+ + runner.run(options, hw_info); + + return 0; +} diff --git a/examples/sycl/device_agnostic/device_agnostic_gemm.cpp b/examples/sycl/device_agnostic/device_agnostic_gemm.cpp new file mode 100644 index 0000000000..142a2f2966 --- /dev/null +++ b/examples/sycl/device_agnostic/device_agnostic_gemm.cpp @@ -0,0 +1,382 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/gemm/device/gemm_universal.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/collective/collective_mma.hpp" +#include "cutlass/util/GPU_Clock.hpp" + +#include +#include +#include + +#include "cutlass/util/command_line.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/reference/device/gemm_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" + +#include "cutlass/util/device_memory.h" +#include "cutlass/util/reference/device/sycl_tensor_fill.h" + +using namespace cute; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + bool error; + + int m, n, k, l, iterations; + float alpha, beta; + + Options(): + help(false), + error(false), + m(128), n(128), k(128), l(1), iterations(20), + alpha(1.f), beta(0.f) + { } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("m", m, 128); + cmd.get_cmd_line_argument("n", n, 128); + cmd.get_cmd_line_argument("k", k, 128); + cmd.get_cmd_line_argument("l", l, 1); + cmd.get_cmd_line_argument("alpha", alpha, 1.f); + cmd.get_cmd_line_argument("beta", beta, 0.f); + cmd.get_cmd_line_argument("iterations", iterations, 100); + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "Device Agnostic GEMM Example\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement\n\n" + << " --m= Sets the M extent of the GEMM\n" + << " --n= Sets the N extent of the GEMM\n" + << " --k= Sets the K extent of the GEMM\n" + << " --l= Sets the L extent (batch count) of the GEMM\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n" + << " --iterations= Iterations\n\n"; + + return out; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct ExampleRunner { + + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideC = typename Gemm::GemmKernel::StrideC; + using StrideD = typename Gemm::GemmKernel::StrideD; + + using LayoutA = typename Gemm::LayoutA; + using LayoutB = typename Gemm::LayoutB; + using LayoutC = typename Gemm::LayoutC; + using LayoutD = typename Gemm::LayoutD; + + using ElementA = typename Gemm::ElementA; + using ElementB = typename Gemm::ElementB; + using ElementAcc = typename Gemm::ElementAccumulator; + + using CollectiveEpilogue = typename Gemm::CollectiveEpilogue; + using ElementC = typename Gemm::ElementC; + using ElementOutput = typename CollectiveEpilogue::ElementOutput; + using ElementCompute = typename CollectiveEpilogue::ElementCompute; + using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator; + + using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape; + + // + // Data members + // + + /// Initialization + StrideA stride_A; + StrideB stride_B; + StrideC stride_C; + StrideD stride_D; + uint64_t seed = 0; + + cutlass::DeviceAllocation block_A; + cutlass::DeviceAllocation block_B; + cutlass::DeviceAllocation block_C; + 
cutlass::DeviceAllocation block_D; + cutlass::DeviceAllocation block_ref_D; + + // + // Methods + // + + bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) { + auto [M, N, K, L] = problem_size; + + cutlass::TensorRef ref_A(block_A.get(), LayoutA::packed({M, K})); + cutlass::TensorRef ref_B(block_B.get(), LayoutB::packed({K, N})); + cutlass::TensorRef ref_C(block_C.get(), LayoutC::packed({M, N})); + cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N})); + + cutlass::reference::device::GemmComplex( + {M, N, K}, + alpha, + ref_A, + cutlass::ComplexTransform::kNone, + ref_B, + cutlass::ComplexTransform::kNone, + beta, + ref_C, + ref_D, + ElementAccumulator(0), + L, // batch_count + M * K, // batch_stride_A + K * N, // batch_stride_B + M * N, // batch_stride_C + M * N // batch_stride_D + ); + + syclcompat::wait(); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + bool passed = cutlass::reference::device::BlockCompareEqual( + block_ref_D.get(), block_D.get(), block_D.size()); + + return passed; + } + + template + void initialize_block(cutlass::DeviceAllocation block_device, uint64_t seed) { + std::mt19937 rng(std::random_device{}()); + std::uniform_real_distribution<> dist(0.0f, 1.0f); + rng.seed(seed); + + auto block_host = std::vector(block_device.size()); + for (auto& element : block_host) { + element = static_cast(dist(rng)); + } + + block_device.copy_from_host(block_host.data()); + } + + /// Initialize operands to be used in the GEMM and reference GEMM + void initialize(const ProblemShapeType& problem_size) { + auto problem_shape_MNKL = cute::append<4>(problem_size, 1); + auto [M, N, K, L] = problem_shape_MNKL; + + stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); + stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); + stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); + stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); + + block_A.reset(M * K * L); + block_B.reset(K * N * L); + block_C.reset(M * N * L); + block_D.reset(M * N * L); + block_ref_D.reset(M * N * L); + + initialize_block(block_A, seed + 2023); + initialize_block(block_B, seed + 2022); + initialize_block(block_C, seed + 2021); + } + + void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) { + ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; + + initialize(problem_size); + + typename Gemm::GemmKernel::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem_size, + {block_A.get(), stride_A, block_B.get(), stride_B}, + {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, + hw_info + }; + + Gemm gemm_op; + + size_t workspace_size = Gemm::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + gemm_op.can_implement(arguments); + + gemm_op.initialize(arguments, workspace.get()); + + // Run the GEMM + gemm_op.run(); + + syclcompat::wait(); + + // Verify that the result is correct + bool passed = verify(problem_size, options.alpha, options.beta); + std::cout << "Disposition: " << (passed ? 
"Passed" : "Failed") << std::endl; + + if (passed && options.iterations > 0) { + GPU_Clock timer; + timer.start(); + for (int i = 0; i < options.iterations; ++i) { + gemm_op.run(); + } + syclcompat::wait(); + + float cute_time = timer.seconds() / options.iterations; + double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; + std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; + printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000); + } + + return; + } + +}; + +int main(int argc, const char** argv) +{ + // + // Parse options + // + + Options options; + + options.parse(argc, argv); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.error) { + std::cerr << "Aborting execution." << std::endl; + return -1; + } + + // + // Run examples + // + + // The KernelHardwareInfo struct holds the number of CUs on the GPU with a given device ID. This + // information is used by the underlying kernel. + cutlass::KernelHardwareInfo hw_info; + + // Change device_id to another value if you are running on a machine with multiple GPUs and wish + // to use a GPU other than that with device ID 0. + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); + + bool passed; + + // The code section below describes datatype for input, output matrices and computation between + // elements in input matrices. + using ElementAccumulator = float; // <- data type of accumulator + using ElementComputeEpilogue = float; // <- data type of epilogue operations + using ElementInputA = float; // <- data type of elements in input matrix A + using ElementInputB = float; // <- data type of elements in input matrix B + using ElementOutput = float; // <- data type of elements in output matrix D + + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::RowMajor; + using LayoutC = cutlass::layout::RowMajor; + using LayoutD = cutlass::layout::RowMajor; + + using TileShape = Shape<_4, _4, _8>; + + using TiledMma = TiledMMA>, + Layout>>; + + using GmemTiledCopyA = decltype( + make_tiled_copy(Copy_Atom, ElementInputA>{}, + Layout, Stride<_4, _1>>{}, + Layout>{} + )); + + using GmemTiledCopyB = decltype( + make_tiled_copy(Copy_Atom, ElementInputB>{}, + Layout, Stride <_1, _4>>{}, + Layout>{} + )); + + using SmemLayoutAtomA = Layout, Stride<_1, _4>>; + using SmemLayoutAtomB = Layout, Stride<_1, _4>>; + + using GEMMDispatchPolicy = cutlass::gemm::MainloopDeviceAgnostic; + using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + ElementAccumulator, + 1, + ElementComputeEpilogue, + ElementOutput>; + + using CollectiveEpilogue = cutlass::epilogue::collective::DefaultEpilogue< + cutlass::detail::TagToStrideC_t, + cutlass::detail::TagToStrideC_t, + EpilogueOp, + cutlass::gemm::EpilogueDefault>; + + using SmemCopyAtomA = Copy_Atom, ElementInputA>; + using SmemCopyAtomB = Copy_Atom, ElementInputB>; + + using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< + GEMMDispatchPolicy, + TileShape, + ElementInputA, + cutlass::gemm::TagToStrideA_t, + ElementInputB, + cutlass::gemm::TagToStrideB_t, + TiledMma, + GmemTiledCopyA, SmemLayoutAtomA, SmemCopyAtomA, cute::identity, // A + GmemTiledCopyB, SmemLayoutAtomB, SmemCopyAtomB, cute::identity // B + >; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, + CollectiveMainloop, + CollectiveEpilogue>; + + using Gemm = 
cutlass::gemm::device::GemmUniversalAdapter; + + ExampleRunner runner; + + runner.run(options, hw_info); + + return 0; +} diff --git a/examples/sycl/pvc/CMakeLists.txt b/examples/sycl/pvc/CMakeLists.txt index f9c5fca18c..5736847e88 100644 --- a/examples/sycl/pvc/CMakeLists.txt +++ b/examples/sycl/pvc/CMakeLists.txt @@ -37,6 +37,11 @@ cutlass_example_add_executable( pvc_gemm_with_epilogue_relu.cpp ) +cutlass_example_add_executable( + pvc_gemm_with_epilogue_gelu + pvc_gemm_with_epilogue_gelu.cpp +) + cutlass_example_add_executable( pvc_collective_builder pvc_collective_builder.cpp diff --git a/examples/sycl/pvc/common.h b/examples/sycl/pvc/common.hpp similarity index 100% rename from examples/sycl/pvc/common.h rename to examples/sycl/pvc/common.hpp diff --git a/examples/sycl/pvc/pvc_collective_builder.cpp b/examples/sycl/pvc/pvc_collective_builder.cpp index 420fa4a8bc..6fbe7884a0 100644 --- a/examples/sycl/pvc/pvc_collective_builder.cpp +++ b/examples/sycl/pvc/pvc_collective_builder.cpp @@ -48,7 +48,7 @@ #include "cutlass/tensor_view.h" #include "cutlass/coord.h" -#include "common.h" +#include "common.hpp" using namespace cute; diff --git a/examples/sycl/pvc/pvc_gemm.cpp b/examples/sycl/pvc/pvc_gemm.cpp index c96fdff2a4..ee20a51eec 100644 --- a/examples/sycl/pvc/pvc_gemm.cpp +++ b/examples/sycl/pvc/pvc_gemm.cpp @@ -45,7 +45,7 @@ #include "cutlass/util/packed_stride.hpp" #include "cutlass/util/reference/device/gemm_complex.h" #include "cutlass/util/reference/device/tensor_compare.h" -#include "common.h" +#include "common.hpp" using namespace cute; diff --git a/examples/sycl/pvc/pvc_gemm_with_epilogue_gelu.cpp b/examples/sycl/pvc/pvc_gemm_with_epilogue_gelu.cpp new file mode 100644 index 0000000000..9a0b814309 --- /dev/null +++ b/examples/sycl/pvc/pvc_gemm_with_epilogue_gelu.cpp @@ -0,0 +1,377 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/epilogue/collective/xe_epilogue.hpp" +#include "cutlass/epilogue/fusion/xe_callbacks.hpp" +#include "cutlass/gemm/device/gemm_universal.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/collective/collective_mma.hpp" +#include "cutlass/util/GPU_Clock.hpp" + +#include +#include + +#include "cutlass/util/command_line.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/reference/device/gemm_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "cutlass/util/reference/device/tensor_gelu.h" +#include "cutlass/tensor_view.h" +#include "cutlass/coord.h" + +#include "common.h" + +using namespace cute; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + bool error; + + int m, n, k, l, iterations; + float alpha, beta; + + Options(): + help(false), + error(false), + m(5120), n(4096), k(4096), l(1), iterations(100), + alpha(1.f), beta(0.f) + { } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("m", m, 5120); + cmd.get_cmd_line_argument("n", n, 4096); + cmd.get_cmd_line_argument("k", k, 4096); + cmd.get_cmd_line_argument("l", l, 1); + cmd.get_cmd_line_argument("alpha", alpha, 1.f); + cmd.get_cmd_line_argument("beta", beta, 0.f); + cmd.get_cmd_line_argument("iterations", iterations, 100); + } + + /// Prints the usage statement. 
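+  /// (If a flag is omitted, the defaults set in the constructor and in parse() apply:
+  /// m=5120, n=4096, k=4096, l=1, alpha=1, beta=0, iterations=100.)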
+ std::ostream & print_usage(std::ostream &out) const { + + out << "PVC GEMM Example\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement\n\n" + << " --m= Sets the M extent of the GEMM\n" + << " --n= Sets the N extent of the GEMM\n" + << " --k= Sets the K extent of the GEMM\n" + << " --l= Sets the L extent (batch count) of the GEMM\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n" + << " --iterations= Iterations\n\n"; + + return out; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + class Gemm +> +struct ExampleRunner { + + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideC = typename Gemm::GemmKernel::StrideC; + using StrideD = typename Gemm::GemmKernel::StrideD; + + using LayoutA = typename Gemm::LayoutA; + using LayoutB = typename Gemm::LayoutB; + using LayoutC = typename Gemm::LayoutC; + using LayoutD = typename Gemm::LayoutD; + + using ElementA = typename Gemm::ElementA; + using ElementB = typename Gemm::ElementB; + using ElementAcc = typename Gemm::ElementAccumulator; + + using CollectiveEpilogue = typename Gemm::CollectiveEpilogue; + using ElementC = typename Gemm::ElementC; + using ElementOutput = typename CollectiveEpilogue::ElementOutput; + using ElementCompute = typename CollectiveEpilogue::ElementCompute; + using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator; + + using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape; + + // + // Data members + // + + /// Initialization + StrideA stride_A; + StrideB stride_B; + StrideC stride_C; + StrideD stride_D; + uint64_t seed = 0; + + cutlass::DeviceAllocation block_A; + cutlass::DeviceAllocation block_B; + cutlass::DeviceAllocation block_C; + cutlass::DeviceAllocation block_D; + cutlass::DeviceAllocation block_ref_D; + + // + // Methods + // + + bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) { + auto [M, N, K, L] = problem_size; + + cutlass::TensorRef ref_A(block_A.get(), LayoutA::packed({M, K})); + cutlass::TensorRef ref_B(block_B.get(), LayoutB::packed({K, N})); + cutlass::TensorRef ref_C(block_C.get(), LayoutC::packed({M, N})); + cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N})); + + cutlass::reference::device::GemmComplex( + {M, N, K}, + alpha, + ref_A, + cutlass::ComplexTransform::kNone, + ref_B, + cutlass::ComplexTransform::kNone, + beta, + ref_C, + ref_D, + ElementAccumulator(0), + L, // batch_count + M * K, // batch_stride_A + K * N, // batch_stride_B + M * N, // batch_stride_C + M * N // batch_stride_D + ); + + syclcompat::wait(); + + using TensorView = cutlass::TensorView; + cutlass::reference::device::TensorGeLu(TensorView(block_ref_D.get(), LayoutD::packed({M, N}), + cutlass::make_Coord(M, N))); + + syclcompat::wait(); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + bool passed = cutlass::reference::device::BlockCompareEqual( + block_ref_D.get(), block_D.get(), block_D.size()); + + return passed; + } + + /// Initialize operands to be used in the GEMM and reference GEMM + void initialize(const ProblemShapeType& problem_size) { + auto problem_shape_MNKL = cute::append<4>(problem_size, 1); + auto [M, N, K, L] = problem_shape_MNKL; + + stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); + stride_B = cutlass::make_cute_packed_stride(StrideB{}, 
cute::make_shape(N, K, L)); + stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); + stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); + + block_A.reset(M * K * L); + block_B.reset(K * N * L); + block_C.reset(M * N * L); + block_D.reset(M * N * L); + block_ref_D.reset(M * N * L); + + initialize_block(block_A, seed + 2023); + initialize_block(block_B, seed + 2022); + initialize_block(block_C, seed + 2021); + } + + void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) { + ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; + + initialize(problem_size); + + typename Gemm::GemmKernel::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem_size, + {block_A.get(), stride_A, block_B.get(), stride_B}, + {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, + hw_info + }; + + Gemm gemm_op; + + size_t workspace_size = Gemm::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + gemm_op.can_implement(arguments); + + gemm_op.initialize(arguments, workspace.get()); + + // Run the GEMM + gemm_op.run(); + + syclcompat::wait(); + + // Verify that the result is correct + bool passed = verify(problem_size, options.alpha, options.beta); + std::cout << "Disposition: " << (passed ? "Passed" : "Failed") << std::endl; + + if (passed && options.iterations > 0) { + GPU_Clock timer; + timer.start(); + for (int i = 0; i < options.iterations; ++i) { + gemm_op.run(); + } + syclcompat::wait(); + + float cute_time = timer.seconds() / options.iterations; + double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; + std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; + printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000); + } + + return; + } + +}; + +int main(int argc, const char** argv) +{ + // + // Parse options + // + + Options options; + + options.parse(argc, argv); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.error) { + std::cerr << "Aborting execution." << std::endl; + return -1; + } + + // + // Run examples + // + + // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This + // information is used by the underlying kernel. + cutlass::KernelHardwareInfo hw_info; + + // Change device_id to another value if you are running on a machine with multiple GPUs and wish + // to use a GPU other than that with device ID 0. + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); + + bool passed; + + // The code section below describes datatype for input, output matrices and computation between + // elements in input matrices. 
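+  // This example runs bf16 x bf16 operands through the PVC mainloop with fp32 accumulation;
+  // the fused epilogue applies GELU to the linear-combination result, mirroring the
+  // TensorGeLu pass applied to the reference output in verify().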
+ using ElementAccumulator = float; // <- data type of accumulator + using ElementComputeEpilogue = float; // <- data type of epilogue operations + using ElementInputA = bfloat16_t; // <- data type of elements in input matrix A + using ElementInputB = bfloat16_t; // <- data type of elements in input matrix B + using ElementOutput = float; // <- data type of elements in output matrix D + + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::RowMajor; + using LayoutC = cutlass::layout::RowMajor; + using LayoutD = cutlass::layout::RowMajor; + + using GmemTiledCopyA = XE_2D_U16x8x16_LD_N; + using GmemTiledCopyB = XE_2D_U16x16x16_LD_V; + + // Workgroup-level tile + using TileShape = Shape<_256, _128, _16>; + + using TiledMma = TiledMMA, + Layout>, + Tile<_64,_32,_16>>; // Subgroup level-tile + + constexpr int PipelineStages = 3; + using GEMMDispatchPolicy = cutlass::gemm::MainloopIntelPVC; + using EpilogueDispatchPolicy = cutlass::epilogue::IntelPVCEpilogue; + + using EpilogueOp = cutlass::epilogue::fusion::LinCombEltAct; + + using FusionCallBacks = cutlass::epilogue::fusion::FusionCallbacks; + using CollectiveEpilogue = cutlass::epilogue::collective::CollectiveEpilogue< + EpilogueDispatchPolicy, + TileShape, + ElementAccumulator, + cutlass::gemm::TagToStrideC_t, + ElementOutput, + cutlass::gemm::TagToStrideC_t, + FusionCallBacks, + XE_2D_U32x8x16_LD_N, + void, void, + XE_2D_U32x8x16_ST_N, + void, void>; + +// Mainloop + using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< + GEMMDispatchPolicy, + TileShape, + ElementInputA, + cutlass::gemm::TagToStrideA_t, + ElementInputB, + cutlass::gemm::TagToStrideB_t, + TiledMma, + GmemTiledCopyA, void, void, cute::identity, // A + GmemTiledCopyB, void, void, cute::identity // B + >; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, + CollectiveMainloop, + CollectiveEpilogue + >; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + ExampleRunner runner; + + runner.run(options, hw_info); + + return 0; +} diff --git a/examples/sycl/pvc/pvc_gemm_with_epilogue_relu.cpp b/examples/sycl/pvc/pvc_gemm_with_epilogue_relu.cpp index 47d538141d..459d9dccbc 100644 --- a/examples/sycl/pvc/pvc_gemm_with_epilogue_relu.cpp +++ b/examples/sycl/pvc/pvc_gemm_with_epilogue_relu.cpp @@ -49,7 +49,7 @@ #include "cutlass/tensor_view.h" #include "cutlass/coord.h" -#include "common.h" +#include "common.hpp" using namespace cute; diff --git a/include/cutlass/arch/arch.h b/include/cutlass/arch/arch.h index b87d899a32..cceef6cda1 100644 --- a/include/cutlass/arch/arch.h +++ b/include/cutlass/arch/arch.h @@ -102,6 +102,10 @@ struct IntelPVC { static int const kMinComputeCapability = 0; }; +struct Agnostic { + static int const kMinComputeCapability = 1; +}; + #endif /// Triggers a breakpoint on the device diff --git a/include/cutlass/epilogue/collective/builders/device_agnostic_builder.inl b/include/cutlass/epilogue/collective/builders/device_agnostic_builder.inl new file mode 100644 index 0000000000..da33c33186 --- /dev/null +++ b/include/cutlass/epilogue/collective/builders/device_agnostic_builder.inl @@ -0,0 +1,91 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include
+#include
+
+#include "cutlass/epilogue/collective/default_epilogue.hpp"
+
+
+namespace cutlass::epilogue::collective {
+
+template<
+  class TileShape_MNK,
+  class ElementAccumulator_,
+  class ElementCompute_,
+  class ElementC_,
+  class GmemLayoutTagC_,
+  int AlignmentC_,
+  class ElementD_,
+  class GmemLayoutTagD_,
+  int AlignmentD_,
+  class FusionOpOrCallbacks
+>
+struct CollectiveBuilder<
+    arch::Agnostic,
+    arch::OpMultiplyAdd,
+    TileShape_MNK,
+    Shape<_1, _1, _1>,
+    EpilogueTileAuto,
+    ElementAccumulator_,
+    ElementCompute_,
+    ElementC_,
+    GmemLayoutTagC_,
+    AlignmentC_,
+    ElementD_,
+    GmemLayoutTagD_,
+    AlignmentD_,
+    EpilogueScheduleAuto,
+    FusionOpOrCallbacks,
+    cute::enable_if_t<
+      (cute::is_same_v>)
+    >
+> {
+  using ElementD = ElementD_;
+  using ElementOutput = ElementD_;
+  using ElementCompute = ElementCompute_;
+  using ElementAccumulator = ElementAccumulator_;
+
+  static constexpr int FragmentSize = 1;
+  using ThreadOp = thread::LinearCombination<
+    ElementD, FragmentSize, ElementAccumulator, ElementCompute>;
+
+  using CollectiveOp = cutlass::epilogue::collective::DefaultEpilogue<
+    cutlass::detail::TagToStrideC_t,
+    cutlass::detail::TagToStrideC_t,
+    ThreadOp,
+    cutlass::gemm::EpilogueDefault>;
+};
+
+} // namespace cutlass::epilogue::collective
diff --git a/include/cutlass/epilogue/collective/collective_builder.hpp b/include/cutlass/epilogue/collective/collective_builder.hpp
index 8ee169024a..c920295469 100644
--- a/include/cutlass/epilogue/collective/collective_builder.hpp
+++ b/include/cutlass/epilogue/collective/collective_builder.hpp
@@ -121,4 +121,8 @@ struct CallbacksBuilder<
 #if defined(SYCL_INTEL_TARGET)
 #include "builders/xe_builder.inl"
 #endif
+
+#if defined(CUTLASS_ENABLE_SYCL)
+#include "builders/device_agnostic_builder.inl"
+#endif
 /////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/fast_math.h b/include/cutlass/fast_math.h
index fa3873c5e7..c856afa76e 100644
--- a/include/cutlass/fast_math.h
+++ b/include/cutlass/fast_math.h
@@ -859,6 +859,8 @@ CUTLASS_HOST_DEVICE
 float fast_exp(float x) {
   #if defined(__CUDA_ARCH__)
   return ::expf(x);
+  #elif defined(__SYCL_CUDA_ARCH__)
+  return ::sycl::native::exp(x);
   #else
   return std::exp(x);
   #endif
diff --git a/include/cutlass/gemm/collective/builders/device_agnostic_mma_builder.inl b/include/cutlass/gemm/collective/builders/device_agnostic_mma_builder.inl
new file mode 100644
index 0000000000..013f994973
--- /dev/null
+++ b/include/cutlass/gemm/collective/builders/device_agnostic_mma_builder.inl
@@ -0,0 +1,128 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+
+#pragma once
+
+#include
+#include
+
+#include "cutlass/gemm/collective/device_agnostic_mma.hpp"
+
+
+namespace cutlass::gemm::collective {
+
+template <
+  class ElementA,
+  class GmemLayoutATag,
+  int AlignmentA,
+  class ElementB,
+  class GmemLayoutBTag,
+  int AlignmentB,
+  class ElementAccumulator,
+  class TileShape_MNK,
+  class KernelScheduleType
+  >
+struct CollectiveBuilder<
+    arch::Agnostic,
+    arch::OpMultiplyAdd,
+    ElementA,
+    GmemLayoutATag,
+    AlignmentA,
+    ElementB,
+    GmemLayoutBTag,
+    AlignmentB,
+    ElementAccumulator,
+    TileShape_MNK,
+    Shape<_1, _1, _1>,    // Cluster Shape
+    cutlass::gemm::collective::StageCountAuto,
+    KernelScheduleType,
+    cute::enable_if_t<
+      cute::is_same_v>
+>{
+#ifndef CUTLASS_ENABLE_SYCL
+  static_assert(cutlass::detail::dependent_false,
+      "Trying to use device Agnostic pipeline without SYCL enabled");
+#endif
+
+  using TiledMMA = TiledMMA>,
+                            Layout>>;
+
+  using DispatchPolicy = MainloopDeviceAgnostic;
+  using GmemTiledCopyA = decltype(
+    make_tiled_copy(Copy_Atom, ElementA>{},
+                    Layout, Stride<_4, _1>>{},
+                    Layout>{}
+    ));
+
+  using GmemTiledCopyB = decltype(
+    make_tiled_copy(Copy_Atom, ElementB>{},
+                    Layout, Stride<_1, _4>>{},
+                    Layout>{}
+    ));
+
+  using SmemCopyAtomA = Copy_Atom, ElementA>;
+  using SmemCopyAtomB = Copy_Atom, ElementB>;
+
+  //
+  using SmemLayoutAtomA = decltype(
+    make_layout(make_shape(get<0>(TileShape_MNK{}), get<2>(TileShape_MNK{})),
+                make_stride(_1{}, get<0>(TileShape_MNK{})))
+  );
+
+  using SmemLayoutAtomB = decltype(
+    make_layout(make_shape(get<1>(TileShape_MNK{}), get<2>(TileShape_MNK{})),
+                make_stride(_1{}, get<1>(TileShape_MNK{})))
+  );
+
+  using TransformA = cute::identity;
+  using TransformB = cute::identity;
+
+  using CollectiveOp = cutlass::gemm::collective::CollectiveMma<
+    MainloopDeviceAgnostic,
+    TileShape_MNK,
+    ElementA,
+    cutlass::gemm::TagToStrideA_t,
+    ElementB,
+    cutlass::gemm::TagToStrideB_t,
+    TiledMMA,
+    GmemTiledCopyA,
+    SmemLayoutAtomA,
+    SmemCopyAtomA,
+    TransformA,
+    GmemTiledCopyB,
+    SmemLayoutAtomB,
+    SmemCopyAtomB,
+    TransformB
+  >;
+};
+
+} // namespace cutlass::gemm::collective
diff --git a/include/cutlass/gemm/collective/collective_builder.hpp b/include/cutlass/gemm/collective/collective_builder.hpp
index fa31aebaa1..f7fab76e1e 100644
--- a/include/cutlass/gemm/collective/collective_builder.hpp
+++ b/include/cutlass/gemm/collective/collective_builder.hpp
@@ -43,4 +43,8 @@
 #if defined(SYCL_INTEL_TARGET)
 #include "cutlass/gemm/collective/builders/xe_mma_builder.inl"
 #endif
+
+#if defined(CUTLASS_ENABLE_SYCL)
+#include "cutlass/gemm/collective/builders/device_agnostic_mma_builder.inl"
+#endif
 /////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/gemm/collective/collective_mma.hpp b/include/cutlass/gemm/collective/collective_mma.hpp
index 9a1fc2c3d4..38f51d2c8f 100644
--- a/include/cutlass/gemm/collective/collective_mma.hpp
+++ b/include/cutlass/gemm/collective/collective_mma.hpp
@@ -50,4 +50,9 @@
 #if defined(SYCL_INTEL_TARGET)
 #include "cutlass/gemm/collective/xe_mma.hpp"
 #endif
+
+#if defined(CUTLASS_ENABLE_SYCL)
+#include "cutlass/gemm/collective/device_agnostic_mma.hpp"
+#endif
+
 /////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/gemm/collective/device_agnostic_mma.hpp b/include/cutlass/gemm/collective/device_agnostic_mma.hpp
new file mode 100644
index 0000000000..90157b24cf
--- /dev/null
+++ b/include/cutlass/gemm/collective/device_agnostic_mma.hpp
@@ -0,0 +1,195 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include "cutlass/gemm/collective/sm70_mma_twostage.hpp"
+
+namespace cutlass::gemm::collective {
+  using namespace cute;
+
+template <
+  class TileShape_,
+  class ElementA_,
+  class StrideA_,
+  class ElementB_,
+  class StrideB_,
+  class TiledMma_,
+  class GmemTiledCopyA_,
+  class SmemLayoutAtomA_,
+  class SmemCopyAtomA_,
+  class TransformA_,
+  class GmemTiledCopyB_,
+  class SmemLayoutAtomB_,
+  class SmemCopyAtomB_,
+  class TransformB_>
+struct CollectiveMma <
+    MainloopDeviceAgnostic,
+    TileShape_,
+    ElementA_,
+    StrideA_,
+    ElementB_,
+    StrideB_,
+    TiledMma_,
+    GmemTiledCopyA_,
+    SmemLayoutAtomA_,
+    SmemCopyAtomA_,
+    TransformA_,
+    GmemTiledCopyB_,
+    SmemLayoutAtomB_,
+    SmemCopyAtomB_,
+    TransformB_
+> :
+  CollectiveMma<
+    MainloopSm70TwoStage,
+    TileShape_,
+    ElementA_,
+    StrideA_,
+    ElementB_,
+    StrideB_,
+    TiledMma_,
+    GmemTiledCopyA_,
+    SmemLayoutAtomA_,
+    SmemCopyAtomA_,
+    TransformA_,
+    GmemTiledCopyB_,
+    SmemLayoutAtomB_,
+    SmemCopyAtomB_,
+    TransformB_
+  >
+
+  {
+  using DispatchPolicy = MainloopDeviceAgnostic;
+  using TileShape = TileShape_;
+  using ElementA = ElementA_;
+  using StrideA = StrideA_;
+  using ElementB = ElementB_;
+  using StrideB = StrideB_;
+  using TiledMma = TiledMma_;
+  using ElementAccumulator = typename TiledMma::ValTypeC;
+  using GmemTiledCopyA = GmemTiledCopyA_;
+  using GmemTiledCopyB = GmemTiledCopyB_;
+  using SmemLayoutAtomA = SmemLayoutAtomA_;
+  using SmemLayoutAtomB = SmemLayoutAtomB_;
+  using SmemCopyAtomA = SmemCopyAtomA_;
+  using SmemCopyAtomB = SmemCopyAtomB_;
+  using TransformA = TransformA_;
+  using TransformB = TransformB_;
+  using ArchTag = typename DispatchPolicy::ArchTag;
+
+  static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  using SmemLayoutA = decltype(tile_to_shape(
+      SmemLayoutAtomA{},
+      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}))));
+  using SmemLayoutB = decltype(tile_to_shape(
+      SmemLayoutAtomB{},
+      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}))));
+
+  struct SharedStorage
+  {
+    cute::array_aligned> smem_a;
+    cute::array_aligned> smem_b;
+  };
+
+
+  struct Arguments {
+    ElementA const* ptr_A;
+    StrideA dA;
+    ElementB const* ptr_B;
+    StrideB dB;
+  };
+
+  using Params = Arguments;
+
+  template
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& _, Arguments const& args, void* workspace) {
+    (void) workspace;
+    return args;
+  }
+
+  template <
+    class FrgTensorD,
+    class TensorA,
+    class TensorB,
+    class FrgTensorC,
+    class KTileIterator,
+    class ResidueMNK
+  >
+  CUTLASS_DEVICE void
+  operator() (
+      FrgTensorD &accum,
+      TensorA gA,
+      TensorB gB,
+      FrgTensorC const &src_accum,
+      KTileIterator k_tile_iter, int k_tile_count,
+      ResidueMNK residue_mnk,
+      int thread_idx,
+      char *smem_buf)
+  {
+    // We can reuse the 2 stage blocking gemm in SM_70 predicated pipeline, giving a somewhat performant
+    // device agnostic pipeline
+
+    CollectiveMma<
+      MainloopSm70TwoStage,
+      TileShape_,
+      ElementA_,
+      StrideA_,
+      ElementB_,
+      StrideB_,
+      TiledMma_,
+      GmemTiledCopyA_,
+      SmemLayoutAtomA_,
+      SmemCopyAtomA_,
+      TransformA_,
+      GmemTiledCopyB_,
+      SmemLayoutAtomB_,
+      SmemCopyAtomB_,
+      TransformB_
+    >::operator()(
+      accum,
+      gA,
+      gB,
+      src_accum,
+      k_tile_iter, k_tile_count,
+      residue_mnk, thread_idx,
+      smem_buf
+    );
+  }
+  };
+}
diff --git a/include/cutlass/gemm/collective/xe_mma.hpp b/include/cutlass/gemm/collective/xe_mma.hpp
index a340833989..77cddf7dfb 100644
--- a/include/cutlass/gemm/collective/xe_mma.hpp
+++ b/include/cutlass/gemm/collective/xe_mma.hpp
@@ -263,11 +263,14 @@ struct CollectiveMma<
     // Mainloop
     //
     auto [m_idx, n_idx, k_idx, l_idx] = blk_coord;
+    #ifdef CUTLASS_SYCL_SWITCH_WG
+    const int m_coord = n_idx * BLK_M + (get_sub_group_id() / ATOM_N) * SG_M;
+    const int n_coord = m_idx * BLK_N + (get_sub_group_id() % ATOM_N) * SG_N;
+    #else
     const int m_coord = m_idx * BLK_M + (get_sub_group_id() / ATOM_N) * SG_M;
     const int n_coord = n_idx * BLK_N + (get_sub_group_id() % ATOM_N) * SG_N;
+    #endif
     const int l_coord = l_idx;
-
-    int sub_group_id = get_sub_group_id();
     Tensor iter_a = mainloop.gmem_tiled_copy_a.get_pvc_tensor(
       make_coord(m_coord, 0, l_coord), append<4>(tCrA_copy_view.shape(), k_tile_count),
       append<3>(typename XE_Copy_A::Shape_MN{}, BLK_K), seq<0,1,1>{});
@@ -279,13 +282,13 @@ struct CollectiveMma<
     int prefetch_k = 0;
 
     Tensor prefetch_iter_a = mainloop.gmem_prefetch_a.get_pvc_tensor(
-      make_coord(m_coord + (sub_group_id % ATOM_N) / get<1>(PrefetchAThrShape{}) * get<0>(PrefetchATileSize{}),
-                 (k_start_idx + (sub_group_id % ATOM_N) % get<1>(PrefetchAThrShape{})) * PrefetchStrideA, l_coord),
+      make_coord(m_coord + (get_sub_group_id() % ATOM_N) / get<1>(PrefetchAThrShape{}) * get<0>(PrefetchATileSize{}),
+                 (k_start_idx + (get_sub_group_id() % ATOM_N) % get<1>(PrefetchAThrShape{})) * PrefetchStrideA, l_coord),
       append<4>(make_shape(_1{}, _1{}, _1{}), k_tile_count),
       append<3>(make_shape(SG_M, SG_K), BLK_K), seq<0, 1, 1>{});
     Tensor prefetch_iter_b = mainloop.gmem_prefetch_b.get_pvc_tensor(
-      make_coord(((sub_group_id / ATOM_N) / get<1>(PrefetchBThrShape{}) + k_start_idx) * PrefetchStrideB,
-                 n_coord + (sub_group_id / ATOM_N) % get<1>(PrefetchBThrShape{}) * get<1>(PrefetchBTileSize{}), l_coord),
+      make_coord(((get_sub_group_id() / ATOM_N) / get<1>(PrefetchBThrShape{}) + k_start_idx) * PrefetchStrideB,
+                 n_coord + (get_sub_group_id() / ATOM_N) % get<1>(PrefetchBThrShape{}) * get<1>(PrefetchBTileSize{}), l_coord),
       append<4>(make_shape(_1{}, _1{}, _1{}), k_tile_count),
       append<3>(make_shape(SG_K, SG_N), BLK_K), seq<0,1,0>{});
diff --git a/include/cutlass/gemm/device/gemm_universal_adapter.h b/include/cutlass/gemm/device/gemm_universal_adapter.h
index 8c9d37e573..093eb20df6 100644
--- a/include/cutlass/gemm/device/gemm_universal_adapter.h
+++ b/include/cutlass/gemm/device/gemm_universal_adapter.h
@@ -456,16 +456,24 @@ class GemmUniversalAdapter<
     using namespace syclcompat::experimental;
 #if defined (SYCL_INTEL_TARGET)
-    auto event = launch>(launch_policy{
-      sycl_grid, sycl_block, local_mem_size{static_cast(smem_size)},
-      kernel_properties{sycl_exp::sub_group_size}
-    }, params);
+    if constexpr (cute::is_same_v) {
+      auto event = launch>(launch_policy{
+        sycl_grid, sycl_block, local_mem_size{static_cast(smem_size)}
+      }, params);
+      EventManager::getInstance().addEvent(event);
+    } else {
+      auto event = launch>(launch_policy{
+        sycl_grid, sycl_block, local_mem_size{static_cast(smem_size)},
+        kernel_properties{sycl_exp::sub_group_size}
+      }, params);
+      EventManager::getInstance().addEvent(event);
+    }
 #else
     auto event = launch>(launch_policy{
       sycl_grid, sycl_block, local_mem_size{static_cast(smem_size)}}, params);
-#endif
     EventManager::getInstance().addEvent(event);
+#endif
 #else
 #if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
   CUTLASS_TRACE_HOST("GemmUniversal::run: Launching kernel with cutlass::kernel_launch");
diff --git a/include/cutlass/gemm/dispatch_policy.hpp b/include/cutlass/gemm/dispatch_policy.hpp
index acc0961d64..4d9cee37b2 100644
--- a/include/cutlass/gemm/dispatch_policy.hpp
+++ b/include/cutlass/gemm/dispatch_policy.hpp
@@ -332,6 +332,14 @@ struct MainloopIntelPVC {
 };
 #endif
 
+#if defined(CUTLASS_ENABLE_SYCL)
+struct MainloopDeviceAgnostic {
+  using ArchTag = arch::Agnostic;
+  using ClusterShape = Shape<_1,_1,_1>;
+  using Schedule = KernelMultistage;
+};
+#endif
+
 //////////////////////////////////////////////////////////////////////////////
 
 } // namespace cutlass::gemm
diff --git a/include/cutlass/gemm/kernel/xe_gemm.hpp b/include/cutlass/gemm/kernel/xe_gemm.hpp
index 2b41f3d31f..d7b49666fc 100644
--- a/include/cutlass/gemm/kernel/xe_gemm.hpp
+++ b/include/cutlass/gemm/kernel/xe_gemm.hpp
@@ -186,8 +186,13 @@ class GemmUniversal<
       batch_count = cute::size<3>(params.problem_shape);
     }
     return dim3(
+      #ifdef CUTLASS_SYCL_SWITCH_WG
+      cute::size(cute::ceil_div(cute::shape<0>(params.problem_shape), cute::shape<0>(WorkgroupTileShape{}))),
+      cute::size(cute::ceil_div(cute::shape<1>(params.problem_shape), cute::shape<1>(WorkgroupTileShape{}))),
+      #else
       cute::size(cute::ceil_div(cute::shape<1>(params.problem_shape), cute::shape<1>(WorkgroupTileShape{}))),
       cute::size(cute::ceil_div(cute::shape<0>(params.problem_shape), cute::shape<0>(WorkgroupTileShape{}))),
+      #endif
       batch_count
     );
   }
@@ -221,8 +226,13 @@ class GemmUniversal<
     // Get the appropriate blocks for this sub_group -- potential for sub_group locality
     int thread_idx = int(ThreadIdxX());
     auto blk_shape = TileShape{};
+    #ifdef CUTLASS_SYCL_SWITCH_WG
+    auto m_coord = BlockIdxX();
+    auto n_coord = BlockIdxY();
+    #else
     auto m_coord = BlockIdxY();
     auto n_coord = BlockIdxX();
+    #endif
     auto l_coord = BlockIdxZ();
     auto blk_coord_mnkl = make_coord(m_coord, n_coord, _, l_coord);
     int sub_group_id = thread_idx / SubgroupSize;
diff --git a/include/cutlass/gpu_generics.h b/include/cutlass/gpu_generics.h
index 7f5a69537f..118746b963 100644
--- a/include/cutlass/gpu_generics.h
+++ b/include/cutlass/gpu_generics.h
@@ -49,6 +49,7 @@ static const int NumWarpsPerWarpGroup = NumThreadsPerWarpGroup / NumThreadsPerWa
 static const int NumThreadsPerHalfWarp = NumThreadsPerWarp / 2;
 static const int NumThreadsPerQuad = 4;
 static const int NumThreadsPerQuadPair = NumThreadsPerQuad * 2;
+static constexpr int MaxNumThreadsPerBlock = 1024;
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/tools/util/include/cutlass/util/reference/device/tensor_gelu.h b/tools/util/include/cutlass/util/reference/device/tensor_gelu.h
new file mode 100644
index 0000000000..30f452a5cd
--- /dev/null
+++ b/tools/util/include/cutlass/util/reference/device/tensor_gelu.h
@@ -0,0 +1,148 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+  \brief Defines device-side elementwise operations on TensorView. Note, the operations defined
+    in this header are not specialized for any particular data layout and are therefore not
+    intended to offer the best possible performance. Rather, they are intended to be generic
+    reference implementations to support the CUTLASS unit tests.
+*/
+
+#pragma once
+
+// Cutlass includes
+#include "cutlass/cutlass.h"
+#include "cutlass/tensor_view.h"
+
+#include "cutlass/util/reference/device/tensor_foreach.h"
+#include "cutlass/util/reference/device/tensor_relu.h"
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace reference {
+namespace device {
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+template <
+  typename Element,               ///< Element type
+  typename Layout>                ///< Layout function
+struct TensorGeLuFunc {
+
+  /// View type
+  using TensorView = TensorView;
+
+  /// Coordinate in tensor's index space
+  using TensorCoord = typename TensorView::TensorCoord;
+
+  /// Parameters structure
+  struct Params {
+
+    //
+    // Data members
+    //
+
+    TensorView view;
+
+    //
+    // Methods
+    //
+
+    Params(
+      TensorView view_ = TensorView()
+    ):
+      view(view_) {
+
+    }
+  };
+
+  //
+  // Data members
+  //
+
+  Params params;
+
+  //
+  // Methods
+  //
+
+  CUTLASS_DEVICE
+  TensorGeLuFunc(Params const &params): params(params) {
+
+  }
+
+  CUTLASS_DEVICE
+  void operator()(TensorCoord const &coord) {
+
+    Element const & value = params.view.at(coord);
+
+    params.view.at(coord) = Element(cutlass::constants::half() * value *
+      (cutlass::constants::one() + (Element)erff((float)(value * cutlass::constants::half_root_two()))));
+  }
+};
+} // namespace detail
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Apply GeLu on a tensor
+template <
+  typename Element,               ///< Element type
+  typename Layout>                ///< Layout function
+void TensorGeLu(
+  TensorView view) {              ///< destination tensor
+
+  using Func = detail::TensorGeLuFunc;
+  using Params = typename Func::Params;
+
+  TensorForEach(
+    view.extent(),
+    Params(view)
+  );
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace reference
+} // namespace cutlass
+
+#if (CUTLASS_ENABLE_SYCL)
+namespace sycl {
+  template <>
+  struct is_device_copyable <
+    cutlass::reference::device::detail::TensorGeLuFunc::Params> : std::true_type {};
+}
+#endif
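For reference, the new arch::Agnostic builders above are meant to slot into the standard CUTLASS 3.x composition of mainloop, epilogue, kernel, and device adapter. The following is a minimal sketch of that composition, not part of this patch: the element types, layouts, alignments, tile shape, and the use of the Auto stage-count/schedule/epilogue-tile tags are illustrative assumptions; only the template parameter ordering is taken from the specializations introduced here.

// Hypothetical usage sketch only -- types, tile sizes, and Auto tags are assumptions.
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/device/gemm_universal_adapter.h"

using TileShape = cute::Shape<cute::_32, cute::_32, cute::_8>;   // assumed CTA tile

// Mainloop built through the new arch::Agnostic specialization.
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
    cutlass::arch::Agnostic, cutlass::arch::OpMultiplyAdd,
    float, cutlass::layout::RowMajor, 1,                          // A
    float, cutlass::layout::RowMajor, 1,                          // B
    float,                                                        // accumulator
    TileShape, cute::Shape<cute::_1, cute::_1, cute::_1>,         // cluster
    cutlass::gemm::collective::StageCountAuto,
    cutlass::gemm::collective::KernelScheduleAuto
  >::CollectiveOp;

// Epilogue built through the new arch::Agnostic specialization (default fusion callbacks).
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Agnostic, cutlass::arch::OpMultiplyAdd,
    TileShape, cute::Shape<cute::_1, cute::_1, cute::_1>,
    cutlass::epilogue::collective::EpilogueTileAuto,
    float, float,                                                 // accumulator, compute
    float, cutlass::layout::RowMajor, 1,                          // C
    float, cutlass::layout::RowMajor, 1,                          // D
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
    cute::Shape<int, int, int, int>,                              // problem shape (M, N, K, L)
    CollectiveMainloop,
    CollectiveEpilogue>;

using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

With a type composed this way, the adapter's SYCL launch path added above (the `if constexpr` branch on MainloopDeviceAgnostic) is what dispatches the kernel without the Intel-specific sub-group-size kernel property.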