diff --git a/CMakeLists.txt b/CMakeLists.txt index d442254..8faee83 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -171,12 +171,13 @@ if(VLLM_GPU_LANG STREQUAL "SYCL") set(CUTLASS_ENABLE_HEADERS_ONLY "ON" CACHE BOOL "Enable only the header library") # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building. - set(CUTLASS_REVISION "main" CACHE STRING "CUTLASS revision to use") + set(CUTLASS_REVISION "9baca2cff3a28590fcd03e55515e2d91ff2cbc8b" CACHE STRING "CUTLASS revision to use") # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided FetchContent_Declare( cutlass-sycl GIT_REPOSITORY https://github.com/intel/cutlass-sycl + # Please keep this in sync with CUTLASS_REVISION line above. GIT_TAG ${CUTLASS_REVISION} GIT_PROGRESS TRUE @@ -184,7 +185,7 @@ if(VLLM_GPU_LANG STREQUAL "SYCL") # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history. # Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags. # So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE - GIT_SHALLOW TRUE + GIT_SHALLOW FALSE ) # cutlass compilation flags @@ -196,7 +197,6 @@ if(VLLM_GPU_LANG STREQUAL "SYCL") set(CUTLASS_ENABLE_GDC_FOR_SM100_DEFAULT OFF CACHE BOOL "DISABLE CUDA") # list(APPEND CMAKE_CXX_FLAGS "-ftemplate-backtrace-limit=0 " ) # list(APPEND CMAKE_CXX_FLAGS "-fdiagnostics-color=always " ) - FetchContent_MakeAvailable(cutlass-sycl) set(CUTLASS_INCLUDE_DIR ${cutlass-sycl_SOURCE_DIR}/include CACHE PATH "CUTLASS Header Library") @@ -269,11 +269,15 @@ endif () # # xpu only ops/kernels, implemented with cutlass/onednn/sycl. # +file(GLOB CUTLASS_BACKEND_SRCS + csrc/xpu/cutlass_kernels/*.cpp +) if(VLLM_GPU_LANG STREQUAL "SYCL") set(VLLM_EXT_XPU_SRC "csrc/xpu/torch_bindings.cpp" "csrc/xpu/lora/lora_shrink.cpp" "csrc/xpu/lora/lora_expand.cpp" + ${CUTLASS_BACKEND_SRCS} ) include_directories("/usr/include") set(CMPLR_ROOT $ENV{CMPLR_ROOT}) @@ -282,6 +286,12 @@ if(VLLM_GPU_LANG STREQUAL "SYCL") list(APPEND VLLM_GPU_FLAGS "-DVLLM_BUILD_XPU_OPS" ) list(APPEND VLLM_GPU_LINK_FLAGS "-fsycl" "-fsycl-targets=spir64") list(APPEND VLLM_LINK_LIBRARIES "sycl" "OpenCL" "pthread" "m" "dl" "torch" ) + # CUTLASS FLAGS + list(APPEND VLLM_GPU_FLAGS "-O3" "-DNDEBUG") + list(APPEND VLLM_GPU_FLAGS "-gline-tables-only") + list(APPEND VLLM_GPU_FLAGS "-fsycl" "-fsycl-targets=spir64_gen" "-ftemplate-backtrace-limit=10") + list(APPEND VLLM_GPU_LINK_FLAGS "-fsycl" "-fsycl-targets=spir64_gen") + list(APPEND VLLM_GPU_LINK_FLAGS -Xsycl-target-backend=spir64_gen "-device bmg-g21-a0 -internal_options -cl-intel-256-GRF-per-thread") endif() if(ONEDNN_FOUND) @@ -305,6 +315,8 @@ define_gpu_extension_target( ARCHITECTURES ${VLLM_GPU_ARCHES} INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR} INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR} + INCLUDE_DIRECTORIES ${CUTLASS_APP_INCLUDE_DIR} + INCLUDE_DIRECTORIES ${VLLM_INCLUDE_DIR} USE_SABI 3 WITH_SOABI) diff --git a/csrc/core/registration.h b/csrc/core/registration.h index 9dbf34b..576b5e1 100644 --- a/csrc/core/registration.h +++ b/csrc/core/registration.h @@ -1,5 +1,4 @@ #pragma once - #include #define _CONCAT(A, B) A##B diff --git a/csrc/xpu/cutlass_kernels/collective/gemm/default_gemm_universal.h b/csrc/xpu/cutlass_kernels/collective/gemm/default_gemm_universal.h new file mode 100644 index 0000000..f2743bf --- /dev/null +++ b/csrc/xpu/cutlass_kernels/collective/gemm/default_gemm_universal.h @@ -0,0 +1,306 @@ 
+/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Default kernel-level GEMM definitions combine threadblock-scoped matrix + multiply-add with the appropriate threadblock-scoped epilogue. + + Note, CUTLASS epilogues universally target row-major outputs. Column-major + outputs are accommodated by exchanging A and B operands and assuming + transposed layouts. Partial specializations here choose + 'device::GemmTransposed' to implement this functionality. 
+ +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/complex.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/numeric_types.h" + +#include "gemm_universal_k.h" +#include "cutlass/gemm/kernel/gemm_universal_streamk.h" +#include "cutlass/gemm/kernel/default_gemm.h" +#include "cutlass/gemm/kernel/default_gemm_complex.h" + +#include "cutlass/layout/permute.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + /// Element type for A matrix operand + typename ElementA_, + /// Layout type for A matrix operand + typename LayoutA_, + /// Complex elementwise transformation on A operand + ComplexTransform TransformA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB_, + /// Layout type for B matrix operand + typename LayoutB_, + /// Complex elementwise transformation on B operand + ComplexTransform TransformB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for C and D matrix operands + typename ElementC_, + /// Layout type for C and D matrix operands + typename LayoutC_, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Operator class tag + typename OperatorClass, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction tile size (concept: GemmShape) + typename InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Operation performed by GEMM + typename Operator, + /// Use zfill or predicate for out-of-bound cp.async + SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone, + /// Gather operand A by using an index array + bool GatherA = false, + /// Gather operand B by using an index array + bool GatherB = false, + /// Scatter result D by using an index array + bool ScatterD = false, + /// Permute result D + typename PermuteDLayout = layout::NoPermute, + /// Permute operand A + typename PermuteALayout_ = layout::NoPermute, + /// Permute operand B + typename PermuteBLayout_ = layout::NoPermute, + /// + typename Enable = void> +struct DefaultGemmUniversal; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Real-valued GEMM kernels +// + +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for C and D matrix operands + typename ElementC, + /// Layout type for C and D matrix operands + typename LayoutC, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Operator class tag + typename OperatorClass, + /// Tag indicating architecture to tune for + typename ArchTag, + /// 
Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Warp-level tile size (concept: GemmShape) + typename InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Operation performed by GEMM + typename Operator, + /// Use zfill or predicate for out-of-bound cp.async + SharedMemoryClearOption SharedMemoryClear, + /// Gather operand A by using an index array + bool GatherA, + /// Gather operand B by using an index array + bool GatherB, + /// Scatter result D by using an index array + bool ScatterD, + /// Permute result D + typename PermuteDLayout, + /// Permute operand A + typename PermuteALayout, + /// Permute operand B + typename PermuteBLayout> +struct DefaultGemmUniversal< + ElementA, LayoutA, + ComplexTransform::kNone, // transform A + kAlignmentA, ElementB, LayoutB, + ComplexTransform::kNone, // transform B + kAlignmentB, ElementC, LayoutC, ElementAccumulator, OperatorClass, ArchTag, + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, + ThreadblockSwizzle, Stages, Operator, SharedMemoryClear, GatherA, GatherB, + ScatterD, PermuteDLayout, PermuteALayout, PermuteBLayout, + typename platform::enable_if< + !cutlass::is_complex::value>::type> { + using DefaultGemmKernel = typename kernel::DefaultGemm< + ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, ElementC, + LayoutC, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, + WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, + true, Operator, SharedMemoryClear, GatherA, GatherB, ScatterD, + PermuteDLayout, PermuteALayout, PermuteBLayout>::GemmKernel; + + /// Universal kernel without StreamkFeature member type + template + class SelectBase + : public kernel::GemmUniversal {}; + + /// Universal kernel with StreamkFeature member type + template + class SelectBase + : public kernel::GemmUniversalStreamk< + typename DefaultGemmKernel::Mma, + typename DefaultGemmKernel::Epilogue, SwizzleT> {}; + + /// Select kernel by ThreadblockSwizzle's support for StreamkFeature + using GemmKernel = SelectBase; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// +// Complex-valued GEMM kernels +// + +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Complex elementwise transformation on A operand + ComplexTransform TransformA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Complex elementwise transformation on B operand + ComplexTransform TransformB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for C and D matrix operands + typename ElementC, + /// Layout type for C and D matrix operands + typename LayoutC, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Operator class tag + typename OperatorClass, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Warp-level tile size (concept: 
GemmShape) + typename InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Operation performed by GEMM + typename Operator, + /// Use zfill or predicate for out-of-bound cp.async + SharedMemoryClearOption SharedMemoryClear> +struct DefaultGemmUniversal< + ElementA, LayoutA, TransformA, kAlignmentA, ElementB, LayoutB, TransformB, + kAlignmentB, ElementC, LayoutC, ElementAccumulator, OperatorClass, ArchTag, + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, + ThreadblockSwizzle, Stages, Operator, SharedMemoryClear, false, false, + false, layout::NoPermute, layout::NoPermute, layout::NoPermute, + typename platform::enable_if< + cutlass::is_complex::value>::type> { + using DefaultGemmKernel = typename kernel::DefaultGemmComplex< + ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, + ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, + InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, + TransformA, TransformB, Operator, false>::GemmKernel; + + /// Universal kernel without StreamkFeature member type + template + class SelectBase + : public kernel::GemmUniversal {}; + + /// Universal kernel with StreamkFeature member type + template + class SelectBase + : public kernel::GemmUniversalStreamk< + typename DefaultGemmKernel::Mma, + typename DefaultGemmKernel::Epilogue, SwizzleT> {}; + + /// Select kernel by ThreadblockSwizzle's support for StreamkFeature + using GemmKernel = SelectBase; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/xpu/cutlass_kernels/collective/gemm/gemm_universal.h b/csrc/xpu/cutlass_kernels/collective/gemm/gemm_universal.h new file mode 100644 index 0000000..411f673 --- /dev/null +++ b/csrc/xpu/cutlass_kernels/collective/gemm/gemm_universal.h @@ -0,0 +1,366 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief +*/ + +#pragma once + +#include "cutlass/arch/mma.h" +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" +#include "cutlass/arch/arch.h" +#include "cutlass/device_kernel.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" +#include "gemm_universal_k.h" + +#include "default_gemm_universal.h" +#include "cutlass/gemm/device/default_gemm_configuration.h" +#include "gemm_universal_base.h" + +#include "cutlass/layout/permute.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace device { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/*! + GemmUniversal is a stateful, reusable GEMM handle. Once initialized for a + given GEMM computation (problem geometry and data references), it can be + reused across different GEMM problems having the geometry. (Once initialized, + details regarding problem geometry and references to workspace memory cannot + be updated.) + + The universal GEMM accommodates serial reductions, parallel reductions, + batched strided, and batched array variants. +*/ +template < + /// Element type for A matrix operand + typename ElementA_, + /// Layout type for A matrix operand + typename LayoutA_, + /// Element type for B matrix operand + typename ElementB_, + /// Layout type for B matrix operand + typename LayoutB_, + /// Element type for C and D matrix operands + typename ElementC_, + /// Layout type for C and D matrix operands + typename LayoutC_, + /// Element type for internal accumulation + typename ElementAccumulator_ = ElementC_, + /// Operator class tag + typename OperatorClass_ = arch::OpClassSimt, + /// Tag indicating architecture to tune for. This is the minimum SM that + /// supports the intended feature. The device kernel can be built + /// targeting any SM larger than this number. 
+ typename ArchTag_ = arch::Sm70, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape_ = typename DefaultGemmConfiguration< + OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, + ElementAccumulator_>::ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape_ = typename DefaultGemmConfiguration< + OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, + ElementAccumulator_>::WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape_ = typename DefaultGemmConfiguration< + OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, + ElementAccumulator_>::InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp_ = typename DefaultGemmConfiguration< + OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, + ElementAccumulator_>::EpilogueOutputOp, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle_ = + threadblock::GemmIdentityThreadblockSwizzle<>, + /// Number of stages used in the pipelined mainloop + int Stages = + DefaultGemmConfiguration::kStages, + /// Access granularity of A matrix in units of elements + int AlignmentA = + DefaultGemmConfiguration::kAlignmentA, + /// Access granularity of B matrix in units of elements + int AlignmentB = + DefaultGemmConfiguration::kAlignmentB, + /// Operation performed by GEMM + typename Operator_ = typename DefaultGemmConfiguration< + OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, + ElementAccumulator_>::Operator, + /// Complex elementwise transformation on A operand + ComplexTransform TransformA = ComplexTransform::kNone, + /// Complex elementwise transformation on B operand + ComplexTransform TransformB = ComplexTransform::kNone, + /// Gather operand A by using an index array + bool GatherA = false, + /// Gather operand B by using an index array + bool GatherB = false, + /// Scatter result D by using an index array + bool ScatterD = false, + /// Permute result D + typename PermuteDLayout_ = layout::NoPermute, + /// Permute operand A + typename PermuteALayout_ = layout::NoPermute, + /// Permute operand B + typename PermuteBLayout_ = layout::NoPermute> +class GemmUniversal + : public GemmUniversalBase::GemmKernel> { + public: + using ElementAccumulator = ElementAccumulator_; + using OperatorClass = OperatorClass_; + using ArchTag = ArchTag_; + using ThreadblockShape = ThreadblockShape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using EpilogueOutputOp = EpilogueOutputOp_; + using ThreadblockSwizzle = ThreadblockSwizzle_; + using Operator = Operator_; + using PermuteDLayout = PermuteDLayout_; + using PermuteALayout = PermuteALayout_; + using PermuteBLayout = PermuteBLayout_; + static int const kStages = Stages; + static int const kAlignmentA = AlignmentA; + static int const kAlignmentB = AlignmentB; + static int const kAlignmentC = EpilogueOutputOp::kCount; + static ComplexTransform const kTransformA = TransformA; + static ComplexTransform const kTransformB = TransformB; + + using Base = GemmUniversalBase::GemmKernel>; + + using Arguments = typename Base::Arguments; + using GemmKernel = typename Base::GemmKernel; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for column-major output exchanges problem size and +/// operand. 
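The partial specialization introduced by the comment above (and defined immediately below) relies on the usual transpose identity: a GEMM whose output is column-major is the same computation as a row-major GEMM on swapped, transposed operands. As a worked equation (using the customary linear-combination epilogue scalars as notation; this equation is an editorial illustration, not text from the patch):

```latex
% D, C stored column-major:
D = \alpha\, A B + \beta\, C
% is evaluated as the row-major problem
D^{\mathsf T} = \alpha\, B^{\mathsf T} A^{\mathsf T} + \beta\, C^{\mathsf T}
```

This is why the specialization's UnderlyingOperator names ElementB/ElementA, kAlignmentB/kAlignmentA, GatherB/GatherA, and PermuteBLayout/PermuteALayout in swapped order and applies LayoutTranspose to the A and B layouts.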
+template < + /// Element type for A matrix operand + typename ElementA_, + /// Layout type for A matrix operand + typename LayoutA_, + /// Element type for B matrix operand + typename ElementB_, + /// Layout type for B matrix operand + typename LayoutB_, + /// Element type for C and D matrix operands + typename ElementC_, + /// Element type for internal accumulation + typename ElementAccumulator_, + /// Operator class tag + typename OperatorClass_, + /// Tag indicating architecture to tune for. This is the minimum SM that + /// supports the intended feature. The device kernel can be built + /// targeting any SM larger than this number. + typename ArchTag_, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape_, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape_, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape_, + /// Epilogue output operator + typename EpilogueOutputOp_, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle_, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Access granularity of A matrix in units of elements + int AlignmentA, + /// Access granularity of B matrix in units of elements + int AlignmentB, + /// Operation performed by GEMM + typename Operator_, + /// Complex elementwise transformation on A operand + ComplexTransform TransformA, + /// Complex elementwise transformation on B operand + ComplexTransform TransformB, + /// Gather operand A by using an index array + bool GatherA, + /// Gather operand B by using an index array + bool GatherB, + /// Scatter result D by using an index array + bool ScatterD, + /// Permute result D + typename PermuteDLayout_, + /// Permute operand A + typename PermuteALayout_, + /// Permute operand B + typename PermuteBLayout_> +class GemmUniversal< + ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, + layout::ColumnMajor, // partially specialized on LayoutC + ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, + WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, + Stages, AlignmentA, AlignmentB, Operator_, TransformA, TransformB, GatherA, + GatherB, ScatterD, PermuteDLayout_, PermuteALayout_, PermuteBLayout_> { + public: + using ElementA = ElementA_; + using LayoutA = LayoutA_; + using TensorRefA = TensorRef; + using ElementB = ElementB_; + using LayoutB = LayoutB_; + using TensorRefB = TensorRef; + using ElementC = ElementC_; + using LayoutC = layout::ColumnMajor; + using TensorRefC = TensorRef; + using TensorRefD = TensorRef; + using ElementAccumulator = ElementAccumulator_; + using OperatorClass = OperatorClass_; + using ArchTag = ArchTag_; + using ThreadblockShape = ThreadblockShape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using EpilogueOutputOp = EpilogueOutputOp_; + using ThreadblockSwizzle = ThreadblockSwizzle_; + using Operator = Operator_; + using PermuteDLayout = PermuteDLayout_; + using PermuteALayout = PermuteALayout_; + using PermuteBLayout = PermuteBLayout_; + static int const kStages = Stages; + static int const kAlignmentA = AlignmentA; + static int const kAlignmentB = AlignmentB; + static ComplexTransform const kTransformA = TransformA; + static ComplexTransform const kTransformB = TransformB; + + using UnderlyingOperator = typename GemmUniversal< + ElementB, typename layout::LayoutTranspose::type, ElementA, + typename layout::LayoutTranspose::type, ElementC, + layout::RowMajor, ElementAccumulator, OperatorClass, 
ArchTag, + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, + ThreadblockSwizzle, Stages, kAlignmentB, kAlignmentA, Operator, + kTransformB, kTransformA, GatherB, GatherA, ScatterD, PermuteDLayout, + PermuteBLayout, PermuteALayout>::Base; + + using GemmKernel = typename UnderlyingOperator::GemmKernel; + static int const kAlignmentC = EpilogueOutputOp::kCount; + + /// Argument structure + using Arguments = typename UnderlyingOperator::Arguments; + + private: + UnderlyingOperator underlying_operator_; + + public: + /// Constructs the GEMM. + GemmUniversal() {} + + /// Helper to construct a transposed equivalent for the underlying GEMM + /// operator + static Arguments to_underlying_arguments(Arguments const& args) { + return args.transposed_problem(); + } + + /// Determines whether the GEMM can execute the given problem. + static Status can_implement(Arguments const& args) { + return UnderlyingOperator::can_implement(to_underlying_arguments(args)); + } + + /// Gets the workspace size + static size_t get_workspace_size(Arguments const& args) { + return UnderlyingOperator::get_workspace_size( + to_underlying_arguments(args)); + } + + /// Computes the grid shape + static dim3 get_grid_shape(Arguments const& args) { + return UnderlyingOperator::get_grid_shape(to_underlying_arguments(args)); + } + + /// Computes the maximum number of active blocks per multiprocessor + static int maximum_active_blocks(int smem_capacity = -1) { + return UnderlyingOperator::maximum_active_blocks(smem_capacity); + } + + /// Initializes GEMM state from arguments. + Status initialize(Arguments const& args, void* workspace = nullptr, + cudaStream_t stream = nullptr) { + return underlying_operator_.initialize(to_underlying_arguments(args), + workspace, stream); + } + + /// Lightweight update given a subset of arguments + Status update(Arguments const& args, void* workspace = nullptr) { + return underlying_operator_.update(to_underlying_arguments(args), + workspace); + } + + /// Runs the kernel using initialized state. + Status run(cudaStream_t stream = nullptr) { + return underlying_operator_.run(stream); + } + + /// Runs the kernel using initialized state. + Status operator()(cudaStream_t stream = nullptr) { return run(stream); } + + /// Runs the kernel using initialized state. + Status operator()(Arguments const& args, void* workspace = nullptr, + cudaStream_t stream = nullptr) { + Status status = initialize(args, workspace, stream); + + if (status == Status::kSuccess) { + status = run(stream); + } + + return status; + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace device +} // namespace gemm +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/xpu/cutlass_kernels/collective/gemm/gemm_universal.hpp b/csrc/xpu/cutlass_kernels/collective/gemm/gemm_universal.hpp new file mode 100644 index 0000000..3b59cc8 --- /dev/null +++ b/csrc/xpu/cutlass_kernels/collective/gemm/gemm_universal.hpp @@ -0,0 +1,57 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include "cutlass/gemm/kernel/gemm_universal_decl.h" +#include "cutlass/gemm/kernel/tile_scheduler.hpp" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::kernel { + +// In cases where ProblemShape is not a tuple, this is used to check if the +// underlying problem shape type is aliased within or not. +// Used for dispatching GemmUniversal to 2.x API or 3.x API +template +struct IsCutlass3ArrayKernel : cute::false_type {}; + +template +struct IsCutlass3ArrayKernel< + ProblemShape, cute::void_t> + : cute::true_type {}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm::kernel + +//////////////////////////////////////////////////////////////////////////////// +#include "xe_gemm_array_cooperative.hpp" diff --git a/csrc/xpu/cutlass_kernels/collective/gemm/gemm_universal_adapter.h b/csrc/xpu/cutlass_kernels/collective/gemm/gemm_universal_adapter.h new file mode 100644 index 0000000..0c923e8 --- /dev/null +++ b/csrc/xpu/cutlass_kernels/collective/gemm/gemm_universal_adapter.h @@ -0,0 +1,844 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! + \file + \brief The universal GEMM accommodates serial reductions, parallel reductions, + batched strided, and batched array variants. +*/ + +#pragma once + +// common +#include "cutlass/cutlass.h" +#include "cutlass/device_kernel.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/detail/layout.hpp" +#include "cutlass/detail/mma.hpp" +#include "cutlass/cuda_host_adapter.hpp" + +#include "cutlass/kernel_launch.h" +#if !defined(__CUDACC_RTC__) + #include "cutlass/cluster_launch.hpp" + #include "cutlass/trace.h" +#endif // !defined(__CUDACC_RTC__) + +// 2.x +#include "gemm_universal_base.h" +#include "cutlass/gemm/kernel/gemm_transpose_operands.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" +#include "cutlass/epilogue/threadblock/epilogue_with_visitor_callbacks.h" + +// 3.x +#include "gemm_universal.hpp" + +#if defined(CUTLASS_ENABLE_SYCL) + #include "cutlass/util/sycl_event_manager.hpp" +#endif + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::device { + +//////////////////////////////////////////////////////////////////////////////// + +/*! + GemmUniversalAdapter is a stateful, reusable GEMM handle built around a kernel + of type cutlass::gemm::kernel::Gemm or cutlass::gemm::kernel::GemmUniversal. + + It manages the lifetime of the underlying `kernel::Params` struct, and exposes + APIs to create it from the host facing arguments. For power users, new static + methods are exposed in 3.x APIs that bypass the stateful methods or + args->params lowering. + + It supports kernel types that implement both the 2.x and 3.0 APIs, + however, this is done by specializing the implementation of + GemmUniversalAdapter on the two kernel API types, and thus, + GemmUniversalAdapter's behaviour might differ between the two specializations. +*/ +template +class GemmUniversalAdapter; + +//////////////////////////////////////////////////////////////////////////////// +////////////////////////////// CUTLASS 3.x API ///////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +// Work-around for some DispatchPolicy types not having a Stages member. +// In that case, the Stages value is 0. Most code should static_assert +// that the number of stages is valid. + +// Whether DispatchPolicy::Stages is valid. 
+// It should also be convertible to int, but if not, that will show up +// as a build error when GemmUniversalAdapter attempts to assign it to kStages. +template +struct has_Stages : cute::false_type {}; + +template +struct has_Stages> + : cute::true_type {}; + +template +constexpr int stages_member(DispatchPolicy) { + if constexpr (has_Stages::value) { + return DispatchPolicy::Stages; + } else { + return 0; + } +} + +} // namespace detail + +template +class GemmUniversalAdapter>::value>> { + public: + using GemmKernel = GetUnderlyingKernel_t; + using TileShape = typename GemmKernel::TileShape; + using ElementA = typename GemmKernel::ElementA; + using ElementB = typename GemmKernel::ElementB; + using ElementC = typename GemmKernel::ElementC; + using ElementD = typename GemmKernel::ElementD; + using ElementAccumulator = typename GemmKernel::ElementAccumulator; + using DispatchPolicy = typename GemmKernel::DispatchPolicy; + using CollectiveMainloop = typename GemmKernel::CollectiveMainloop; + using CollectiveEpilogue = typename GemmKernel::CollectiveEpilogue; + + // Map back to 2.x type as best as possible + using LayoutA = + gemm::detail::StrideToLayoutTagA_t; + using LayoutB = + gemm::detail::StrideToLayoutTagB_t; + using LayoutC = + gemm::detail::StrideToLayoutTagC_t; + using LayoutD = + gemm::detail::StrideToLayoutTagC_t; + + static bool const kEnableCudaHostAdapter = CUTLASS_ENABLE_CUDA_HOST_ADAPTER; + + static ComplexTransform const kTransformA = + cute::is_same_v + ? ComplexTransform::kConjugate + : ComplexTransform::kNone; + static ComplexTransform const kTransformB = + cute::is_same_v + ? ComplexTransform::kConjugate + : ComplexTransform::kNone; + + // Legacy: Assume MultiplyAdd only since we do not use this tag type in 3.0 + using MathOperator = cutlass::arch::OpMultiplyAdd; + + using OperatorClass = cutlass::detail::get_operator_class_t< + typename CollectiveMainloop::TiledMma>; + + using ArchTag = typename GemmKernel::ArchTag; + + // NOTE: Assume identity swizzle for now + using ThreadblockSwizzle = + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; + + // Assume TiledMma's ShapeMNK is the same as 2.x's ThreadblockShape + using ThreadblockShape = cutlass::gemm::GemmShape(TileShape{}), + cute::size<1>(TileShape{}), + cute::size<2>(TileShape{})>; + + using ClusterShape = cutlass::gemm::GemmShape< + cute::size<0>(typename GemmKernel::DispatchPolicy::ClusterShape{}), + cute::size<1>(typename GemmKernel::DispatchPolicy::ClusterShape{}), + cute::size<2>(typename GemmKernel::DispatchPolicy::ClusterShape{})>; + + // Instruction shape is easy too, since we get that directly from our + // TiledMma's atom shape + using InstructionShape = cutlass::gemm::GemmShape< + cute::size<0>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{}), + cute::size<1>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{}), + cute::size<2>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{})>; + + // Legacy: provide a correct warp count, but no reliable warp shape + static int const kThreadCount = GemmKernel::MaxThreadsPerBlock; + + // Warp shape is not a primary API type in 3.x + // But we can best approximate it by inspecting the TiledMma + // For this, we make the assumption that we always have 4 warps along M, and + // rest along N, none along K We also always round up the warp count to 4 if + // the tiled mma is smaller than 128 threads + static constexpr int WarpsInMma = cute::max( + 4, CUTE_STATIC_V(cute::size(typename GemmKernel::TiledMma{})) / 32); + static constexpr int WarpsInMmaM 
= 4; + static constexpr int WarpsInMmaN = cute::ceil_div(WarpsInMma, WarpsInMmaM); + using WarpCount = cutlass::gemm::GemmShape; + using WarpShape = + cutlass::gemm::GemmShape( + typename CollectiveMainloop::TiledMma{})) / + WarpsInMmaM, + CUTE_STATIC_V(cute::tile_size<1>( + typename CollectiveMainloop::TiledMma{})) / + WarpsInMmaN, + CUTE_STATIC_V(cute::tile_size<2>( + typename CollectiveMainloop::TiledMma{}))>; + + static int constexpr kStages = + detail::stages_member(typename CollectiveMainloop::DispatchPolicy{}); + + // Inspect TiledCopy for A and B to compute the alignment size + static int constexpr kAlignmentA = + cutlass::detail::get_alignment_count_from_gmem_tiled_copy< + typename CollectiveMainloop::GmemTiledCopyA, ElementA, + typename CollectiveMainloop::TiledMma::ValTypeA>(); + static int constexpr kAlignmentB = + cutlass::detail::get_alignment_count_from_gmem_tiled_copy< + typename CollectiveMainloop::GmemTiledCopyB, ElementB, + typename CollectiveMainloop::TiledMma::ValTypeB>(); + static int constexpr kAlignmentC = + cutlass::detail::get_alignment_count_from_gmem_tiled_copy< + typename CollectiveEpilogue::GmemTiledCopyC, ElementC>(); + static int constexpr kAlignmentD = + cutlass::detail::get_alignment_count_from_gmem_tiled_copy< + typename CollectiveEpilogue::GmemTiledCopyD, ElementD>(); + + using EpilogueOutputOp = typename CollectiveEpilogue::ThreadEpilogueOp; + + // Split-K preserves splits that are 128b aligned + static int constexpr kSplitKAlignment = cute::max( + 128 / sizeof_bits::value, 128 / sizeof_bits::value); + + /// Argument structure: User API + using Arguments = typename GemmKernel::Arguments; + /// Argument structure: Kernel API + using Params = typename GemmKernel::Params; + + private: + /// Kernel API parameters object + Params params_; + + public: + /// Access the Params structure + Params const& params() const { return params_; } + + /// Determines whether the GEMM can execute the given problem. 
+ static Status can_implement(Arguments const& args) { + if (GemmKernel::can_implement(args)) { + return Status::kSuccess; + } else { + return Status::kInvalid; + } + } + + /// Gets the workspace size + static size_t get_workspace_size(Arguments const& args) { + size_t workspace_bytes = 0; + if (args.mode == GemmUniversalMode::kGemmSplitKParallel) { + workspace_bytes += sizeof(int) * size_t(cute::size<0>(TileShape{})) * + size_t(cute::size<1>(TileShape{})); + } + + workspace_bytes += GemmKernel::get_workspace_size(args); + + CUTLASS_TRACE_HOST(" workspace_bytes: " << workspace_bytes); + + return workspace_bytes; + } + + /// Computes the grid shape + static dim3 get_grid_shape(Arguments const& args, void* workspace = nullptr) { + auto tmp_params = GemmKernel::to_underlying_arguments(args, workspace); + return GemmKernel::get_grid_shape(tmp_params); + } + + /// Computes the grid shape + static dim3 get_grid_shape(Params const& params) { + return GemmKernel::get_grid_shape(params); + } + + /// Computes the maximum number of active blocks per multiprocessor + static int maximum_active_blocks(int /* smem_capacity */ = -1) { + CUTLASS_TRACE_HOST("GemmUniversal::maximum_active_blocks()"); + int max_active_blocks = -1; + int smem_size = GemmKernel::SharedStorageSize; + + // first, account for dynamic smem capacity if needed + cudaError_t result; + if (smem_size >= (48 << 10)) { + CUTLASS_TRACE_HOST(" Setting smem size to " << smem_size); + result = cudaFuncSetAttribute(device_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size); + if (cudaSuccess != result) { + result = cudaGetLastError(); // to clear the error bit + CUTLASS_TRACE_HOST(" cudaFuncSetAttribute() returned error: " + << cudaGetErrorString(result)); + return -1; + } + } + + // query occupancy after setting smem size + result = cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks, device_kernel, + GemmKernel::MaxThreadsPerBlock, smem_size); + + if (cudaSuccess != result) { + result = cudaGetLastError(); // to clear the error bit + CUTLASS_TRACE_HOST( + " cudaOccupancyMaxActiveBlocksPerMultiprocessor() returned error: " + << cudaGetErrorString(result)); + return -1; + } + + CUTLASS_TRACE_HOST(" max_active_blocks: " << max_active_blocks); + return max_active_blocks; + } + + /// Initializes GEMM state from arguments. + Status initialize(Arguments const& args, void* workspace = nullptr, + cudaStream_t stream = nullptr, + CudaHostAdapter* cuda_adapter = nullptr) { + CUTLASS_TRACE_HOST("GemmUniversal::initialize() - workspace " + << workspace + << ", stream: " << (stream ? "non-null" : "null")); + + // Initialize the workspace + Status status = + GemmKernel::initialize_workspace(args, workspace, stream, cuda_adapter); + if (status != Status::kSuccess) { + return status; + } + // Initialize the Params structure + params_ = GemmKernel::to_underlying_arguments(args, workspace); + // Don't set the function attributes - require the CudaHostAdapter to set + // it. 
+ if constexpr (kEnableCudaHostAdapter) { + CUTLASS_ASSERT(cuda_adapter); + return Status::kSuccess; + } else { + // + // Account for dynamic smem capacity if needed + // + int smem_size = GemmKernel::SharedStorageSize; + + CUTLASS_ASSERT(cuda_adapter == nullptr); + +#if !defined(CUTLASS_ENABLE_SYCL) + if (smem_size >= (48 << 10)) { + CUTLASS_TRACE_HOST(" Setting smem size to " << smem_size); + cudaError_t result = cudaFuncSetAttribute( + device_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); + if (cudaSuccess != result) { + result = cudaGetLastError(); // to clear the error bit + CUTLASS_TRACE_HOST(" cudaFuncSetAttribute() returned error: " + << cudaGetErrorString(result)); + return Status::kErrorInternal; + } + } +#endif + } + return Status::kSuccess; + } + + /// Update API is preserved in 3.0, but does not guarantee a lightweight + /// update of params. + Status update(Arguments const& args, void* workspace = nullptr) { + CUTLASS_TRACE_HOST("GemmUniversal()::update() - workspace: " << workspace); + + size_t workspace_bytes = get_workspace_size(args); + if (workspace_bytes > 0 && nullptr == workspace) { + return Status::kErrorWorkspaceNull; + } + + params_ = GemmKernel::to_underlying_arguments(args, workspace); + return Status::kSuccess; + } + + /// Primary run() entry point API that is static allowing users to create and + /// manage their own params. Supplied params struct must be construct by + /// calling GemmKernel::to_underlying_arguments() + static Status run(Params& params, sycl::queue& stream, + CudaHostAdapter* cuda_adapter = nullptr, + bool launch_with_pdl = false) { + CUTLASS_TRACE_HOST("GemmUniversal::run()"); + dim3 const block = GemmKernel::get_block_shape(); + dim3 const grid = get_grid_shape(params); + +#if defined(CUTLASS_ENABLE_SYCL) + const syclcompat::dim3 sycl_block(block.x, block.y, block.z); + const syclcompat::dim3 sycl_grid(grid.x, grid.y, grid.z); +#endif + + // configure smem size and carveout + int smem_size = GemmKernel::SharedStorageSize; + + Status launch_result{Status::kSuccess}; + // Use extended launch API only for mainloops that use it + if constexpr (GemmKernel::ArchTag::kMinComputeCapability >= 90) { +#if (CUTLASS_DEBUG_TRACE_LEVEL > 1) + CUTLASS_TRACE_HOST("GemmUniversal::run: Use extended launch API"); +#endif +#if !defined(CUTLASS_ENABLE_SYCL) + [[maybe_unused]] constexpr bool is_static_1x1x1 = + cute::is_static_v< + typename GemmKernel::DispatchPolicy::ClusterShape> and + cute::size(typename GemmKernel::DispatchPolicy::ClusterShape{}) == 1; + [[maybe_unused]] dim3 cluster( + cute::size<0>(typename GemmKernel::DispatchPolicy::ClusterShape{}), + cute::size<1>(typename GemmKernel::DispatchPolicy::ClusterShape{}), + cute::size<2>(typename GemmKernel::DispatchPolicy::ClusterShape{})); + + // Dynamic cluster support + [[maybe_unused]] dim3 fallback_cluster = dim3{0, 0, 0}; + if constexpr (GemmKernel::ArchTag::kMinComputeCapability == 100 || + GemmKernel::ArchTag::kMinComputeCapability == 101) { + if constexpr (!cute::is_static_v< + typename GemmKernel::DispatchPolicy::ClusterShape>) { + fallback_cluster = params.hw_info.cluster_shape_fallback; + cluster = params.hw_info.cluster_shape; + } + } + + [[maybe_unused]] void* kernel_params[] = {¶ms}; + + if constexpr (kEnableCudaHostAdapter) { + // + // Use the cuda host adapter + // + CUTLASS_ASSERT(cuda_adapter); + if (cuda_adapter) { + if (launch_with_pdl) { + CUTLASS_TRACE_HOST( + "GemmUniversal::run() does not support launching with PDL and " + "a custom cuda adapter."); + return 
Status::kErrorInternal; + } + #if (CUTLASS_DEBUG_TRACE_LEVEL > 1) + CUTLASS_TRACE_HOST( + "GemmUniversal::run: Launching kernel with CUDA host adapter"); + #endif + if constexpr (is_static_1x1x1) { + launch_result = cuda_adapter->launch(grid, block, smem_size, stream, + kernel_params, 0); + } else { + launch_result = + cuda_adapter->launch(grid, cluster, fallback_cluster, block, + smem_size, stream, kernel_params, 0); + } + } else { + CUTLASS_TRACE_HOST( + "GemmUniversal::run: kEnableCudaHostAdapter is true, but CUDA " + "host adapter is null"); + return Status::kErrorInternal; + } + } else { + CUTLASS_ASSERT(cuda_adapter == nullptr); + [[maybe_unused]] void const* kernel = + (void const*)device_kernel; + static constexpr bool kClusterLaunch = + GemmKernel::ArchTag::kMinComputeCapability == 90; + if constexpr (kClusterLaunch) { + if constexpr (is_static_1x1x1) { + #if (CUTLASS_DEBUG_TRACE_LEVEL > 1) + CUTLASS_TRACE_HOST( + "GemmUniversal::run: Launching static 1x1x1 kernel"); + #endif + launch_result = cutlass::kernel_launch( + grid, block, smem_size, stream, params, launch_with_pdl); + if (launch_result != Status::kSuccess) { + CUTLASS_TRACE_HOST( + "GemmUniversal::run: cutlass::kernel_launch reports failure"); + } + #if (CUTLASS_DEBUG_TRACE_LEVEL > 1) + else { + CUTLASS_TRACE_HOST( + "GemmUniversal::run: cutlass::kernel_launch reports success"); + } + #endif + } else { + #if (CUTLASS_DEBUG_TRACE_LEVEL > 1) + CUTLASS_TRACE_HOST( + "GemmUniversal::run: Launching dynamic cluster kernel"); + #endif + launch_result = + ClusterLauncher::launch(grid, cluster, block, smem_size, stream, + kernel, kernel_params, launch_with_pdl); + } + } + + else { + if constexpr (GemmKernel::ArchTag::kMinComputeCapability == 100 || + GemmKernel::ArchTag::kMinComputeCapability == 101 || + GemmKernel::ArchTag::kMinComputeCapability == 120) { + if constexpr (is_static_1x1x1) { + #if (CUTLASS_DEBUG_TRACE_LEVEL > 1) + CUTLASS_TRACE_HOST( + "GemmUniversal::run: Launching static 1x1x1 kernel"); + #endif + launch_result = cutlass::kernel_launch( + grid, block, smem_size, stream, params, launch_with_pdl); + if (launch_result != Status::kSuccess) { + CUTLASS_TRACE_HOST( + "GemmUniversal::run: cutlass::kernel_launch reports " + "failure"); + } + #if (CUTLASS_DEBUG_TRACE_LEVEL > 1) + else { + CUTLASS_TRACE_HOST( + "GemmUniversal::run: cutlass::kernel_launch reports " + "success"); + } + #endif + } else { + #if (CUTLASS_DEBUG_TRACE_LEVEL > 1) + CUTLASS_TRACE_HOST( + "GemmUniversal::run: Launching kernel with fall-back " + "cluster"); + #endif + launch_result = ClusterLauncher::launch_with_fallback_cluster( + grid, cluster, fallback_cluster, block, smem_size, stream, + kernel, kernel_params, launch_with_pdl); + } + } + } + } +#endif + } else { + launch_result = Status::kSuccess; + cutlass::arch::synclog_setup(); + + if constexpr (kEnableCudaHostAdapter) { + CUTLASS_ASSERT(cuda_adapter); + if (cuda_adapter) { + void* kernel_params[] = {¶ms}; +#if (CUTLASS_DEBUG_TRACE_LEVEL > 1) + CUTLASS_TRACE_HOST( + "GemmUniversal::run: Launching kernel with CUDA host adapter"); +#endif + launch_result = cuda_adapter->launch(grid, block, smem_size, stream, + kernel_params, 0); + + } else { + CUTLASS_TRACE_HOST("GemmUniversal::run: CUDA host adapter is null"); + return Status::kErrorInternal; + } + } else { + CUTLASS_ASSERT(cuda_adapter == nullptr); +#if defined(CUTLASS_ENABLE_SYCL) + // sycl::queue q = stream; // ? 
*stream : + // syclcompat::get_default_queue(); + #if !defined(SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY) + using namespace syclcompat::experimental; + if constexpr (cute::is_same_v) { + auto event = launch>( + launch_policy{sycl_grid, sycl_block, + local_mem_size { + static_cast(smem_size) + }}, + q, params); + EventManager::getInstance().addEvent(event); + } else { + auto event = launch>( + launch_policy{ + sycl_grid, sycl_block, + local_mem_size{static_cast(smem_size)} + #if defined(SYCL_INTEL_TARGET) + , + kernel_properties { + sycl_exp::sub_group_size + } + #endif + }, + stream, params); + EventManager::getInstance().addEvent(event); + } + #else + #if defined(SYCL_INTEL_TARGET) + constexpr bool allow_subgroup_size_prop = true; + #else + constexpr bool allow_subgroup_size_prop = false; + #endif + auto kernel_props = [] { + constexpr bool is_device_agnostic = + cute::is_same_v; + if constexpr (!allow_subgroup_size_prop or is_device_agnostic) { + using EmptyProperties = + decltype(sycl::ext::oneapi::experimental::properties()); + return syclcompat::experimental::kernel_properties< + EmptyProperties>{}; + } else { + return syclcompat::experimental::kernel_properties{ + sycl::ext::oneapi::experimental::sub_group_size< + DispatchPolicy::SubgroupSize>}; + } + }(); + syclcompat::experimental::launch_properties launch_props{ + sycl::ext::oneapi::experimental::work_group_scratch_size(smem_size), + }; + syclcompat::experimental::launch_policy policy{ + sycl_grid, sycl_block, launch_props, kernel_props}; + auto event = + syclcompat::experimental::launch>( + policy, stream, params); + EventManager::getInstance().addEvent(event); + #endif // !defined(SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY) +#else + #if (CUTLASS_DEBUG_TRACE_LEVEL > 1) + CUTLASS_TRACE_HOST( + "GemmUniversal::run: Launching kernel with cutlass::kernel_launch"); + #endif + launch_result = cutlass::kernel_launch( + grid, block, smem_size, stream, params, launch_with_pdl); + if (launch_result != Status::kSuccess) { + CUTLASS_TRACE_HOST( + "GemmUniversal::run: cutlass::kernel_launch reports failure"); + } + #if (CUTLASS_DEBUG_TRACE_LEVEL > 1) + else { + CUTLASS_TRACE_HOST( + "GemmUniversal::run: cutlass::kernel_launch reports success"); + } + #endif +#endif + } + } + + cudaError_t result = cudaGetLastError(); + if (cudaSuccess == result && Status::kSuccess == launch_result) { +#if (CUTLASS_DEBUG_TRACE_LEVEL > 1) + CUTLASS_TRACE_HOST( + "GemmUniversal::run: cudaGetLastError reports success"); +#endif + return Status::kSuccess; + } else { + CUTLASS_TRACE_HOST(" Kernel launch failed. Reason: " << result); + return Status::kErrorInternal; + } + } + + // + // Non-static launch overloads that first create and set the internal params + // struct of this kernel handle. + // + + /// Launches the kernel after first constructing Params internal state from + /// supplied arguments. + Status run(Arguments const& args, void* workspace, sycl::queue& stream, + CudaHostAdapter* cuda_adapter = nullptr, + bool launch_with_pdl = false) { + Status status = initialize(args, workspace, stream, cuda_adapter); + + if (Status::kSuccess == status) { + status = run(params_, stream, cuda_adapter, launch_with_pdl); + } + return status; + } + + /// Launches the kernel after first constructing Params internal state from + /// supplied arguments. 
+ Status operator()(Arguments const& args, void* workspace, sycl::queue& stream, + CudaHostAdapter* cuda_adapter = nullptr, + bool launch_with_pdl = false) { + return run(args, workspace, stream, cuda_adapter, launch_with_pdl); + } + + /// Overload that allows a user to re-launch the same kernel without updating + /// internal params struct. + Status run(sycl::queue& stream, CudaHostAdapter* cuda_adapter = nullptr, + bool launch_with_pdl = false) { + return run(params_, stream, cuda_adapter, launch_with_pdl); + } + + /// Overload that allows a user to re-launch the same kernel without updating + /// internal params struct. + Status operator()(sycl::queue& stream, + CudaHostAdapter* cuda_adapter = nullptr, + bool launch_with_pdl = false) { + return run(params_, stream, cuda_adapter, launch_with_pdl); + } +}; + +//////////////////////////////////////////////////////////////////////////////// +////////////////////////////// CUTLASS 2.x API ///////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// + +template +class GemmUniversalAdapter< + GemmKernel_, cute::enable_if_t>::value>> { + public: + using GemmKernel = GetUnderlyingKernel_t; + + static bool const kInternalTranspose = + !cutlass::epilogue::threadblock::detail::is_2x_evt_v< + typename GemmKernel::Epilogue> && // 2.x EVT does not require + // internal transpose + cute::is_same::value; + + using ThreadblockShape = typename GemmKernel::Mma::Shape; + using WarpShape = typename GemmKernel::WarpShape; + using InstructionShape = typename GemmKernel::InstructionShape; + + // warp-level, arch-level (instruction), math operator + using WarpMmaOperator = typename GemmKernel::Mma::Policy::Operator; + using ArchMmaOperator = typename WarpMmaOperator::ArchMmaOperator; + using MathOperator = typename WarpMmaOperator::MathOperator; + + // Operator class and arch tag extract bottom-up + // set it for top-level gemm device-level template + using OperatorClass = typename WarpMmaOperator::OperatorClass; + using ArchTag = typename WarpMmaOperator::ArchTag; + + // Type, layout, and complex transform deliberately exchanged with B + using MapArguments = kernel::detail::MapArguments< + typename GemmKernel::ElementA, typename GemmKernel::LayoutA, + GemmKernel::kTransformA, GemmKernel::kAlignmentA, + typename GemmKernel::ElementB, typename GemmKernel::LayoutB, + GemmKernel::kTransformB, GemmKernel::kAlignmentB, + typename GemmKernel::LayoutC, kInternalTranspose>; + + using ElementA = typename MapArguments::ElementA; + using LayoutA = typename MapArguments::LayoutA; + static ComplexTransform const kTransformA = MapArguments::kTransformA; + static int const kAlignmentA = MapArguments::kAlignmentA; + + using ElementB = typename MapArguments::ElementB; + using LayoutB = typename MapArguments::LayoutB; + static ComplexTransform const kTransformB = MapArguments::kTransformB; + static int const kAlignmentB = MapArguments::kAlignmentB; + + using ElementC = typename GemmKernel::ElementC; + using LayoutC = typename MapArguments::LayoutC; + static int const kAlignmentC = GemmKernel::kAlignmentC; + + // C and D same type for 2.x kernel + using ElementD = ElementC; + using LayoutD = LayoutC; + + using TensorRefA = TensorRef; + using TensorRefB = TensorRef; + using TensorRefC = TensorRef; + using TensorRefD = TensorRef; + + static int const kStages = GemmKernel::Mma::kStages; + + using EpilogueOutputOp = typename GemmKernel::EpilogueOutputOp; + using ElementAccumulator = typename EpilogueOutputOp::ElementAccumulator; + 
using ThreadblockSwizzle = typename GemmKernel::ThreadblockSwizzle; + using UnderlyingOperator = GemmUniversalBase; + using Arguments = typename UnderlyingOperator::Arguments; + + private: + UnderlyingOperator underlying_operator_; + + public: + /// Constructs the GEMM. + GemmUniversalAdapter() {} + + /// Helper to construct a transposed equivalent for the underlying GEMM + /// operator + static Arguments to_underlying_arguments(Arguments const& args) { + if (kInternalTranspose) { + return args.transposed_problem(); + } else { + return args; + } + } + + /// Determines whether the GEMM can execute the given problem. + static Status can_implement(Arguments const& args, + CudaHostAdapter* cuda_adapter = nullptr) { + return UnderlyingOperator::can_implement(to_underlying_arguments(args), + cuda_adapter); + } + + /// Gets the workspace size + static size_t get_workspace_size(Arguments const& args, + CudaHostAdapter* cuda_adapter = nullptr) { + return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args), + cuda_adapter); + } + + /// Computes the grid shape + static dim3 get_grid_shape(Arguments const& args) { + return UnderlyingOperator::get_grid_shape(to_underlying_arguments(args)); + } + + /// Computes the maximum number of active blocks per multiprocessor + static int maximum_active_blocks(int smem_capacity = -1) { + return UnderlyingOperator::maximum_active_blocks(smem_capacity); + } + + /// Initializes GEMM state from arguments. + Status initialize(Arguments const& args, void* workspace = nullptr, + cudaStream_t stream = nullptr, + CudaHostAdapter* cuda_adapter = nullptr) { + return underlying_operator_.initialize(to_underlying_arguments(args), + workspace, stream, cuda_adapter); + } + + /// Lightweight update given a subset of arguments. + Status update(Arguments const& args) { + return underlying_operator_.update(to_underlying_arguments(args)); + } + + /// Runs the kernel using initialized state. + Status run(cudaStream_t stream = nullptr, + CudaHostAdapter* cuda_adapter = nullptr) { + return underlying_operator_.run(stream, cuda_adapter); + } + + /// Runs the kernel using initialized state. + Status operator()(cudaStream_t stream = nullptr, + CudaHostAdapter* cuda_adapter = nullptr) { + return run(stream); + } + + /// Runs the kernel using initialized state. + Status operator()(Arguments const& args, void* workspace = nullptr, + cudaStream_t stream = nullptr, + CudaHostAdapter* cuda_adapter = nullptr) { + Status status = initialize(args, workspace, stream, cuda_adapter); + + if (status == Status::kSuccess) { + status = run(stream, cuda_adapter); + } + + return status; + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm::device + +//////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/xpu/cutlass_kernels/collective/gemm/gemm_universal_base.h b/csrc/xpu/cutlass_kernels/collective/gemm/gemm_universal_base.h new file mode 100644 index 0000000..b909318 --- /dev/null +++ b/csrc/xpu/cutlass_kernels/collective/gemm/gemm_universal_base.h @@ -0,0 +1,524 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! + \file + \brief The universal GEMM accommodates streamk, batched strided, and batched + array variants. +*/ + +#pragma once + +#if defined(__CUDACC_RTC__) + #include +#else + #include +#endif + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" +#include "cutlass/arch/arch.h" +#include "cutlass/device_kernel.h" +#include "cutlass/cuda_host_adapter.hpp" + +#include "cutlass/gemm/gemm.h" +#include "gemm_universal_k.h" + +#include "default_gemm_universal.h" +#include "cutlass/gemm/device/default_gemm_configuration.h" + +#include "cutlass/trace.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace device { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +class GemmUniversalBase { + public: + using GemmKernel = GemmKernel_; + + /// Boolean indicating whether the CudaHostAdapter is enabled + static bool const kEnableCudaHostAdapter = CUTLASS_ENABLE_CUDA_HOST_ADAPTER; + + using ThreadblockShape = typename GemmKernel::Mma::Shape; + + using ElementA = typename GemmKernel::ElementA; + using LayoutA = typename GemmKernel::LayoutA; + using TensorRefA = TensorRef; + static ComplexTransform const kTransformA = GemmKernel::kTransformA; + + using ElementB = typename GemmKernel::ElementB; + using LayoutB = typename GemmKernel::LayoutB; + using TensorRefB = TensorRef; + static ComplexTransform const kTransformB = GemmKernel::kTransformB; + + using ElementC = typename GemmKernel::ElementC; + using LayoutC = typename GemmKernel::LayoutC; + using TensorRefC = TensorRef; + using TensorRefD = TensorRef; + + /// Numerical accumulation element type + using ElementAccumulator = typename GemmKernel::Mma::ElementC; + + using EpilogueOutputOp = typename GemmKernel::EpilogueOutputOp; + using ThreadblockSwizzle = typename GemmKernel::ThreadblockSwizzle; + using Operator = typename GemmKernel::Operator; + + /// Argument structure + using Arguments = typename GemmKernel::Arguments; + + /// Index of the GEMM 
Kernel within the CudaHostAdapter + static int32_t const kGemmKernelIndex = 0; + + /// Kernel dynamic shared memory allocation requirement + /// Update the kernel function's shared memory configuration for the current + /// device + static constexpr size_t kSharedStorageSize = + sizeof(typename GemmKernel::SharedStorage); + + protected: + // + // Device properties (uniform across all instances of the current thread) + // + + // Device ordinal + CUTLASS_THREAD_LOCAL static int device_ordinal_; + + /// Device SM count + CUTLASS_THREAD_LOCAL static int device_sms_; + + /// Kernel SM occupancy (in thread blocks) + CUTLASS_THREAD_LOCAL static int sm_occupancy_; + + protected: + /// Initialize static thread-local members for the thread's current device, + /// if necessary. + static Status init_device_props() { + CUTLASS_TRACE_HOST("GemmUniversalBase::init_device_props()"); + + cudaError_t cudart_result; + + // Get current device ordinal + int current_ordinal; + cudart_result = cudaGetDevice(¤t_ordinal); + if (cudart_result != cudaSuccess) { + CUTLASS_TRACE_HOST(" cudaGetDevice() returned error " + << cudaGetErrorString(cudart_result)); + return Status::kErrorInternal; + } + + // Done if matches the current static member + if (current_ordinal == device_ordinal_) { + // Already initialized + return Status::kSuccess; + } + + // Update SM count member + cudart_result = cudaDeviceGetAttribute( + &device_sms_, cudaDevAttrMultiProcessorCount, current_ordinal); + if (cudart_result != cudaSuccess) { + CUTLASS_TRACE_HOST(" cudaDeviceGetAttribute() returned error " + << cudaGetErrorString(cudart_result)); + return Status::kErrorInternal; + } + + // If requires more than 48KB: configure for extended, dynamic shared memory + if constexpr (kSharedStorageSize >= (48 << 10)) { + cudart_result = cudaFuncSetAttribute( + Kernel2, cudaFuncAttributeMaxDynamicSharedMemorySize, + kSharedStorageSize); + if (cudart_result != cudaSuccess) { + CUTLASS_TRACE_HOST(" cudaFuncSetAttribute() returned error " + << cudaGetErrorString(cudart_result)); + return Status::kErrorInternal; + } + } + + // Update SM occupancy member + cudart_result = cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + &sm_occupancy_, Kernel2, GemmKernel::kThreadCount, + kSharedStorageSize, cudaOccupancyDisableCachingOverride); + if (cudart_result != cudaSuccess) { + CUTLASS_TRACE_HOST( + " cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags() returned " + "error " + << cudaGetErrorString(cudart_result)); + return Status::kErrorInternal; + } + + // Update device ordinal member on success + device_ordinal_ = current_ordinal; + + CUTLASS_TRACE_HOST( + " " + "device_ordinal: (" + << device_ordinal_ + << "), " + "device_sms: (" + << device_sms_ + << "), " + "sm_occupancy: (" + << sm_occupancy_ + << ") " + "smem_size: (" + << kSharedStorageSize + << ") " + "GemmKernel::kThreadCount: (" + << GemmKernel::kThreadCount << ")"); + + return Status::kSuccess; + } + + protected: + // + // Instance data members + // + + /// Kernel parameters + typename GemmKernel::Params params_; + + /// Initialize params member + Status init_params(Arguments const& args, + CudaHostAdapter* cuda_adapter = nullptr) { + int32_t device_sms = 0; + int32_t sm_occupancy = 0; + + if constexpr (kEnableCudaHostAdapter) { + CUTLASS_ASSERT(cuda_adapter); + + // + // Occupancy query using CudaHostAdapter::query_occupancy(). 
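+      // The adapter supplies the SM count and kernel occupancy, so this path
+      // avoids querying the device runtime directly.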
+ // + + if (cuda_adapter) { + Status status = cuda_adapter->query_occupancy( + &device_sms, &sm_occupancy, kGemmKernelIndex, + GemmKernel::kThreadCount, kSharedStorageSize); + + CUTLASS_ASSERT(status == Status::kSuccess); + + if (status != Status::kSuccess) { + return status; + } + } else { + return Status::kErrorInternal; + } + } else { + CUTLASS_ASSERT(cuda_adapter == nullptr); + + // Initialize static device properties, if necessary + Status result = init_device_props(); + + if (result != Status::kSuccess) { + return result; + } + + // + // Use thread-local static members for occupancy query initialized by call + // to `init_device_props()` + // + + device_sms = device_sms_; + sm_occupancy = sm_occupancy_; + } + + // Initialize params member + params_ = typename GemmKernel::Params(args, device_sms, sm_occupancy); + return Status::kSuccess; + } + + public: + //--------------------------------------------------------------------------------------------- + // Stateless API + //--------------------------------------------------------------------------------------------- + + /// Determines whether the GEMM can execute the given problem. + static Status can_implement(Arguments const& args, + CudaHostAdapter* cuda_adapter = nullptr) { + CUTLASS_TRACE_HOST("GemmUniversalBase::can_implement()"); + + if (!kEnableCudaHostAdapter || cuda_adapter) { + dim3 grid = get_grid_shape(args, cuda_adapter); + + if (!(grid.y <= std::numeric_limits::max() && + grid.z <= std::numeric_limits::max())) { + return Status::kErrorInvalidProblem; + } + } else { + // + // With a null host adapter, a conservative grid shape is computed and + // required to conform to CUDA grid dimension limits. + // + + int64_t logicalGridM = + (int64_t(args.problem_size.m()) + ThreadblockShape::kM - 1) / + ThreadblockShape::kM; + int64_t logicalGridN = + (int64_t(args.problem_size.n()) + ThreadblockShape::kN - 1) / + ThreadblockShape::kN; + int32_t logicalGridL = args.batch_count; + + if ((int64_t(std::numeric_limits::max()) < logicalGridM) || + (int64_t(std::numeric_limits::max()) < logicalGridN) || + (int32_t(std::numeric_limits::max()) < logicalGridL)) { + return Status::kErrorInvalidProblem; + } + } + + return GemmKernel::can_implement(args); + } + + /// Returns the workspace size (in bytes) needed for the problem + /// geometry expressed by these arguments + static size_t get_workspace_size(Arguments const& args, + CudaHostAdapter* cuda_adapter = nullptr) { + CUTLASS_TRACE_HOST("GemmUniversalBase::get_workspace_size()"); + + // Initialize parameters from args + GemmUniversalBase base; + if (base.init_params(args, cuda_adapter) != Status::kSuccess) { + return 0; + } + + // Get size from parameters + size_t workspace_bytes = base.params_.get_workspace_size(); + + CUTLASS_TRACE_HOST(" workspace_bytes: " << workspace_bytes); + return workspace_bytes; + } + + /// Returns the grid extents in thread blocks to launch + static dim3 get_grid_shape(Arguments const& args, + CudaHostAdapter* cuda_adapter = nullptr) { + CUTLASS_TRACE_HOST("GemmUniversalBase::get_grid_shape()"); + + // Initialize parameters from args + GemmUniversalBase base; + if (base.init_params(args, cuda_adapter) != Status::kSuccess) { + return dim3(0, 0, 0); + } + + // Get dims from parameters + dim3 grid_dims = base.params_.get_grid_dims(); + + CUTLASS_TRACE_HOST(" tiled_shape: " + << base.params_.get_tiled_shape() << "\n" + << " grid_dims: {" << grid_dims << "}"); + + return grid_dims; + } + + /// Returns the maximum number of active thread blocks per multiprocessor + 
static int maximum_active_blocks(CudaHostAdapter* cuda_adapter = nullptr) { + CUTLASS_TRACE_HOST("GemmUniversalBase::maximum_active_blocks()"); + + int32_t device_sms = 0; + int32_t sm_occupancy = 0; + + if constexpr (kEnableCudaHostAdapter) { + CUTLASS_ASSERT(cuda_adapter); + + if (cuda_adapter) { + Status status = cuda_adapter->query_occupancy( + &device_sms, &sm_occupancy, kGemmKernelIndex, + GemmKernel::kThreadCount, kSharedStorageSize); + + CUTLASS_ASSERT(status == Status::kSuccess); + + if (status != Status::kSuccess) { + return -1; + } + } else { + return -1; + } + } else { + CUTLASS_ASSERT(cuda_adapter == nullptr); + // Initialize static device properties, if necessary + if (init_device_props() != Status::kSuccess) { + return -1; + } + + sm_occupancy = sm_occupancy_; + } + + CUTLASS_TRACE_HOST(" max_active_blocks: " << sm_occupancy_); + return sm_occupancy; + } + + //--------------------------------------------------------------------------------------------- + // Stateful API + //--------------------------------------------------------------------------------------------- + + /// Initializes GEMM state from arguments and workspace memory + Status initialize(Arguments const& args, void* workspace = nullptr, + cudaStream_t stream = nullptr, + CudaHostAdapter* cuda_adapter = nullptr) { + CUTLASS_TRACE_HOST("GemmUniversalBase::initialize() - workspace " + << workspace + << ", stream: " << (stream ? "non-null" : "null")); + + // Initialize parameters from args + Status result = init_params(args, cuda_adapter); + if (result != Status::kSuccess) { + return result; + } + + // Assign and prepare workspace memory + if (args.mode == GemmUniversalMode::kGemm) { + return params_.init_workspace(workspace, stream); + } + + return Status::kSuccess; + } + + /// Lightweight update given a subset of arguments. + Status update(Arguments const& args) { + CUTLASS_TRACE_HOST("GemmUniversalBase()::update()"); + params_.update(args); + return Status::kSuccess; + } + + /// Runs the kernel using initialized state. + Status run(cudaStream_t stream = nullptr, + CudaHostAdapter* cuda_adapter = nullptr) { + CUTLASS_TRACE_HOST("GemmUniversalBase::run()"); + + // Configure grid and block dimensions + dim3 block(GemmKernel::kThreadCount, 1, 1); + dim3 grid = params_.get_grid_dims(); + + // Launch kernel + CUTLASS_TRACE_HOST( + " " + "grid: (" + << grid + << "), " + "block: (" + << block + << "), " + "SMEM: (" + << kSharedStorageSize << ")"); + + cutlass::arch::synclog_setup(); + + if constexpr (kEnableCudaHostAdapter) { + CUTLASS_ASSERT(cuda_adapter); + if (cuda_adapter) { + void* kernel_params[] = {¶ms_}; + return cuda_adapter->launch(grid, block, kSharedStorageSize, stream, + kernel_params, 0); + } else { + return Status::kErrorInternal; + } + } else { + CUTLASS_ASSERT(cuda_adapter == nullptr); + +#if defined(CUTLASS_ENABLE_SYCL) + const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z); + const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z); + + sycl::queue q = stream ? 
*stream : syclcompat::get_default_queue(); + syclcompat::experimental::launch>( + syclcompat::experimental::launch_policy{ + sycl_grid, sycl_block, + #if defined(SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY) + sycl::ext::oneapi::experimental::work_group_scratch_size( + kSharedStorageSize) + #else + syclcompat::experimental::local_mem_size{ + static_cast(kSharedStorageSize)} + #endif + }, + q, params_); +#else + Kernel2<<>>(params_); +#endif + + // Query for errors + cudaError_t result = cudaGetLastError(); + if (result != cudaSuccess) { + CUTLASS_TRACE_HOST(" grid launch failed with error " + << cudaGetErrorString(result)); + return Status::kErrorInternal; + } + } + + return Status::kSuccess; + } + + /// Runs the kernel using initialized state. + Status operator()(cudaStream_t stream = nullptr, + CudaHostAdapter* cuda_adapter = nullptr) { + return run(stream, cuda_adapter); + } + + /// Runs the kernel using initialized state. + Status operator()(Arguments const& args, void* workspace = nullptr, + cudaStream_t stream = nullptr, + CudaHostAdapter* cuda_adapter = nullptr) { + Status status = initialize(args, workspace, stream, cuda_adapter); + + if (status == Status::kSuccess) { + status = run(stream, cuda_adapter); + } + + return status; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Static initializers +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Device ordinal +template +CUTLASS_THREAD_LOCAL int GemmUniversalBase::device_ordinal_ = -1; + +/// Device SM count +template +CUTLASS_THREAD_LOCAL int GemmUniversalBase::device_sms_ = -1; + +/// Kernel SM occupancy (in thread blocks) +template +CUTLASS_THREAD_LOCAL int GemmUniversalBase::sm_occupancy_ = -1; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace device +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/xpu/cutlass_kernels/collective/gemm/gemm_universal_k.h b/csrc/xpu/cutlass_kernels/collective/gemm/gemm_universal_k.h new file mode 100644 index 0000000..19871ee --- /dev/null +++ b/csrc/xpu/cutlass_kernels/collective/gemm/gemm_universal_k.h @@ -0,0 +1,649 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/arch/arch.h" +#include "cutlass/fast_math.h" +#include "cutlass/matrix_coord.h" +#include "cutlass/complex.h" +#include "cutlass/semaphore.h" +#include "gemm_universal.hpp" + +#include "cutlass/layout/matrix.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/kernel/params_universal_base.h" +#include "cutlass/trace.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +class GemmUniversal< + Mma_, Epilogue_, ThreadblockSwizzle_, void, + // 3.x kernels use the first template argument to define the ProblemShape + // We use this invariant to SFINAE dispatch against either the 2.x API or + // the 3.x API + cute::enable_if_t::value || + IsCutlass3ArrayKernel::value)>> { + public: + using Mma = Mma_; + using Epilogue = Epilogue_; + using EpilogueOutputOp = typename Epilogue::OutputOp; + using ThreadblockSwizzle = ThreadblockSwizzle_; + + using ElementA = typename Mma::IteratorA::Element; + using LayoutA = typename Mma::IteratorA::Layout; + using ElementB = typename Mma::IteratorB::Element; + using LayoutB = typename Mma::IteratorB::Layout; + using ElementC = typename Epilogue::OutputTileIterator::Element; + using LayoutC = typename Epilogue::OutputTileIterator::Layout; + + static ComplexTransform const kTransformA = Mma::kTransformA; + static ComplexTransform const kTransformB = Mma::kTransformB; + using Operator = typename Mma::Operator; + + using OperatorClass = typename Mma::Operator::OperatorClass; + using ThreadblockShape = typename Mma::Shape; + using WarpShape = typename Mma::Operator::Shape; + using InstructionShape = typename Mma::Policy::Operator::InstructionShape; + using ArchTag = typename Mma::ArchTag; + + static int const kStages = Mma::kStages; + static int const kAlignmentA = Mma::IteratorA::AccessType::kElements; + static int const kAlignmentB = Mma::IteratorB::AccessType::kElements; + static int const kAlignmentC = + Epilogue::OutputTileIterator::kElementsPerAccess; + + /// Warp count (concept: GemmShape) + using WarpCount = typename Mma::WarpCount; + static int const kThreadCount = 32 * WarpCount::kCount; + + /// Split-K preserves splits that are 128b aligned + static int const kSplitKAlignment = const_max( + 128 / sizeof_bits::value, 128 / sizeof_bits::value); + + // + // Structures + // + + /// Argument structure + struct Arguments : UniversalArgumentsBase { + // + // 
Data members + // + + typename EpilogueOutputOp::Params epilogue; + + void const* ptr_A; + void const* ptr_B; + void const* ptr_C; + void* ptr_D; + + int64_t batch_stride_A; + int64_t batch_stride_B; + int64_t batch_stride_C; + + typename LayoutA::Stride stride_a; + typename LayoutB::Stride stride_b; + typename LayoutC::Stride stride_c; + typename LayoutC::Stride stride_d; + + typename LayoutA::Stride::LongIndex lda; + typename LayoutB::Stride::LongIndex ldb; + typename LayoutC::Stride::LongIndex ldc; + typename LayoutC::Stride::LongIndex ldd; + + int const* ptr_gather_A_indices; + int const* ptr_gather_B_indices; + int const* ptr_scatter_D_indices; + + // + // Methods + // + + Arguments() + : ptr_A(nullptr), + ptr_B(nullptr), + ptr_C(nullptr), + ptr_D(nullptr), + ptr_gather_A_indices(nullptr), + ptr_gather_B_indices(nullptr), + ptr_scatter_D_indices(nullptr) {} + + /// constructs an arguments structure + Arguments(GemmUniversalMode mode, GemmCoord problem_size, int batch_count, + typename EpilogueOutputOp::Params epilogue, void const* ptr_A, + void const* ptr_B, void const* ptr_C, void* ptr_D, + int64_t batch_stride_A, int64_t batch_stride_B, + int64_t batch_stride_C, int64_t batch_stride_D, + typename LayoutA::Stride stride_a, + typename LayoutB::Stride stride_b, + typename LayoutC::Stride stride_c, + typename LayoutC::Stride stride_d, + int const* ptr_gather_A_indices = nullptr, + int const* ptr_gather_B_indices = nullptr, + int const* ptr_scatter_D_indices = nullptr) + : UniversalArgumentsBase(mode, problem_size, batch_count, + batch_stride_D), + epilogue(epilogue), + ptr_A(ptr_A), + ptr_B(ptr_B), + ptr_C(ptr_C), + ptr_D(ptr_D), + batch_stride_A(batch_stride_A), + batch_stride_B(batch_stride_B), + batch_stride_C(batch_stride_C), + stride_a(stride_a), + stride_b(stride_b), + stride_c(stride_c), + stride_d(stride_d), + ptr_gather_A_indices(ptr_gather_A_indices), + ptr_gather_B_indices(ptr_gather_B_indices), + ptr_scatter_D_indices(ptr_scatter_D_indices) { + lda = 0; + ldb = 0; + ldc = 0; + ldd = 0; + CUTLASS_TRACE_HOST( + "GemmUniversal::Arguments::Arguments() - problem_size: " + << problem_size); + } + + /// constructs an arguments structure + Arguments(GemmUniversalMode mode, GemmCoord problem_size, int batch_count, + typename EpilogueOutputOp::Params epilogue, void const* ptr_A, + void const* ptr_B, void const* ptr_C, void* ptr_D, + int64_t batch_stride_A, int64_t batch_stride_B, + int64_t batch_stride_C, int64_t batch_stride_D, + typename LayoutA::Stride::LongIndex lda, + typename LayoutB::Stride::LongIndex ldb, + typename LayoutC::Stride::LongIndex ldc, + typename LayoutC::Stride::LongIndex ldd, + int const* ptr_gather_A_indices = nullptr, + int const* ptr_gather_B_indices = nullptr, + int const* ptr_scatter_D_indices = nullptr) + : UniversalArgumentsBase(mode, problem_size, batch_count, + batch_stride_D), + epilogue(epilogue), + ptr_A(ptr_A), + ptr_B(ptr_B), + ptr_C(ptr_C), + ptr_D(ptr_D), + batch_stride_A(batch_stride_A), + batch_stride_B(batch_stride_B), + batch_stride_C(batch_stride_C), + lda(lda), + ldb(ldb), + ldc(ldc), + ldd(ldd), + ptr_gather_A_indices(ptr_gather_A_indices), + ptr_gather_B_indices(ptr_gather_B_indices), + ptr_scatter_D_indices(ptr_scatter_D_indices) { + stride_a = make_Coord(lda); + stride_b = make_Coord(ldb); + stride_c = make_Coord(ldc); + stride_d = make_Coord(ldd); + CUTLASS_TRACE_HOST( + "GemmUniversal::Arguments::Arguments() - problem_size: " + << problem_size); + } + + /// Returns arguments for the transposed problem + Arguments transposed_problem() 
const { + Arguments args(*this); + + std::swap(args.problem_size.m(), args.problem_size.n()); + std::swap(args.ptr_A, args.ptr_B); + std::swap(args.lda, args.ldb); + std::swap(args.stride_a, args.stride_b); + std::swap(args.batch_stride_A, args.batch_stride_B); + std::swap(args.ptr_gather_A_indices, args.ptr_gather_B_indices); + + return args; + } + }; + + // + // Structure for precomputing values in host memory and passing to kernels + // + + /// Parameters structure + struct Params + : UniversalParamsBase { + using ParamsBase = + UniversalParamsBase; + + // + // Data members + // + + typename Mma::IteratorA::Params params_A; + typename Mma::IteratorB::Params params_B; + typename Epilogue::OutputTileIterator::Params params_C; + typename Epilogue::OutputTileIterator::Params params_D; + + typename EpilogueOutputOp::Params output_op; + + void* ptr_A; + void* ptr_B; + void* ptr_C; + void* ptr_D; + + int64_t batch_stride_A; + int64_t batch_stride_B; + int64_t batch_stride_C; + + int* ptr_gather_A_indices; + int* ptr_gather_B_indices; + int* ptr_scatter_D_indices; + + // + // Host dispatch API + // + + /// Default constructor + Params() = default; + + /// Constructor + Params(Arguments const& args, /// GEMM application arguments + int device_sms, /// Number of SMs on the device + int sm_occupancy) /// Kernel SM occupancy (in thread blocks) + : ParamsBase(args, device_sms, sm_occupancy), + params_A(args.lda + ? make_Coord_with_padding(args.lda) + : args.stride_a), + params_B(args.ldb + ? make_Coord_with_padding(args.ldb) + : args.stride_b), + params_C(args.ldc + ? make_Coord_with_padding(args.ldc) + : args.stride_c), + params_D(args.ldd + ? make_Coord_with_padding(args.ldd) + : args.stride_d), + output_op(args.epilogue), + ptr_A(const_cast(args.ptr_A)), + ptr_B(const_cast(args.ptr_B)), + ptr_C(const_cast(args.ptr_C)), + ptr_D(args.ptr_D), + batch_stride_A(args.batch_stride_A), + batch_stride_B(args.batch_stride_B), + batch_stride_C(args.batch_stride_C), + ptr_gather_A_indices(const_cast(args.ptr_gather_A_indices)), + ptr_gather_B_indices(const_cast(args.ptr_gather_B_indices)), + ptr_scatter_D_indices(const_cast(args.ptr_scatter_D_indices)) {} + + /// Lightweight update given a subset of arguments. + void update(Arguments const& args) { + CUTLASS_TRACE_HOST("GemmUniversal::Params::update()"); + + // Update input/output pointers + ptr_A = const_cast(args.ptr_A); + ptr_B = const_cast(args.ptr_B); + ptr_C = const_cast(args.ptr_C); + ptr_D = args.ptr_D; + + batch_stride_A = args.batch_stride_A; + batch_stride_B = args.batch_stride_B; + batch_stride_C = args.batch_stride_C; + this->batch_stride_D = args.batch_stride_D; + + ptr_gather_A_indices = const_cast(args.ptr_gather_A_indices); + ptr_gather_B_indices = const_cast(args.ptr_gather_B_indices); + ptr_scatter_D_indices = const_cast(args.ptr_scatter_D_indices); + + output_op = args.epilogue; + } + }; + + /// Shared memory storage structure + union SharedStorage { + typename Mma::SharedStorage main_loop; + typename Epilogue::SharedStorage epilogue; + }; + + public: + // + // Host dispatch API + // + + /// Determines whether kernel satisfies alignment + static Status can_implement(cutlass::gemm::GemmCoord const& problem_size) { + CUTLASS_TRACE_HOST("GemmUniversal::can_implement()"); + + static int const kAlignmentA = + (cute::is_same>::value) ? 32 + : (cute::is_same>::value) + ? 64 + : Mma::IteratorA::AccessType::kElements; + static int const kAlignmentB = + (cute::is_same>::value) ? 32 + : (cute::is_same>::value) + ? 
64 + : Mma::IteratorB::AccessType::kElements; + static int const kAlignmentC = + (cute::is_same>::value) ? 32 + : (cute::is_same>::value) + ? 64 + : Epilogue::OutputTileIterator::kElementsPerAccess; + + bool isAMisaligned = false; + bool isBMisaligned = false; + bool isCMisaligned = false; + + if (cute::is_same::value) { + isAMisaligned = problem_size.k() % kAlignmentA; + } else if (cute::is_same::value) { + isAMisaligned = problem_size.m() % kAlignmentA; + } else if (cute::is_same>::value || + cute::is_same>::value) { + isAMisaligned = problem_size.k() % kAlignmentA; + } + + if (cute::is_same::value) { + isBMisaligned = problem_size.n() % kAlignmentB; + } else if (cute::is_same::value) { + isBMisaligned = problem_size.k() % kAlignmentB; + } else if (cute::is_same>::value || + cute::is_same>::value) { + isBMisaligned = problem_size.k() % kAlignmentB; + } + + if (cute::is_same::value) { + isCMisaligned = problem_size.n() % kAlignmentC; + } else if (cute::is_same::value) { + isCMisaligned = problem_size.m() % kAlignmentC; + } else if (cute::is_same>::value || + cute::is_same>::value) { + isCMisaligned = problem_size.n() % kAlignmentC; + } + + if (isAMisaligned) { + CUTLASS_TRACE_HOST(" returning kErrorMisalignedOperand for A operand"); + return Status::kErrorMisalignedOperand; + } + + if (isBMisaligned) { + CUTLASS_TRACE_HOST(" returning kErrorMisalignedOperand for B operand"); + return Status::kErrorMisalignedOperand; + } + + if (isCMisaligned) { + CUTLASS_TRACE_HOST(" returning kErrorMisalignedOperand for C operand"); + return Status::kErrorMisalignedOperand; + } + + CUTLASS_TRACE_HOST(" returning kSuccess"); + + return Status::kSuccess; + } + + static Status can_implement(Arguments const& args) { + return can_implement(args.problem_size); + } + + public: + // + // Device-only API + // + + // Factory invocation + CUTLASS_DEVICE + static void invoke(Params const& params, SharedStorage& shared_storage) { + GemmUniversal op; + op(params, shared_storage); + } + + /// Executes one GEMM + CUTLASS_DEVICE + void operator()(Params const& params, SharedStorage& shared_storage) { + ThreadblockSwizzle threadblock_swizzle; + run_with_swizzle(params, shared_storage, threadblock_swizzle); + } + + /// Executes one GEMM with an externally-provided swizzling function + CUTLASS_DEVICE + void run_with_swizzle(Params const& params, SharedStorage& shared_storage, + ThreadblockSwizzle& threadblock_swizzle) { + cutlass::gemm::GemmCoord threadblock_tile_offset = + threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); + + // Early exit if CTA is out of range + if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() || + params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) { + return; + } + + int offset_k = 0; + int problem_size_k = params.problem_size.k(); + + ElementA* ptr_A = static_cast(params.ptr_A); + ElementB* ptr_B = static_cast(params.ptr_B); + + // + // Fetch pointers based on mode. 
+ // + if (params.mode == GemmUniversalMode::kGemm || + params.mode == GemmUniversalMode::kGemmSplitKParallel) { + if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) { + problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size; + } + + offset_k = threadblock_tile_offset.k() * params.gemm_k_size; + } else if (params.mode == GemmUniversalMode::kBatched) { + ptr_A += threadblock_tile_offset.k() * params.batch_stride_A; + ptr_B += threadblock_tile_offset.k() * params.batch_stride_B; + } else if (params.mode == GemmUniversalMode::kArray) { + ptr_A = static_cast( + params.ptr_A)[threadblock_tile_offset.k()]; + ptr_B = static_cast( + params.ptr_B)[threadblock_tile_offset.k()]; + } + + syncthreads(); + + // Compute initial location in logical coordinates + cutlass::MatrixCoord tb_offset_A{ + threadblock_tile_offset.m() * Mma::Shape::kM, + offset_k, + }; + + cutlass::MatrixCoord tb_offset_B{ + offset_k, threadblock_tile_offset.n() * Mma::Shape::kN}; + + // Compute position within threadblock + int thread_idx = ThreadIdxX(); + + // Construct iterators to A and B operands + typename Mma::IteratorA iterator_A( + params.params_A, ptr_A, {params.problem_size.m(), problem_size_k}, + thread_idx, tb_offset_A, params.ptr_gather_A_indices); + + typename Mma::IteratorB iterator_B( + params.params_B, ptr_B, {problem_size_k, params.problem_size.n()}, + thread_idx, tb_offset_B, params.ptr_gather_B_indices); + + // Broadcast the warp_id computed by lane 0 to ensure dependent code + // is compiled as warp-uniform. + int warp_idx = canonical_warp_idx_sync(); + + int lane_idx = ThreadIdxX() % 32; + + // + // Main loop + // + + // Construct thread-scoped matrix multiply + Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx); + + typename Mma::FragmentC accumulators; + + accumulators.clear(); + + // Compute threadblock-scoped matrix multiply-add + int gemm_k_iterations = + (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK; + + // Compute threadblock-scoped matrix multiply-add + mma(gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators); + + // + // Epilogue + // + + EpilogueOutputOp output_op(params.output_op); + + // + // Masked tile iterators constructed from members + // + + threadblock_tile_offset = + threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); + + // assume identity swizzle + MatrixCoord threadblock_offset( + threadblock_tile_offset.m() * Mma::Shape::kM, + threadblock_tile_offset.n() * Mma::Shape::kN); + + int block_idx = threadblock_tile_offset.m() + + threadblock_tile_offset.n() * params.grid_tiled_shape.m(); + + ElementC* ptr_C = static_cast(params.ptr_C); + ElementC* ptr_D = static_cast(params.ptr_D); + + // + // Fetch pointers based on mode. + // + + // Construct the semaphore. + Semaphore semaphore(params.semaphore + block_idx, thread_idx); + + if (params.mode == GemmUniversalMode::kGemm) { + // If performing a reduction via split-K, fetch the initial + // synchronization + if (params.grid_tiled_shape.k() > 1) { + // Fetch the synchronization lock initially but do not block. 
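+        // The fetched value is consumed later by semaphore.wait(), before the
+        // epilogue reads the partial results of the previous k-partition.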
+ semaphore.fetch(); + + // Indicate which position in a serial reduction the output operator is + // currently updating + output_op.set_k_partition(threadblock_tile_offset.k(), + params.grid_tiled_shape.k()); + } + } else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) { + ptr_D += threadblock_tile_offset.k() * params.batch_stride_D; + } else if (params.mode == GemmUniversalMode::kBatched) { + ptr_C += threadblock_tile_offset.k() * params.batch_stride_C; + ptr_D += threadblock_tile_offset.k() * params.batch_stride_D; + } else if (params.mode == GemmUniversalMode::kArray) { + ptr_C = static_cast( + params.ptr_C)[threadblock_tile_offset.k()]; + ptr_D = static_cast( + params.ptr_D)[threadblock_tile_offset.k()]; + } + + // Tile iterator loading from source tensor. + typename Epilogue::OutputTileIterator iterator_C( + params.params_C, ptr_C, params.problem_size.mn(), thread_idx, + threadblock_offset, params.ptr_scatter_D_indices); + + // Tile iterator writing to destination tensor. + typename Epilogue::OutputTileIterator iterator_D( + params.params_D, ptr_D, params.problem_size.mn(), thread_idx, + threadblock_offset, params.ptr_scatter_D_indices); + + Epilogue epilogue(shared_storage.epilogue, thread_idx, warp_idx, lane_idx); + + // Wait on the semaphore - this latency may have been covered by iterator + // construction + if (params.mode == GemmUniversalMode::kGemm && + params.grid_tiled_shape.k() > 1) { + // For subsequent threadblocks, the source matrix is held in the 'D' + // tensor. + if (threadblock_tile_offset.k()) { + iterator_C = iterator_D; + } + + semaphore.wait(threadblock_tile_offset.k()); + } + + // Execute the epilogue operator to update the destination tensor. + epilogue(output_op, iterator_D, accumulators, iterator_C); + + // + // Release the semaphore + // + + if (params.mode == GemmUniversalMode::kGemm && + params.grid_tiled_shape.k() > 1) { + int lock = 0; + if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) { + // The final threadblock resets the semaphore for subsequent grids. + lock = 0; + } else { + // Otherwise, the semaphore is incremented + lock = threadblock_tile_offset.k() + 1; + } + + semaphore.release(lock); + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/xpu/cutlass_kernels/collective/gemm/xe_array_epilogue.hpp b/csrc/xpu/cutlass_kernels/collective/gemm/xe_array_epilogue.hpp new file mode 100644 index 0000000..bd49242 --- /dev/null +++ b/csrc/xpu/cutlass_kernels/collective/gemm/xe_array_epilogue.hpp @@ -0,0 +1,562 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing elementwise operations used by epilogues. +*/ + +#pragma once + +#include +#include "cutlass/cutlass.h" +#include "cutlass/epilogue/dispatch_policy.hpp" +// #include "cutlass/epilogue/collective/collective_epilogue.hpp" +#include "cutlass/epilogue/collective/detail.hpp" +#include "cutlass/epilogue/fusion/callbacks.hpp" +#include "cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp" +#include "cutlass/epilogue/fusion/xe_visitor_softmax.hpp" +#include "cutlass/detail/layout.hpp" + +#include "cute/tensor.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue::collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +class CollectiveEpilogue { + static_assert(cutlass::detail::dependent_false, + "Could not find an epilogue specialization."); +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::epilogue::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +class CollectiveEpilogue { + public: + // + // Type Aliases + // + using DispatchPolicy = IntelXeXMX16Group; + using CtaTileMNK = CtaTileMNK_; + using FusionCallbacks = FusionCallbacks_; + using ElementC = ElementC_; + using ElementAccumulator = ElementC_; + using StrideC = StrideC_; + using InternalStrideC = cute::remove_pointer_t; + using ElementD = ElementD_; + using StrideD = StrideD_; + using InternalStrideD = cute::remove_pointer_t; + using CopyOpG2R = CopyOpG2R_; + using SmemLayoutAtomC = SmemLayoutAtomC_; + using CopyOpS2R = CopyOpS2R_; + using CopyOpR2G = CopyOpR2G_; + using SmemLayoutAtomD = SmemLayoutAtomD_; + using CopyOpR2S = CopyOpR2S_; + + using ThreadEpilogueOp = + typename fusion::FusionCallbacksTraits::Operation; + using GmemTiledCopyC = CopyOpG2R; + using GmemTiledCopyD = cute::conditional_t && + not cute::is_void_v, + CopyOpR2G, XE_2D_U32x8x16_ST_N>; + using ElementOutput = ElementD; + using ElementCompute = ElementAccumulator; + using 
ElementSource = typename FusionCallbacks::ElementSource; + using ElementScalar = typename FusionCallbacks::ElementScalar; + static constexpr FloatRoundStyle RoundStyle = + FloatRoundStyle::round_to_nearest; + + static_assert( + cute::is_same_v< + typename FusionCallbacks::Operation, + fusion::LinearCombination>, + "Only Linear Combination Epilogue is supported for Grouped GEMM at the " + "moment."); + + static constexpr int SubgroupSize = DispatchPolicy::SubgroupSize; + + static_assert(cute::rank(CtaTileMNK{}) == 3, + "CtaTileMNK must be rank-3: [CTA_M, CTA_N, CTA_K]"); + static_assert(cute::rank(InternalStrideC{}) == 3, + "StrideC must be rank-3: [M, N, L]"); + static_assert(cute::rank(InternalStrideD{}) == 3, + "StrideD must be rank-3: [M, N, L]"); + + static_assert(std::is_same_v, + "Copy operation to shared memory is not supported"); + static_assert(std::is_same_v, + "Copy operation to shared memory is not supported"); + static_assert(std::is_same_v, + "Copy operation to shared memory is not supported"); + static_assert(std::is_same_v, + "Copy operation to shared memory is not supported"); + + using CopyThreadShape = Shape<_1, Int>; + using Trait_C = Copy_Traits; + using XE_Copy_C = decltype(make_tiled_copy( + Copy_Atom{}, Layout{}, + make_layout( + shape_div(typename Trait_C::BlockShape{}, CopyThreadShape{})))); + using Trait_D = Copy_Traits; + using XE_Copy_D = decltype(make_tiled_copy( + Copy_Atom{}, Layout{}, + make_layout( + shape_div(typename Trait_D::BlockShape{}, CopyThreadShape{})))); + + private: + // constexpr static bool is_source_supported = not cute::is_void_v; + constexpr static bool is_source_supported = false; + constexpr static bool is_destination_supported = + not cute::is_void_v && not cute::is_void_v; + + public: + using EmptyType = cute::tuple<>; + using SmemCStorage = EmptyType; + using SmemDStorage = EmptyType; + + struct TensorStorageImpl : cute::tuple { + using FusionStorage = typename FusionCallbacks::SharedStorage; + FusionStorage thread; + }; + + struct SharedStorage { + using TensorStorage = TensorStorageImpl; + + TensorStorage tensors; + }; + using TensorStorage = typename SharedStorage::TensorStorage; + + using TensorC = + decltype(make_tensor(make_gmem_ptr(static_cast(nullptr)), + make_shape(0, 0, 0), InternalStrideC{})); //(m, n) + using TensorD = + decltype(make_tensor(make_gmem_ptr(static_cast(nullptr)), + make_shape(0, 0, 0), InternalStrideD{})); //(m, n) + using EpilogueTensors = cute::tuple; + + // Host side epilogue arguments + struct Arguments { + typename FusionCallbacks::Arguments thread{}; + ElementC const** ptr_C; + StrideC dC; + ElementD** ptr_D; + StrideD dD; + }; + + // Device side epilogue params + struct Params { + typename FusionCallbacks::Params thread{}; + XE_Copy_C xe_load_c; + XE_Copy_D xe_store_d; + ElementC const** ptr_C; + StrideC dC; + ElementD** ptr_D; + StrideD dD; + }; + + // + // Methods + // + + template + static constexpr Params to_underlying_arguments( + ProblemShape const& problem_shape, Arguments const& args, + [[maybe_unused]] void* workspace) { + // Optionally append 1s until problem shape is rank-4 in case its is only + // rank-3 (MNK) + auto problem_shape_MNL = repeat_like( + typename ProblemShape::UnderlyingProblemShape{}, int32_t(1)); + auto [M, N, L] = problem_shape_MNL; + + XE_Copy_C xe_load_c = {}; + if constexpr (is_source_supported) { + ElementC const* ptr_C_first_batch = + reinterpret_cast(args.ptr_C); + TensorC mC_mnl = + make_tensor(make_gmem_ptr(ptr_C_first_batch), + make_layout(make_shape(M, N, L), 
InternalStrideC{})); + xe_load_c = {xe_load_c.with(mC_mnl)}; + } + + XE_Copy_D xe_store_d = {}; + if constexpr (is_destination_supported) { + ElementD* ptr_D_first_batch = reinterpret_cast(args.ptr_D); + TensorD mD_mnl = + make_tensor(make_gmem_ptr(ptr_D_first_batch), + make_layout(make_shape(M, N, L), InternalStrideD{})); + xe_store_d = {xe_store_d.with(mD_mnl)}; + } + + return {FusionCallbacks::to_underlying_arguments(problem_shape, args.thread, + workspace), + xe_load_c, + xe_store_d, + args.ptr_C, + args.dC, + args.ptr_D, + args.dD}; + } + + template + static size_t get_workspace_size(ProblemShape const& problem_shape, + Arguments const& args) { + return 0; + } + + template + static cutlass::Status initialize_workspace( + ProblemShape const& problem_shape, Arguments const& args, void* workspace, + cudaStream_t stream, CudaHostAdapter* cuda_adapter = nullptr) { + return Status::kSuccess; + } + + template + static bool can_implement(ProblemShape problem_shape, Arguments const& args) { + constexpr int copy_alignment_bits = 128; + constexpr int batch_alignment_bits = 512; + + bool implementable = true; + bool fusion_implementable = true; + + for (int i = 0; i < problem_shape.groups(); ++i) { + auto problem_shape_MNKL = + append<4>(problem_shape.get_host_problem_shape(i), 1); + auto [M, N, K, L] = problem_shape_MNKL; + + if constexpr (is_destination_supported) { + constexpr int min_aligned_elements_D = + copy_alignment_bits / sizeof_bits::value; + implementable &= + cutlass::detail::check_alignment( + cute::make_shape(M, N, L), InternalStrideD{}); + if (L > 1) { + constexpr int min_batch_aligned_elements_D = + batch_alignment_bits / sizeof_bits::value; + implementable &= + get<2>(InternalStrideD{}) % min_batch_aligned_elements_D == 0; + } + } + + if constexpr (is_source_supported) { + constexpr int min_aligned_elements_C = + copy_alignment_bits / sizeof_bits::value; + implementable &= + cutlass::detail::check_alignment( + cute::make_shape(M, N, L), InternalStrideC{}); + if (L > 1) { + constexpr int min_batch_aligned_elements_C = + batch_alignment_bits / sizeof_bits::value; + implementable &= + get<2>(InternalStrideC{}) % min_batch_aligned_elements_C == 0; + } + } + + fusion_implementable = + fusion_implementable && + FusionCallbacks::can_implement(problem_shape_MNKL, args.thread); + } + + if (!implementable) { + CUTLASS_TRACE_HOST( + " CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment " + "requirements for XE 2D copy.\n"); + } + + if (!fusion_implementable) { + CUTLASS_TRACE_HOST( + " CAN IMPLEMENT: Problem Size doesn't meet the minimum requirements " + "for FusionCallbacks.\n"); + } + + return implementable && fusion_implementable; + } + + CUTLASS_HOST_DEVICE + CollectiveEpilogue(Params const& params_, + TensorStorage const& shared_storage_) + : params(params_), + fusion_callbacks(params_.thread, shared_storage_.thread) {} + + CUTLASS_DEVICE + bool is_producer_load_needed() const { + return fusion_callbacks.is_producer_load_needed(); + } + + template + CUTLASS_DEVICE void operator()(ProblemShapeMNKL problem_shape_mnkl, + TileShapeMNK tile_shape_MNK, + TileCoordMNKL tile_coord_mnkl, + Accumulator accumulators, TiledMma tiled_mma, + int thread_idx, + LoadStoreTensor const& load_store_tensors) { + (void)tiled_mma; + using namespace cute; + + static_assert(cute::rank(CtaTileMNK{}) == 3, + "CtaTileMNK must be rank-3: [CTA_M, CTA_N, CTA_K]"); + static_assert(cute::rank(InternalStrideC{}) == 3, + "StrideC must be rank-3: [M, N, L]"); + static_assert(cute::rank(InternalStrideD{}) == 
3, + "StrideD must be rank-3: [M, N, L]"); + + using MmaAtomShape = typename TiledMma::AtomShape_MNK; + static constexpr auto BLK_M = get<0>(CtaTileMNK{}); + static constexpr auto BLK_N = get<1>(CtaTileMNK{}); + static constexpr auto BLK_K = get<2>(CtaTileMNK{}); + // static_assert(is_same_v, + // "assertion fail"); + static constexpr auto ATOM_M = + get<1>(typename TiledMma::ThrLayoutVMNK{}.shape()); + static constexpr auto ATOM_N = + get<2>(typename TiledMma::ThrLayoutVMNK{}.shape()); + static constexpr auto ATOM_K = + get<3>(typename TiledMma::ThrLayoutVMNK{}.shape()); + + static_assert( + BLK_M % ATOM_M == 0 && BLK_N % ATOM_N == 0 && BLK_K % ATOM_K == 0, + "expected CTATileMNK to be evenly divided by TiledMma::ThrLayoutVMNK"); + static constexpr auto SG_M = BLK_M / ATOM_M; + static constexpr auto SG_N = BLK_N / ATOM_N; + static constexpr auto SG_K = BLK_K / ATOM_K; + using SubgroupTileShape = + Shape; + + static constexpr int FragsM = + get<0>(SubgroupTileShape{}) / + get<0>(MmaAtomShape()); // A frags per sub_group + static constexpr int FragsN = + get<1>(SubgroupTileShape{}) / + get<1>(MmaAtomShape()); // B frags per sub_group + + static constexpr int FragmentSize = + (get<0>(MmaAtomShape()) * get<1>(MmaAtomShape())) / SubgroupSize; + + // Indexing variables + auto [M, N, K, L] = problem_shape_mnkl; + auto [m_coord, n_coord, k_coord, l_coord] = tile_coord_mnkl; + auto m_sg = get_sub_group_id() / ATOM_N; + auto n_sg = get_sub_group_id() % ATOM_N; + + // Get the layout and reconstruct the MN mapping equivalent to the old + // get_layoutS_MN() + auto layoutS_TV = params.xe_store_d.get_layoutS_TV(); + auto mn_shape = shape(typename decltype(params.xe_store_d)::Tiler_MN{}); + auto layoutS_MN = right_inverse(layoutS_TV).with_shape(mn_shape); + using EpilogueTile = decltype(layoutS_MN.shape()); + + auto sg_local_m_coord = get_sub_group_id() / ATOM_N; + auto sg_local_n_coord = get_sub_group_id() % ATOM_N; + + auto sg_m_coord = m_coord * ATOM_M + sg_local_m_coord; + auto sg_n_coord = n_coord * ATOM_N + sg_local_n_coord; + auto sg_coord = make_coord(sg_m_coord, sg_n_coord, k_coord, l_coord); + + bool is_C_load_needed = + is_source_supported && fusion_callbacks.is_C_load_needed(); + + // Represent the full output tensor + Tensor mD_mnl = cute::get_xe_tensor(make_shape(M, N, L)); + + // Tile the output tensor per WG and select the tile for current WG + Tensor g_wg_D = + local_tile(mD_mnl, take<0, 2>(CtaTileMNK{}), + make_coord(m_coord, n_coord, l_coord)); // (BLK_M,BLK_N) + + // Tile the output tensor per SG and select tile for the current SG + Tensor gD = local_tile(g_wg_D, take<0, 2>(SubgroupTileShape{}), + make_coord(m_sg, n_sg)); // (SG_M,SG_N) + + auto thread_xe_store_d = params.xe_store_d.get_thread_slice(thread_idx); + Tensor tCgD = thread_xe_store_d.partition_D(gD); + + Tensor trC = + make_tensor(Shape>{}); + Tensor trD_compute = + make_tensor(Shape>{}); + + // Because Sm90 uses shared memory, they are not tied to using the same + // accumulator values for MMA and Epilogue. But because we are operating + // directly in the accumulators, we need to be sure that we are operating on + // the same values. 
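+    // Per-thread slice of the C-load copy, used below to partition the
+    // coordinate tensors for out-of-bounds predication.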
+ ThrCopy thread_g2r = params.xe_load_c.get_slice(thread_idx); + + // OOB predication for tile quantization "residue" + // Absolute coordinate tensors (dynamic) + Tensor mD_crd = make_identity_tensor(make_shape(M, N)); // (M,N) + Tensor cD = local_tile(mD_crd, take<0, 2>(SubgroupTileShape{}), + make_coord(sg_m_coord, sg_n_coord)); + Tensor cD_mn = local_tile(mD_crd, take<0, 2>(CtaTileMNK{}), + make_coord(m_coord, n_coord)); // (CTA_M,CTA_N) + Tensor tRS_cD_mn = thread_g2r.partition_S( + flat_divide(cD_mn, EpilogueTile{})); // (G2R,G2R_M,G2R_N,EPI_M,EPI_N) + Tensor tRS_cD = + make_coord_tensor(tRS_cD_mn.layout()); // (G2R,G2R_M,G2R_N,EPI_M,EPI_N) + + // Get the fusion callbacks + // Arguments passed here relate to sub-group tiles, rather than CTA + // (work-group) tiles + constexpr bool RefSrc = true; + auto residue_mn = make_coord(M, N); // TODO(Codeplay): this is not correct + auto cst_args = cutlass::epilogue::fusion::detail::ConsumerStoreArgs{ + problem_shape_mnkl, + SubgroupTileShape{}, + sg_coord, + tiled_mma, + EpilogueTile{}, + params.xe_store_d, + cD, + residue_mn, + tRS_cD, + residue_mn, + trC, + thread_idx, + }; + auto cst_callbacks = + fusion_callbacks.template get_consumer_store_callbacks( + cst_args); + + cst_callbacks.begin(); + + auto acc_frag = recast>(accumulators); + auto trD_compute_frag = + recast>(trD_compute); + + Tensor trD = make_tensor(Shape>{}); + auto trD_frag = recast>(trD); + + constexpr int ValuesLoaded = FragsM * FragsN * FragmentSize * SubgroupSize * + ATOM_M * ATOM_N * ATOM_K; + constexpr int MN = get<0>(CtaTileMNK{}) * get<1>(CtaTileMNK{}); + static_assert( + ValuesLoaded == MN, + "the total elements loaded by all threads should be the same as MxN"); + + auto synchronize = [&]() {}; + CUTLASS_PRAGMA_UNROLL + for (int epi_n = 0; epi_n < FragsN; epi_n++) { + CUTLASS_PRAGMA_UNROLL + for (int epi_m = 0; epi_m < FragsM; epi_m++) { + if (is_C_load_needed) { + // coordinates for C and D are the same + copy(params.xe_load_c.with(get<0>(load_store_tensors)), + tCgD(_, epi_m, epi_n), trC); + } + + cst_callbacks.previsit(epi_m, epi_n, 0, is_C_load_needed); + + auto acc_frag_mn = acc_frag(_, epi_m, epi_n); + + CUTLASS_PRAGMA_UNROLL + for (int epi_v = 0; epi_v < size<0>(trD_compute_frag); ++epi_v) { + trD_compute_frag(epi_v) = + cst_callbacks.visit(acc_frag_mn(epi_v), epi_v, epi_m, epi_n); + } + cst_callbacks.reduce(nullptr, synchronize, epi_m, epi_n, + (epi_m == FragsM - 1 && epi_n == FragsN - 1), + trD_compute_frag); + + if constexpr (is_destination_supported) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(trD_compute_frag); ++i) { + trD_frag(i) = + cutlass::NumericArrayConverter{}( + trD_compute_frag(i)); + } + copy(params.xe_store_d.with(get<1>(load_store_tensors)), trD, + tCgD(_, epi_m, epi_n)); + } + } + } + + cst_callbacks.end(); + } + + template + CUTLASS_DEVICE auto update_tensor_shape_stride( + int32_t const& next_group, ProblemShape_MNKL const& problem_shape_mnkl) { + auto [M, N, K, L] = problem_shape_mnkl; + + TensorC mC_mnl; + TensorD mD_mnl; + if constexpr (is_source_supported) { + ElementC const* ptr_C_curr_batch = + reinterpret_cast(params.ptr_C[next_group]); + mC_mnl = + make_tensor(make_gmem_ptr(ptr_C_curr_batch), + make_layout(make_shape(M, N, L), params.dC[next_group])); + } + + if constexpr (is_destination_supported) { + ElementD* ptr_D_curr_batch = + reinterpret_cast(params.ptr_D[next_group]); + mD_mnl = + make_tensor(make_gmem_ptr(ptr_D_curr_batch), + make_layout(make_shape(M, N, L), params.dD[next_group])); + } + return 
    cute::make_tuple(mC_mnl, mD_mnl);
+  }
+
+ private:
+  Params const& params;
+  FusionCallbacks fusion_callbacks;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace collective
+}  // namespace epilogue
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/csrc/xpu/cutlass_kernels/collective/gemm/xe_array_mma.hpp b/csrc/xpu/cutlass_kernels/collective/gemm/xe_array_mma.hpp
new file mode 100644
index 0000000..a2abb4b
--- /dev/null
+++ b/csrc/xpu/cutlass_kernels/collective/gemm/xe_array_mma.hpp
@@ -0,0 +1,360 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ * + **************************************************************************************************/ +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/dispatch_policy.hpp" + +#include "cute/algorithm/functional.hpp" +#include "cute/atom/mma_atom.hpp" +#include "cute/algorithm/gemm.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::collective { +using namespace cute; +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct CollectiveMma, TileShape_, + ElementA_, StrideA_, ElementB_, StrideB_, TiledMma_, + GmemTiledCopyA_, SmemLayoutAtomA_, SmemCopyAtomA_, + TransformA_, GmemTiledCopyB_, SmemLayoutAtomB_, + SmemCopyAtomB_, TransformB_> { + // + // Type Aliases + // + using DispatchPolicy = MainloopIntelXeXMX16Group; + using WorkgroupTileShape = TileShape_; + using ElementA = ElementA_; + using StrideA = StrideA_; + using InternalStrideA = cute::remove_pointer_t; + using ElementB = ElementB_; + using StrideB = StrideB_; + using InternalStrideB = cute::remove_pointer_t; + using TiledMma = TiledMma_; + using ElementAccumulator = typename TiledMma::ValTypeC; + using GmemTiledCopyA = GmemTiledCopyA_; + using GmemTiledCopyB = GmemTiledCopyB_; + using SmemLayoutAtomA = SmemLayoutAtomA_; + using SmemLayoutAtomB = SmemLayoutAtomB_; + using SmemCopyAtomA = SmemCopyAtomA_; + using SmemCopyAtomB = SmemCopyAtomB_; + using TransformA = TransformA_; + using TransformB = TransformB_; + using ArchTag = typename DispatchPolicy::ArchTag; + + static_assert( + platform::is_same::value, + "MainloopIntelXeXMX16Array requires that A and B have same type."); + + static_assert(std::is_same_v, + "Transformation for A is not currently supported on Intel PVC"); + static_assert(std::is_same_v, + "Transformation for B is not currently supported on Intel PVC"); + + static constexpr int SubgroupSize = DispatchPolicy::SubgroupSize; + + using MmaAtomShape = typename TiledMma::AtomShape_MNK; + + static constexpr auto BLK_M = get<0>(WorkgroupTileShape{}); + static constexpr auto BLK_N = get<1>(WorkgroupTileShape{}); + static constexpr auto BLK_K = get<2>(WorkgroupTileShape{}); + + static constexpr auto ATOM_M = + get<1>(typename TiledMma::ThrLayoutVMNK{}.shape()); + static constexpr auto ATOM_N = + get<2>(typename TiledMma::ThrLayoutVMNK{}.shape()); + static constexpr auto ATOM_K = + get<3>(typename TiledMma::ThrLayoutVMNK{}.shape()); + + static constexpr auto SG_M = ceil_div(BLK_M, ATOM_M); + static constexpr auto SG_N = ceil_div(BLK_N, ATOM_N); + static constexpr auto SG_K = ceil_div(BLK_K, ATOM_K); + using SubgroupTileShape = + Shape; + + static constexpr auto Num_SGs = ATOM_N * ATOM_M * ATOM_K; + static constexpr uint32_t MaxThreadsPerBlock = size(TiledMma{}); + + using Copy_A = typename Copy_Traits< + GmemTiledCopyA, InternalStrideA>::template DefaultTiledCopy; + using Copy_B = typename Copy_Traits< + GmemTiledCopyB, InternalStrideB>::template DefaultTiledCopy; + + using TensorMKL = + decltype(make_tensor(make_gmem_ptr(static_cast(nullptr)), + make_shape(0, 0, 0), InternalStrideA{})); //(m, k) + using TensorNKL = + decltype(make_tensor(make_gmem_ptr(static_cast(nullptr)), + make_shape(0, 0, 0), InternalStrideB{})); //(n, k) + using MainloopTensors = cute::tuple; + // Host side kernel arguments + struct Arguments { + ElementA const** ptr_A; + StrideA dA; + ElementB const** ptr_B; + StrideB dB; + }; + + struct Params { + ElementA const** ptr_A; + StrideA dA; 
+ ElementB const** ptr_B; + StrideB dB; + }; + + // + // Methods + // + + CollectiveMma() = default; + + template + static constexpr Params to_underlying_arguments( + ProblemShape const& problem_shape, Arguments const& args, + void* workspace) { + (void)workspace; + + auto problem_shape_MNK = repeat_like( + typename ProblemShape::UnderlyingProblemShape{}, int32_t(1)); + ; + auto init_M = get<0>(problem_shape_MNK); + auto init_N = get<1>(problem_shape_MNK); + auto init_K = get<2>(problem_shape_MNK); + + return Params{args.ptr_A, args.dA, args.ptr_B, args.dB}; + } + + template + static bool can_implement(ProblemShape problem_shapes, + Arguments const& args) { + constexpr int copy_alignment_bits = 128; + constexpr int batch_alignment_bits = 512; + auto problem_shape_MNKL = append<4>(problem_shapes, 1); + auto [M, N, K, L] = problem_shape_MNKL; + + bool implementable = true; + + constexpr int min_aligned_elements_A = + copy_alignment_bits / sizeof_bits::value; + constexpr int min_aligned_elements_B = + copy_alignment_bits / sizeof_bits::value; + constexpr int min_batch_aligned_elements_A = + batch_alignment_bits / sizeof_bits::value; + constexpr int min_batch_aligned_elements_B = + batch_alignment_bits / sizeof_bits::value; + for (int i = 0; i < problem_shapes.groups(); i++) { + auto problem_shape_MNKL = + append<4>(problem_shapes.get_host_problem_shape(i), 1); + auto [M, N, K, L] = problem_shape_MNKL; + + implementable &= cutlass::detail::check_alignment( + cute::make_shape(M, K, L), InternalStrideA{}); + implementable &= cutlass::detail::check_alignment( + cute::make_shape(N, K, L), InternalStrideB{}); + + if (L > 1) { + implementable &= + get<2>(InternalStrideA{}) % min_batch_aligned_elements_A == 0; + implementable &= + get<2>(InternalStrideB{}) % min_batch_aligned_elements_B == 0; + } + } + + if (!implementable) { + CUTLASS_TRACE_HOST( + " CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment " + "requirements for XE 2D copy.\n"); + } + + return implementable; + } + + /// Perform a subgroup-scoped matrix multiply-accumulate + template + CUTLASS_DEVICE void operator()(FrgTensorD& accum, TensorA gA, TensorB gB, + FrgTensorC const& src_accum, + KTileIterator k_tile_iter, + int const& k_tile_count, + BlkCoord const& blk_coord, int const& K_start, + int const& thread_idx, Params const& mainloop, + LoadTensors const& load_tensors) { + static_assert(is_rmem::value, + "D tensor must be rmem resident."); + static_assert(is_rmem::value, + "C tensor must be rmem resident."); + + (void)thread_idx; + + Copy_A tiled_copy_a{Copy_A{}.with(get<0>(load_tensors))}; + Copy_B tiled_copy_b{Copy_B{}.with(get<1>(load_tensors))}; + + auto thr_copy_A = tiled_copy_a.get_slice(thread_idx); + auto thr_copy_B = tiled_copy_b.get_slice(thread_idx); + + // Instantiate the MMA object and get thread slice + TiledMma tiled_mma; + // TODO(Codeplay): see if we can make this nicer + // To make all work items in a subgroup have the same global tensors pass in + // the index of work item 0 in each subgroup + auto sg = syclcompat::get_nd_item<1>().get_sub_group(); + auto first_thread_in_sg_idx = + sg.get_group_linear_id() * DispatchPolicy::SubgroupSize; + auto thr_mma = tiled_mma.get_slice(first_thread_in_sg_idx); + + // Partition global counting tensors for MMA + Tensor tCgA = thr_mma.partition_A(gA); + Tensor tCgB = thr_mma.partition_B(gB); + + Tensor tCrA = make_tensor( + make_fragment_layout(tiled_copy_a, tCgA(_, _, _, 0).shape())); + Tensor tCrB = make_tensor( + make_fragment_layout(tiled_copy_b, tCgB(_, _, _, 
0).shape())); + + // Retile registers for copies + Tensor tArA = thr_copy_A.retile_D(tCrA); + Tensor tBrB = thr_copy_B.retile_D(tCrB); + + // Retile global counting tensors for copies + Tensor tAgA = thr_copy_A.retile_S(tCgA); + Tensor tBgB = thr_copy_B.retile_S(tCgB); + + auto tiled_prefetch_a = + cute::prefetch_selector, Int>, Num_SGs>( + tiled_copy_a); + auto tiled_prefetch_b = + cute::prefetch_selector, Int>, Num_SGs>( + tiled_copy_b); + auto thr_prefetch_A = tiled_prefetch_a.get_slice(thread_idx); + auto thr_prefetch_B = tiled_prefetch_b.get_slice(thread_idx); + + // Partition global tile for prefetch + auto pAgA = thr_prefetch_A.partition_S(gA); + auto pBgB = thr_prefetch_B.partition_S(gB); + +#if CUTLASS_ENABLE_DEBUG_PRINTS + if (cutlass::thread(LOG_THREAD, LOG_GROUP)) { + print("======================= A: \n"); + print(" gA : "); + print(gA); + print("\n"); + print("tCgA : "); + print(tCgA); + print("\n"); + print("tAgA : "); + print(tAgA); + print("\n"); + + print("===================== B :\n"); + print(" gB : "); + print(gB); + print("\n"); + print("tCgB : "); + print(tCgB); + print("\n"); + print("tBgB : "); + print(tBgB); + print("\n"); + + print("===================== Config: \n"); + print(" threads per workgroup : "); + print(MaxThreadsPerBlock); + print("\n"); + print(" SubgroupTileShape : "); + print(SubgroupTileShape{}); + print("\n"); + } +#endif + + // + // Mainloop + // + const auto k_start_idx = crd2idx((*k_tile_iter), make_shape(K_start)); + constexpr int barrier_scope = 2; + int prefetch_k = k_start_idx; + + CUTLASS_PRAGMA_UNROLL + for (; prefetch_k < DispatchPolicy::Stages; prefetch_k++) { + prefetch(tiled_prefetch_a, pAgA(_, _, _, prefetch_k)); + prefetch(tiled_prefetch_b, pBgB(_, _, _, prefetch_k)); + } + + for (int k_tile = k_start_idx; k_tile < k_tile_count + k_start_idx; + k_tile++, prefetch_k++) { + barrier_arrive(barrier_scope); + // Copy gmem to rmem for the first k_tile + copy(tiled_copy_a, tAgA(_, _, _, k_tile), tArA); + copy(tiled_copy_b, tBgB(_, _, _, k_tile), tBrB); + + if (prefetch_k < k_tile_count) { + prefetch(tiled_prefetch_a, pAgA(_, _, _, prefetch_k)); + prefetch(tiled_prefetch_b, pBgB(_, _, _, prefetch_k)); + } + + cute::gemm(tiled_mma, tCrA, tCrB, accum); + barrier_wait(barrier_scope); + } + } + + template + CUTLASS_DEVICE auto update_tensor_shape_stride( + Params const& mainloop_params, int32_t const& next_group, + ProblemShape_MNKL const& problem_shape_mnkl) { + const int32_t M = get<0>(problem_shape_mnkl); + const int32_t N = get<1>(problem_shape_mnkl); + const int32_t K = get<2>(problem_shape_mnkl); + + ElementA const* ptr_A_curr_batch = + reinterpret_cast(mainloop_params.ptr_A[next_group]); + ElementB const* ptr_B_curr_batch = + reinterpret_cast(mainloop_params.ptr_B[next_group]); + + Tensor mA = make_tensor(make_gmem_ptr(ptr_A_curr_batch), + make_shape(M, K, (int32_t)1), + mainloop_params.dA[next_group]); + Tensor mB = make_tensor(make_gmem_ptr(ptr_B_curr_batch), + make_shape(N, K, (int32_t)1), + mainloop_params.dB[next_group]); + + return cute::make_tuple(mA, mB); + } +}; + +} // namespace cutlass::gemm::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/xpu/cutlass_kernels/collective/gemm/xe_builder.hpp b/csrc/xpu/cutlass_kernels/collective/gemm/xe_builder.hpp new file mode 100644 index 0000000..ca749c3 --- /dev/null +++ b/csrc/xpu/cutlass_kernels/collective/gemm/xe_builder.hpp @@ -0,0 +1,234 @@ 
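For context, the can_implement() checks in the mainloop above boil down to the arithmetic below: the XE 2D block copies require 128-bit row alignment and a 512-bit aligned batch stride, which translates into a minimum element count per element type. The 16-bit element width used here is an assumption chosen only to make the numbers concrete.

// Standalone illustration of the alignment arithmetic used by can_implement().
#include <cstdio>

int main() {
  constexpr int copy_alignment_bits = 128;
  constexpr int batch_alignment_bits = 512;
  constexpr int bits_per_element = 16;  // e.g. fp16 / bf16 (assumption)

  // The contiguous dimension of A/B must be a multiple of this.
  constexpr int min_aligned_elements = copy_alignment_bits / bits_per_element;        // 8
  // The L (batch) stride must be a multiple of this when L > 1.
  constexpr int min_batch_aligned_elements = batch_alignment_bits / bits_per_element; // 32

  std::printf("16-bit types: row alignment %d elements, batch-stride alignment %d elements\n",
              min_aligned_elements, min_batch_aligned_elements);
}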
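The mainloop's prefetch pipelining can also be summarized with a scalar model: prime the pipeline with Stages prefetches, then keep prefetching Stages k-tiles ahead of each copy/MMA pair. The sketch below deliberately omits the split-barrier (barrier_arrive/barrier_wait) calls and all real data movement, and the stage count of 2 is an assumption; it only shows the ordering of operations.

// Minimal scalar model of the software-pipelined k-loop above.
#include <cstdio>

constexpr int Stages = 2;  // assumption: the dispatch policy's stage count

void prefetch_tile(int k) { std::printf("prefetch k-tile %d\n", k); }
void copy_tile(int k)     { std::printf("copy     k-tile %d\n", k); }
void mma_tile(int k)      { std::printf("mma      k-tile %d\n", k); }

int main() {
  const int k_tile_count = 6;

  int prefetch_k = 0;
  for (; prefetch_k < Stages; ++prefetch_k)  // prime the pipeline
    prefetch_tile(prefetch_k);

  for (int k_tile = 0; k_tile < k_tile_count; ++k_tile, ++prefetch_k) {
    copy_tile(k_tile);                       // gmem -> registers for this k-tile
    if (prefetch_k < k_tile_count)
      prefetch_tile(prefetch_k);             // stay `Stages` tiles ahead
    mma_tile(k_tile);                        // accumulate into the C fragments
  }
}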
+/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include +#include // cute::DefaultCopy +#include // cute::is_base_of_v +// #include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "xe_array_epilogue.hpp" +#include "xe_callbacks.hpp" +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue::collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Used to specify epilogue subtile shape or dispatch to automatic computation +// of subtile shape +struct EpilogueTileAuto {}; + +// Used to let the builder pick the epilogue schedule automatically. 
+// Can be overridden with kernel schedule tags in +// cutlass/gemm/dispatch_policy.hpp +struct EpilogueScheduleAuto {}; + +template < + class ArchTag, class OpClass, class TileShape_MNK, class ClusterShape_MNK, + class EpilogueTileType, class ElementAccumulator, class ElementCompute, + class ElementC, class GmemLayoutTagC, int AlignmentC, class ElementD, + class GmemLayoutTagD, int AlignmentD, class EpilogueScheduleType, + class FusionOpOrCallbacks = cutlass::epilogue::fusion::LinearCombination< + ElementD, ElementCompute, ElementC, ElementCompute>, + class Enable = void> +struct CollectiveBuilder { + static_assert(cutlass::detail::dependent_false, + "Could not build a collective epilogue for given parameters."); +}; + +// helper sub-builder for epilogue fusion callbacks (for internal use by +// CollectiveBuilder only) +namespace detail { + +// callbacks builder with operation tag +template +struct CallbacksBuilder { + using Callbacks = fusion::FusionCallbacks; +}; + +// callbacks builder with callbacks passthrough +template +struct CallbacksBuilder>> { + using Callbacks = FusionCallbacks; +}; + +} // namespace detail + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::epilogue::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue::collective { + +namespace detail { +template +struct FusionOpInfo { + static_assert(cutlass::detail::dependent_false, + "Could not find a builder specialization."); +}; + +template +struct FusionOpInfo> { + constexpr static bool HasBuilder = true; + + template + using FusionCallbacks = cutlass::epilogue::fusion::FusionCallbacks< + DispatchPolicy, + cutlass::epilogue::fusion::LinearCombination, + TileShape_MNK, EpilogueTile>; +}; + +template