diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 09ed858f6a..ef584284f8 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -2,9 +2,9 @@ name: "CodeQL" on: push: - branches: [ "sycl-develop" ] + branches: [ "main" ] pull_request: - branches: [ "sycl-develop" ] + branches: [ "main" ] schedule: - cron: '23 5 * * 5' @@ -79,3 +79,10 @@ jobs: uses: github/codeql-action/analyze@2e230e8fe0ad3a14a340ad0815ddb96d599d2aff # v3.25.8 with: category: "/language:${{matrix.language}}" + + - name: Upload Sarif Artifact + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + with: + name: codeql-results-${{ matrix.language }} + path: ./results/${{ matrix.language }}.sarif + retention-days: 7 diff --git a/examples/04_bmg_grouped_gemm/04_bmg_grouped_gemm_bf16_output.cpp b/examples/04_bmg_grouped_gemm/04_bmg_grouped_gemm_bf16_output.cpp new file mode 100644 index 0000000000..f1bbb477a9 --- /dev/null +++ b/examples/04_bmg_grouped_gemm/04_bmg_grouped_gemm_bf16_output.cpp @@ -0,0 +1,659 @@ +/*************************************************************************************************** + * Copyright (c) 2025 - 2026 Intel Corporation. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief CUTLASS Intel BMG Group Gemm + + This example demonstrates fusing multiple GEMM operations into one kernel. + + Note that the scalar arguments to e.g. the standard 00_bmg_gemm example, + have been replaced with vector equivalents, as each individual GEMM has its + own inputs and outputs, which needn't be contiguous in memory. For example, + where 00_bmg_gemm receives an `ElementA *` defining Matrix A, grouped gemm + receives a `ElementA **`, i.e. a pointer to pointers, each pointing to a + distinct Matrix A. Likewise, each individual GEMM operation may have its own + alpha and beta factors for linear combination. 
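+    (These per-group scaling factors are likewise passed to the kernel as device arrays, one alpha and one beta entry per GEMM; see alpha_device/beta_device below.)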
This example demonstrates two + approaches: the user can provide `options.alpha` and `options.beta`, in which + case they will apply to all GEMMs; otherwise, random values are generated per + GEMM. + + Group GEMM scheduling (cutlass::gemm::GroupScheduler) is more complex than + standard GEMM, because each GEMM may have a unique size, only known at + runtime. Thus, the scheduler will distribute an a priori unknown number of + tiles to each work-group. See + include/cutlass/gemm/kernel/xe_gemm_array_cooperative.hpp for + implementation. + + Note that for simplicity, this example sets every GEMM in the group to the + same shape. + + Verification for this example is a conventional GEMM kernel, executed + iteratively per group. + + To build & run this example (from your build dir): + + $ ninja 04_bmg_grouped_gemm + $ ./examples/sycl/04_bmg_grouped_gemm/04_bmg_grouped_gemm + + Call with `--help` for information about available options. + + Note: the code may spill registers once compiled which will result in + sub-optimal performance. This is because of an issue inside Intel Graphics + Compiler (IGC) related to VectorAliasBBThreshold being debugged internally. + To avoid register spills, build the example by setting the environment + variable: $ export IGC_VectorAliasBBThreshold=10000 +*/ +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/epilogue/collective/xe_array_epilogue.hpp" +#include "cutlass/epilogue/fusion/xe_callbacks.hpp" +#include "cutlass/gemm/collective/collective_mma.hpp" +#include "cutlass/gemm/device/gemm_universal.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/group_array_problem_shape.hpp" +#include "cutlass/util/GPU_Clock.hpp" + +#include +#include + +#include "cutlass/util/command_line.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/reference/device/gemm_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "helper.h" +#include "sycl_common.hpp" + +#include + +using namespace cute; +using ProblemShape = + cutlass::gemm::GroupProblemShape>; // per group + +using ElementAccumulator = float; // <- data type of accumulator +using ElementComputeEpilogue = float; // <- data type of epilogue operations +using ElementA = bfloat16_t; // <- data type of elements in input matrix A +using ElementB = bfloat16_t; // <- data type of elements in input matrix B +using ElementOutput = bfloat16_t; // <- data type of elements in output matrix D + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#define CUTLASS_SYCL_PROFILING_ENABLED + +// Command line options parsing +struct Options { + + bool error = false; + bool help = false; + + float alpha, beta; + int iterations; + int m, n, k, groups; + std::vector problem_sizes_host; + + Options() + : error(false), help(false), alpha(FLT_MAX), beta(FLT_MAX), + iterations(100), m(5120), n(4096), k(4096), groups(2) { + problem_sizes_host.reserve(groups); + for (int i = 0; i < groups; i++) { + problem_sizes_host.push_back({m, n, k}); + } + } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("m", m, 5120); + cmd.get_cmd_line_argument("n", n, 4096); + cmd.get_cmd_line_argument("k", k, 4096); + 
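+    // Every GEMM in the group uses the same m/n/k in this example;
+    // problem_sizes_host below is simply filled with `groups` copies of {m, n, k}.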
cmd.get_cmd_line_argument("groups", groups, 2); + cmd.get_cmd_line_argument("alpha", alpha, 1.f); + cmd.get_cmd_line_argument("beta", beta, 0.f); + cmd.get_cmd_line_argument("iterations", iterations, 100); + + assert(groups > 0); + problem_sizes_host.clear(); + problem_sizes_host.reserve(groups); + for (int i = 0; i < groups; i++) { + problem_sizes_host.push_back({m, n, k}); + } + } + + /// Prints the usage statement. + std::ostream &print_usage(std::ostream &out) const { + + out << "BMG Grouped GEMM\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage " + "statement\n\n" + << " --m= Sets the M extent of the GEMM for " + "all groups\n" + << " --n= Sets the N extent of the GEMM for " + "all groups\n" + << " --k= Sets the K extent of the GEMM for " + "all groups\n" + << " --groups= Sets the number of individual GEMM " + "problems for Grouped GEMM\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n" + << " --iterations= Number of profiling iterations to " + "perform\n\n"; + + out << "\n\nExamples:\n\n" + << "$ " << "bmg_grouped_gemm" + << " --m=5120 --n=4096 --k=4096 --groups=5 --alpha=2.5 --beta=0.5 \n\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s, + std::vector + problem_sizes_host) const { + // Number of real-valued multiply-adds + uint64_t fmas = uint64_t(); + + for (auto const &problem : problem_sizes_host) { + fmas += static_cast(get<0>(problem)) * + static_cast(get<1>(problem)) * + static_cast(get<2>(problem)); + } + // Two flops per multiply-add + uint64_t flop = uint64_t(2) * uint64_t(fmas); + double gflop = double(flop) / double(1.0e9); + return gflop / runtime_s; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template struct ExampleRunner { + + using ElementA = typename Gemm::ElementA; + using ElementB = typename Gemm::ElementB; + using ElementC = typename Gemm::ElementC; + + using LayoutA = typename Gemm::LayoutA; + using LayoutB = typename Gemm::LayoutB; + using LayoutC = typename Gemm::LayoutC; + using LayoutD = typename Gemm::LayoutD; + + using CollectiveEpilogue = typename Gemm::CollectiveEpilogue; + using ElementOutput = bfloat16_t; + using ElementAccumulator = float_t; + + using StrideA = typename Gemm::GemmKernel::InternalStrideA; + using StrideB = typename Gemm::GemmKernel::InternalStrideB; + using StrideC = typename Gemm::GemmKernel::InternalStrideC; + using StrideD = typename Gemm::GemmKernel::InternalStrideD; + + // Host-side allocations + std::vector offset_A; + std::vector offset_B; + std::vector offset_C; + std::vector offset_D; + + std::vector stride_A_host; + std::vector stride_B_host; + std::vector stride_C_host; + std::vector stride_D_host; + + std::vector alpha_host; + std::vector beta_host; + + // Device-side allocations + cutlass::DeviceAllocation + problem_sizes; + + // This example defines all matrices in a single allocation (e.g. block_A), + // but this is not a requirement. Matrix base pointers are read from device + // allocation (e.g. 
ptr_A) + cutlass::DeviceAllocation block_A; + cutlass::DeviceAllocation block_B; + cutlass::DeviceAllocation block_C; + cutlass::DeviceAllocation block_D; + cutlass::DeviceAllocation block_ref_D; + + cutlass::DeviceAllocation ptr_A; + cutlass::DeviceAllocation ptr_B; + cutlass::DeviceAllocation ptr_C; + cutlass::DeviceAllocation ptr_D; + cutlass::DeviceAllocation ptr_ref_D; + + cutlass::DeviceAllocation stride_A; + cutlass::DeviceAllocation stride_B; + cutlass::DeviceAllocation stride_C; + cutlass::DeviceAllocation stride_D; + + // Note, this is an array of pointers to alpha and beta scaling values per + // group + cutlass::DeviceAllocation alpha_device; + cutlass::DeviceAllocation beta_device; + cutlass::DeviceAllocation block_alpha; + cutlass::DeviceAllocation block_beta; + + uint64_t seed = 0; + + // + // Methods + // + + bool verify(const Options &options) { + bool passed = true; + // Verify against individual reference GEMMs + for (int32_t i = 0; i < options.groups; ++i) { + auto problem = options.problem_sizes_host.at(i); + auto M = get<0>(problem); + auto N = get<1>(problem); + auto K = get<2>(problem); + cutlass::TensorRef ref_A(block_A.get() + offset_A.at(i), + LayoutA::packed({M, K})); + cutlass::TensorRef ref_B(block_B.get() + offset_B.at(i), + LayoutB::packed({K, N})); + cutlass::TensorRef ref_C(block_C.get() + offset_C.at(i), + LayoutC::packed({M, N})); + cutlass::TensorRef ref_D(block_ref_D.get() + offset_D.at(i), + LayoutD::packed({M, N})); + + // + // Compute reference output + // + cutlass::reference::device::GemmComplex( + {M, N, K}, alpha_host.at(i), ref_A, cutlass::ComplexTransform::kNone, + ref_B, cutlass::ComplexTransform::kNone, beta_host.at(i), ref_C, + ref_D, ElementAccumulator(0), + 1, // batch_count + M * K, // batch_stride_A + K * N, // batch_stride_B + M * N, // batch_stride_C + M * N // batch_stride_D + ); + + // Wait for kernel to finish + syclcompat::wait(); + + // Check if output from CUTLASS kernel and reference kernel are equal or + // not + passed &= cutlass::reference::device::BlockCompareEqual( + block_ref_D.get() + offset_D.at(i), block_D.get() + offset_D.at(i), + M * N); + if (!passed) + break; + } + return passed; + } + + /// Allocates device-side data + void allocate(const Options &options) { + int64_t total_elements_A = 0; + int64_t total_elements_B = 0; + int64_t total_elements_C = 0; + int64_t total_elements_D = 0; + + // Compute total allocation sizes across group + for (int32_t i = 0; i < options.groups; ++i) { + + auto problem = options.problem_sizes_host.at(i); + auto M = get<0>(problem); + auto N = get<1>(problem); + auto K = get<2>(problem); + + // Offset into block allocation of each matrix base pointer + offset_A.push_back(total_elements_A); + offset_B.push_back(total_elements_B); + offset_C.push_back(total_elements_C); + offset_D.push_back(total_elements_D); + + int64_t elements_A = M * K; + int64_t elements_B = K * N; + int64_t elements_C = M * N; + int64_t elements_D = M * N; + + total_elements_A += elements_A; + total_elements_B += elements_B; + total_elements_C += elements_C; + total_elements_D += elements_D; + + stride_A_host.push_back( + cutlass::make_cute_packed_stride(StrideA{}, {M, K, 1})); + stride_B_host.push_back( + cutlass::make_cute_packed_stride(StrideB{}, {N, K, 1})); + stride_C_host.push_back( + cutlass::make_cute_packed_stride(StrideC{}, {M, N, 1})); + stride_D_host.push_back( + cutlass::make_cute_packed_stride(StrideD{}, {M, N, 1})); + } + + block_A.reset(total_elements_A); + block_B.reset(total_elements_B); + 
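+    // Each block_X is one contiguous device allocation covering every group;
+    // initialize() later derives the per-group base pointers as
+    // block_X.get() + offset_X.at(i).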
block_C.reset(total_elements_C); + block_D.reset(total_elements_D); + block_ref_D.reset(total_elements_D); + block_alpha.reset(options.groups); + block_beta.reset(options.groups); + } + + /// Initialize operands to be used in the GEMM and reference GEMM + void initialize(const Options &options) { + + uint64_t seed = 2020; + + problem_sizes.reset(options.groups); + problem_sizes.copy_from_host(options.problem_sizes_host.data()); + + // + // Assign pointers + // + + std::vector ptr_A_host(options.groups); + std::vector ptr_B_host(options.groups); + std::vector ptr_C_host(options.groups); + std::vector ptr_D_host(options.groups); + std::vector ptr_alpha_host(options.groups); + std::vector ptr_beta_host(options.groups); + + // Compute offsets, alpha & beta over group on host + for (int32_t i = 0; i < options.groups; ++i) { + ptr_A_host.at(i) = block_A.get() + offset_A.at(i); + ptr_B_host.at(i) = block_B.get() + offset_B.at(i); + ptr_C_host.at(i) = block_C.get() + offset_C.at(i); + ptr_D_host.at(i) = block_D.get() + offset_D.at(i); + // Fill host vector of alpha & beta with random values if using per-group + // values + alpha_host.push_back( + (options.alpha == FLT_MAX) + ? static_cast((rand() % 5) + 1) + : options.alpha); + beta_host.push_back((options.beta == FLT_MAX) + ? static_cast(rand() % 5) + : options.beta); + // Fill host ptr vectors with offset addresses into device alpha/beta + // blocks + ptr_alpha_host.at(i) = block_alpha.get() + i; + ptr_beta_host.at(i) = block_beta.get() + i; + } + + // Allocate device memory & copy from host + ptr_A.reset(options.groups); + // Per-group alpha and beta + ptr_A.copy_from_host(ptr_A_host.data()); + + ptr_B.reset(options.groups); + ptr_B.copy_from_host(ptr_B_host.data()); + + ptr_C.reset(options.groups); + ptr_C.copy_from_host(ptr_C_host.data()); + + ptr_D.reset(options.groups); + ptr_D.copy_from_host(ptr_D_host.data()); + + stride_A.reset(options.groups); + stride_A.copy_from_host(stride_A_host.data()); + + stride_B.reset(options.groups); + stride_B.copy_from_host(stride_B_host.data()); + + stride_C.reset(options.groups); + stride_C.copy_from_host(stride_C_host.data()); + + stride_D.reset(options.groups); + stride_D.copy_from_host(stride_D_host.data()); + + // Per-group alpha and beta ptrs + alpha_device.reset(options.groups); + alpha_device.copy_from_host(ptr_alpha_host.data()); + beta_device.reset(options.groups); + beta_device.copy_from_host(ptr_beta_host.data()); + + initialize_block(block_A, seed + 2023); + initialize_block(block_B, seed + 2022); + initialize_block(block_C, seed + 2021); + // Per-group alpha and beta values - note these are not directly passed to + // kernel - the pointers (alpha_device/beta_device) are passed instead + block_alpha.copy_from_host(alpha_host.data()); + block_beta.copy_from_host(beta_host.data()); + } + + /// Populates a Gemm::Arguments structure from the given commandline options + typename Gemm::Arguments + args_from_options(const Options &options, + const cutlass::KernelHardwareInfo &hw_info, + bool host_problem_shapes_available = true) { + typename Gemm::Arguments arguments; + decltype(arguments.epilogue.thread) fusion_args; + + if (options.alpha != FLT_MAX && options.beta != FLT_MAX) { + // If both alpha/beta are provided (via cmd line args) and are scalar, + // i.e., same alpha/beta applies to all batches. 
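+      // In the scalar case the values are stored directly and every pointer
+      // array is left null; dAlpha/dBeta get a batch stride of 0 so all groups
+      // read the same alpha/beta. The per-group branch below instead passes the
+      // alpha_device/beta_device pointer arrays with a batch stride of 1.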
+ fusion_args.alpha = options.alpha; + fusion_args.beta = options.beta; + fusion_args.alpha_ptr = nullptr; + fusion_args.beta_ptr = nullptr; + fusion_args.alpha_ptr_array = nullptr; + fusion_args.beta_ptr_array = nullptr; + // Single alpha and beta for all groups + fusion_args.dAlpha = {cute::_0{}, cute::_0{}, 0}; + fusion_args.dBeta = {cute::_0{}, cute::_0{}, 0}; + } else { + // If pointers to alpha/beta are provided, i.e., alpha/beta can differ + // between batches/groups. + fusion_args.alpha = 0; + fusion_args.beta = 0; + fusion_args.alpha_ptr = nullptr; + fusion_args.beta_ptr = nullptr; + fusion_args.alpha_ptr_array = alpha_device.get(); + fusion_args.beta_ptr_array = beta_device.get(); + // One alpha and beta per each group + fusion_args.dAlpha = {cute::_0{}, cute::_0{}, 1}; + fusion_args.dBeta = {cute::_0{}, cute::_0{}, 1}; + } + using RasterOrderOptions = + typename cutlass::gemm::kernel::detail::PersistentTileSchedulerXeGroup< + ProblemShape>::RasterOrderOptions; + + // Per-GEMM problem shape info may only exist on the device. + if (host_problem_shapes_available) { + arguments = typename Gemm::Arguments{ + cutlass::gemm::GemmUniversalMode::kGrouped, + {options.groups, problem_sizes.get(), + options.problem_sizes_host.data()}, + {ptr_A.get(), stride_A.get(), ptr_B.get(), stride_B.get()}, + {fusion_args, ptr_C.get(), stride_C.get(), ptr_D.get(), + stride_D.get()}, + hw_info, + {1, RasterOrderOptions::AlongN}}; + } else { + arguments = typename Gemm::Arguments{ + cutlass::gemm::GemmUniversalMode::kGrouped, + {options.groups, problem_sizes.get(), nullptr}, + {ptr_A.get(), stride_A.get(), ptr_B.get(), stride_B.get()}, + {fusion_args, ptr_C.get(), stride_C.get(), ptr_D.get(), + stride_D.get()}, + hw_info, + {1, RasterOrderOptions::AlongN}}; + } + + return arguments; + } + + cutlass::Status run(const Options &options, + const cutlass::KernelHardwareInfo &hw_info, + bool host_problem_shapes_available = true) { + allocate(options); + initialize(options); + + Gemm gemm_op; + + auto arguments = + args_from_options(options, hw_info, host_problem_shapes_available); + + size_t workspace_size = Gemm::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + CUTLASS_CHECK(gemm_op.can_implement(arguments)); + + CUTLASS_CHECK(gemm_op.initialize(arguments, workspace.get())); + + // Run the GEMM + CUTLASS_CHECK(gemm_op.run()); + + syclcompat::wait(); + + // Verify that the result is correct + bool passed = verify(options); + std::cout << "Disposition: " << (passed ? 
"Passed" : "Failed") << std::endl; + + if (!passed) + return cutlass::Status::kErrorInternal; + + if (options.iterations > 0) { + GPU_Clock timer; + timer.start(); + for (int iter = 0; iter < options.iterations; ++iter) { + CUTLASS_CHECK(gemm_op.run()); + } + syclcompat::wait(); + + float cute_time = timer.seconds() * 1000; + double cute_average_time = double(cute_time) / double(options.iterations); + double gflops = options.gflops(cute_average_time / 1000.0, + options.problem_sizes_host); + + std::cout << " Problem Sizes, Alpha, Beta " << std::endl; + for (int32_t i = 0; i < options.groups; ++i) { + std::cout << " " << options.problem_sizes_host.at(i); + std::cout << ", " << alpha_host.at(i) << ", " << beta_host.at(i) + << std::endl; + } + std::cout << " Groups : " << options.groups << std::endl; + std::cout << " Avg runtime : " << cute_average_time << " ms" + << std::endl; + std::cout << " GFLOPS : " << gflops << std::endl; + } + + return cutlass::Status::kSuccess; + } +}; + +int main(int argc, const char **argv) { + // + // Parse options + // + + Options options; + + options.parse(argc, argv); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.error) { + std::cerr << "Aborting execution." << std::endl; + return -1; + } + + // + // Run examples + // + + // The KernelHardwareInfo struct holds the number of EUs on the GPU with a + // given device ID. This information is used by the underlying kernel. + cutlass::KernelHardwareInfo hw_info; + + // Change device_id to another value if you are running on a machine with + // multiple GPUs and wish to use a GPU other than that with device ID 0. + hw_info.sm_count = + cutlass::KernelHardwareInfo::query_device_multiprocessor_count( + hw_info.device_id); + + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::RowMajor; + using LayoutC = cutlass::layout::RowMajor; + using LayoutD = cutlass::layout::RowMajor; + + using GmemTiledCopyA = XE_2D_U16x32x32_LD_N; + using GmemTiledCopyB = XE_2D_U16x32x32_LD_V; + + // Workgroup-level tile + using TileShape = Shape<_256, _256, _32>; + + using TiledMma = + TiledMMA, + Layout, Stride<_4, _1, _0>>, + Tile, Stride<_1, _32, _8>>, + Layout, Stride<_1, _64, _16>>, _32>>; + + constexpr int PipelineStages = 2; + // Dispatch to grouped gemm algorithm + using GEMMDispatchPolicy = + cutlass::gemm::MainloopIntelXeXMX16Group; + using EpilogueDispatchPolicy = cutlass::epilogue::IntelXeXMX16Group; + + using EpilogueOp = + cutlass::epilogue::fusion::LinearCombination; + + using CollectiveEpilogue = + typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::IntelXe, cutlass::arch::OpClassTensorOp, TileShape, + Shape<_1, _1, _1>, cutlass::epilogue::collective::EpilogueTileAuto, + float, float, float, LayoutC, 1, ElementOutput, LayoutC, 1, + EpilogueDispatchPolicy, EpilogueOp>::CollectiveOp; + + // Mainloop + using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< + GEMMDispatchPolicy, TileShape, ElementA, + cutlass::gemm::TagToStrideA_t, ElementB, + cutlass::gemm::TagToStrideB_t, TiledMma, GmemTiledCopyA, void, + void, cute::identity, // A + GmemTiledCopyB, void, void, cute::identity // B + >; + + using GemmKernel = + cutlass::gemm::kernel::GemmUniversal; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + ExampleRunner runner; + + CUTLASS_CHECK(runner.run(options, hw_info)); + + return 0; +} diff --git a/examples/04_bmg_grouped_gemm/CMakeLists.txt b/examples/04_bmg_grouped_gemm/CMakeLists.txt index 
5640b750cf..347ebeda9b 100644
--- a/examples/04_bmg_grouped_gemm/CMakeLists.txt
+++ b/examples/04_bmg_grouped_gemm/CMakeLists.txt
@@ -36,7 +36,15 @@ cutlass_example_add_executable(
   TEST_GROUPS_2
   TEST_GROUPS_4
 )
+cutlass_example_add_executable(
+  04_bmg_grouped_gemm_bf16_output
+  04_bmg_grouped_gemm_bf16_output.cpp
+  TEST_COMMAND_OPTIONS
+  TEST_GROUPS_2
+  TEST_GROUPS_4
+)
 if(NOT DPCPP_SYCL_TARGET STREQUAL "spir64")
   # TODO(codeplay): Remove these once IGC VectorAliasThreshold issue is fixed
   target_link_options( 04_bmg_grouped_gemm PRIVATE -Xs "-options \"-igc_opts 'VectorAliasBBThreshold=10000'\"" )
+  target_link_options( 04_bmg_grouped_gemm_bf16_output PRIVATE -Xs "-options \"-igc_opts 'VectorAliasBBThreshold=10000'\"" )
 endif()
diff --git a/include/cutlass/epilogue/collective/builders/xe_builder.inl b/include/cutlass/epilogue/collective/builders/xe_builder.inl
index 809cede6f7..a4d8383a46 100644
--- a/include/cutlass/epilogue/collective/builders/xe_builder.inl
+++ b/include/cutlass/epilogue/collective/builders/xe_builder.inl
@@ -191,9 +191,8 @@ template <
   using SmemLayoutAtomD_ = void;
   using CopyOpR2S_ = void;
 
-  //TODO(Codeplay): Should FusionCallbacks use DispatchPolicy IntelXeGroupEpilogue for group gemm? That does not work.
   using FusionCallbacks = typename detail::FusionOpInfo::template FusionCallbacks<
-      IntelXeXMX16, TileShape_MNK, TileShape_MNK, CopyOpG2R>;
+      std::conditional_t, TileShape_MNK, TileShape_MNK, CopyOpG2R>;
 
   using CollectiveOp = cutlass::epilogue::collective::CollectiveEpilogue<
       DispatchPolicy,