diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 09ed858f6a..ef584284f8 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -2,9 +2,9 @@ name: "CodeQL" on: push: - branches: [ "sycl-develop" ] + branches: [ "main" ] pull_request: - branches: [ "sycl-develop" ] + branches: [ "main" ] schedule: - cron: '23 5 * * 5' @@ -79,3 +79,10 @@ jobs: uses: github/codeql-action/analyze@2e230e8fe0ad3a14a340ad0815ddb96d599d2aff # v3.25.8 with: category: "/language:${{matrix.language}}" + + - name: Upload Sarif Artifact + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + with: + name: codeql-results-${{ matrix.language }} + path: ./results/${{ matrix.language }}.sarif + retention-days: 7 diff --git a/examples/04_bmg_grouped_gemm/04_bmg_grouped_gemm_bf16_output.cpp b/examples/04_bmg_grouped_gemm/04_bmg_grouped_gemm_bf16_output.cpp new file mode 100644 index 0000000000..f1bbb477a9 --- /dev/null +++ b/examples/04_bmg_grouped_gemm/04_bmg_grouped_gemm_bf16_output.cpp @@ -0,0 +1,659 @@ +/*************************************************************************************************** + * Copyright (c) 2025 - 2026 Intel Corporation. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief CUTLASS Intel BMG Group Gemm + + This example demonstrates fusing multiple GEMM operations into one kernel. + + Note that the scalar arguments to e.g. the standard 00_bmg_gemm example, + have been replaced with vector equivalents, as each individual GEMM has its + own inputs and outputs, which needn't be contiguous in memory. For example, + where 00_bmg_gemm receives an `ElementA *` defining Matrix A, grouped gemm + receives a `ElementA **`, i.e. a pointer to pointers, each pointing to a + distinct Matrix A. Likewise, each individual GEMM operation may have its own + alpha and beta factors for linear combination. 
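+    (These per-group scaling factors are likewise passed to the kernel as device arrays, one alpha and one beta entry per GEMM; see alpha_device/beta_device below.)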
This example demonstrates two + approaches: the user can provide `options.alpha` and `options.beta`, in which + case they will apply to all GEMMs; otherwise, random values are generated per + GEMM. + + Group GEMM scheduling (cutlass::gemm::GroupScheduler) is more complex than + standard GEMM, because each GEMM may have a unique size, only known at + runtime. Thus, the scheduler will distribute an a priori unknown number of + tiles to each work-group. See + include/cutlass/gemm/kernel/xe_gemm_array_cooperative.hpp for + implementation. + + Note that for simplicity, this example sets every GEMM in the group to the + same shape. + + Verification for this example is a conventional GEMM kernel, executed + iteratively per group. + + To build & run this example (from your build dir): + + $ ninja 04_bmg_grouped_gemm + $ ./examples/sycl/04_bmg_grouped_gemm/04_bmg_grouped_gemm + + Call with `--help` for information about available options. + + Note: the code may spill registers once compiled which will result in + sub-optimal performance. This is because of an issue inside Intel Graphics + Compiler (IGC) related to VectorAliasBBThreshold being debugged internally. + To avoid register spills, build the example by setting the environment + variable: $ export IGC_VectorAliasBBThreshold=10000 +*/ +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/epilogue/collective/xe_array_epilogue.hpp" +#include "cutlass/epilogue/fusion/xe_callbacks.hpp" +#include "cutlass/gemm/collective/collective_mma.hpp" +#include "cutlass/gemm/device/gemm_universal.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/group_array_problem_shape.hpp" +#include "cutlass/util/GPU_Clock.hpp" + +#include +#include + +#include "cutlass/util/command_line.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/reference/device/gemm_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "helper.h" +#include "sycl_common.hpp" + +#include + +using namespace cute; +using ProblemShape = + cutlass::gemm::GroupProblemShape>; // per group + +using ElementAccumulator = float; // <- data type of accumulator +using ElementComputeEpilogue = float; // <- data type of epilogue operations +using ElementA = bfloat16_t; // <- data type of elements in input matrix A +using ElementB = bfloat16_t; // <- data type of elements in input matrix B +using ElementOutput = bfloat16_t; // <- data type of elements in output matrix D + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#define CUTLASS_SYCL_PROFILING_ENABLED + +// Command line options parsing +struct Options { + + bool error = false; + bool help = false; + + float alpha, beta; + int iterations; + int m, n, k, groups; + std::vector problem_sizes_host; + + Options() + : error(false), help(false), alpha(FLT_MAX), beta(FLT_MAX), + iterations(100), m(5120), n(4096), k(4096), groups(2) { + problem_sizes_host.reserve(groups); + for (int i = 0; i < groups; i++) { + problem_sizes_host.push_back({m, n, k}); + } + } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("m", m, 5120); + cmd.get_cmd_line_argument("n", n, 4096); + cmd.get_cmd_line_argument("k", k, 4096); + 
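+    // Every GEMM in the group uses the same m/n/k in this example;
+    // problem_sizes_host below is simply filled with `groups` copies of {m, n, k}.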
cmd.get_cmd_line_argument("groups", groups, 2); + cmd.get_cmd_line_argument("alpha", alpha, 1.f); + cmd.get_cmd_line_argument("beta", beta, 0.f); + cmd.get_cmd_line_argument("iterations", iterations, 100); + + assert(groups > 0); + problem_sizes_host.clear(); + problem_sizes_host.reserve(groups); + for (int i = 0; i < groups; i++) { + problem_sizes_host.push_back({m, n, k}); + } + } + + /// Prints the usage statement. + std::ostream &print_usage(std::ostream &out) const { + + out << "BMG Grouped GEMM\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage " + "statement\n\n" + << " --m= Sets the M extent of the GEMM for " + "all groups\n" + << " --n= Sets the N extent of the GEMM for " + "all groups\n" + << " --k= Sets the K extent of the GEMM for " + "all groups\n" + << " --groups= Sets the number of individual GEMM " + "problems for Grouped GEMM\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n" + << " --iterations= Number of profiling iterations to " + "perform\n\n"; + + out << "\n\nExamples:\n\n" + << "$ " << "bmg_grouped_gemm" + << " --m=5120 --n=4096 --k=4096 --groups=5 --alpha=2.5 --beta=0.5 \n\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s, + std::vector + problem_sizes_host) const { + // Number of real-valued multiply-adds + uint64_t fmas = uint64_t(); + + for (auto const &problem : problem_sizes_host) { + fmas += static_cast(get<0>(problem)) * + static_cast(get<1>(problem)) * + static_cast(get<2>(problem)); + } + // Two flops per multiply-add + uint64_t flop = uint64_t(2) * uint64_t(fmas); + double gflop = double(flop) / double(1.0e9); + return gflop / runtime_s; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template struct ExampleRunner { + + using ElementA = typename Gemm::ElementA; + using ElementB = typename Gemm::ElementB; + using ElementC = typename Gemm::ElementC; + + using LayoutA = typename Gemm::LayoutA; + using LayoutB = typename Gemm::LayoutB; + using LayoutC = typename Gemm::LayoutC; + using LayoutD = typename Gemm::LayoutD; + + using CollectiveEpilogue = typename Gemm::CollectiveEpilogue; + using ElementOutput = bfloat16_t; + using ElementAccumulator = float_t; + + using StrideA = typename Gemm::GemmKernel::InternalStrideA; + using StrideB = typename Gemm::GemmKernel::InternalStrideB; + using StrideC = typename Gemm::GemmKernel::InternalStrideC; + using StrideD = typename Gemm::GemmKernel::InternalStrideD; + + // Host-side allocations + std::vector offset_A; + std::vector offset_B; + std::vector offset_C; + std::vector offset_D; + + std::vector stride_A_host; + std::vector stride_B_host; + std::vector stride_C_host; + std::vector stride_D_host; + + std::vector alpha_host; + std::vector beta_host; + + // Device-side allocations + cutlass::DeviceAllocation + problem_sizes; + + // This example defines all matrices in a single allocation (e.g. block_A), + // but this is not a requirement. Matrix base pointers are read from device + // allocation (e.g. 
ptr_A) + cutlass::DeviceAllocation block_A; + cutlass::DeviceAllocation block_B; + cutlass::DeviceAllocation block_C; + cutlass::DeviceAllocation block_D; + cutlass::DeviceAllocation block_ref_D; + + cutlass::DeviceAllocation ptr_A; + cutlass::DeviceAllocation ptr_B; + cutlass::DeviceAllocation ptr_C; + cutlass::DeviceAllocation ptr_D; + cutlass::DeviceAllocation ptr_ref_D; + + cutlass::DeviceAllocation stride_A; + cutlass::DeviceAllocation stride_B; + cutlass::DeviceAllocation stride_C; + cutlass::DeviceAllocation stride_D; + + // Note, this is an array of pointers to alpha and beta scaling values per + // group + cutlass::DeviceAllocation alpha_device; + cutlass::DeviceAllocation beta_device; + cutlass::DeviceAllocation block_alpha; + cutlass::DeviceAllocation block_beta; + + uint64_t seed = 0; + + // + // Methods + // + + bool verify(const Options &options) { + bool passed = true; + // Verify against individual reference GEMMs + for (int32_t i = 0; i < options.groups; ++i) { + auto problem = options.problem_sizes_host.at(i); + auto M = get<0>(problem); + auto N = get<1>(problem); + auto K = get<2>(problem); + cutlass::TensorRef ref_A(block_A.get() + offset_A.at(i), + LayoutA::packed({M, K})); + cutlass::TensorRef ref_B(block_B.get() + offset_B.at(i), + LayoutB::packed({K, N})); + cutlass::TensorRef ref_C(block_C.get() + offset_C.at(i), + LayoutC::packed({M, N})); + cutlass::TensorRef ref_D(block_ref_D.get() + offset_D.at(i), + LayoutD::packed({M, N})); + + // + // Compute reference output + // + cutlass::reference::device::GemmComplex( + {M, N, K}, alpha_host.at(i), ref_A, cutlass::ComplexTransform::kNone, + ref_B, cutlass::ComplexTransform::kNone, beta_host.at(i), ref_C, + ref_D, ElementAccumulator(0), + 1, // batch_count + M * K, // batch_stride_A + K * N, // batch_stride_B + M * N, // batch_stride_C + M * N // batch_stride_D + ); + + // Wait for kernel to finish + syclcompat::wait(); + + // Check if output from CUTLASS kernel and reference kernel are equal or + // not + passed &= cutlass::reference::device::BlockCompareEqual( + block_ref_D.get() + offset_D.at(i), block_D.get() + offset_D.at(i), + M * N); + if (!passed) + break; + } + return passed; + } + + /// Allocates device-side data + void allocate(const Options &options) { + int64_t total_elements_A = 0; + int64_t total_elements_B = 0; + int64_t total_elements_C = 0; + int64_t total_elements_D = 0; + + // Compute total allocation sizes across group + for (int32_t i = 0; i < options.groups; ++i) { + + auto problem = options.problem_sizes_host.at(i); + auto M = get<0>(problem); + auto N = get<1>(problem); + auto K = get<2>(problem); + + // Offset into block allocation of each matrix base pointer + offset_A.push_back(total_elements_A); + offset_B.push_back(total_elements_B); + offset_C.push_back(total_elements_C); + offset_D.push_back(total_elements_D); + + int64_t elements_A = M * K; + int64_t elements_B = K * N; + int64_t elements_C = M * N; + int64_t elements_D = M * N; + + total_elements_A += elements_A; + total_elements_B += elements_B; + total_elements_C += elements_C; + total_elements_D += elements_D; + + stride_A_host.push_back( + cutlass::make_cute_packed_stride(StrideA{}, {M, K, 1})); + stride_B_host.push_back( + cutlass::make_cute_packed_stride(StrideB{}, {N, K, 1})); + stride_C_host.push_back( + cutlass::make_cute_packed_stride(StrideC{}, {M, N, 1})); + stride_D_host.push_back( + cutlass::make_cute_packed_stride(StrideD{}, {M, N, 1})); + } + + block_A.reset(total_elements_A); + block_B.reset(total_elements_B); + 
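+    // Each block_X is one contiguous device allocation covering every group;
+    // initialize() later derives the per-group base pointers as
+    // block_X.get() + offset_X.at(i).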
block_C.reset(total_elements_C); + block_D.reset(total_elements_D); + block_ref_D.reset(total_elements_D); + block_alpha.reset(options.groups); + block_beta.reset(options.groups); + } + + /// Initialize operands to be used in the GEMM and reference GEMM + void initialize(const Options &options) { + + uint64_t seed = 2020; + + problem_sizes.reset(options.groups); + problem_sizes.copy_from_host(options.problem_sizes_host.data()); + + // + // Assign pointers + // + + std::vector ptr_A_host(options.groups); + std::vector ptr_B_host(options.groups); + std::vector ptr_C_host(options.groups); + std::vector ptr_D_host(options.groups); + std::vector ptr_alpha_host(options.groups); + std::vector ptr_beta_host(options.groups); + + // Compute offsets, alpha & beta over group on host + for (int32_t i = 0; i < options.groups; ++i) { + ptr_A_host.at(i) = block_A.get() + offset_A.at(i); + ptr_B_host.at(i) = block_B.get() + offset_B.at(i); + ptr_C_host.at(i) = block_C.get() + offset_C.at(i); + ptr_D_host.at(i) = block_D.get() + offset_D.at(i); + // Fill host vector of alpha & beta with random values if using per-group + // values + alpha_host.push_back( + (options.alpha == FLT_MAX) + ? static_cast((rand() % 5) + 1) + : options.alpha); + beta_host.push_back((options.beta == FLT_MAX) + ? static_cast(rand() % 5) + : options.beta); + // Fill host ptr vectors with offset addresses into device alpha/beta + // blocks + ptr_alpha_host.at(i) = block_alpha.get() + i; + ptr_beta_host.at(i) = block_beta.get() + i; + } + + // Allocate device memory & copy from host + ptr_A.reset(options.groups); + // Per-group alpha and beta + ptr_A.copy_from_host(ptr_A_host.data()); + + ptr_B.reset(options.groups); + ptr_B.copy_from_host(ptr_B_host.data()); + + ptr_C.reset(options.groups); + ptr_C.copy_from_host(ptr_C_host.data()); + + ptr_D.reset(options.groups); + ptr_D.copy_from_host(ptr_D_host.data()); + + stride_A.reset(options.groups); + stride_A.copy_from_host(stride_A_host.data()); + + stride_B.reset(options.groups); + stride_B.copy_from_host(stride_B_host.data()); + + stride_C.reset(options.groups); + stride_C.copy_from_host(stride_C_host.data()); + + stride_D.reset(options.groups); + stride_D.copy_from_host(stride_D_host.data()); + + // Per-group alpha and beta ptrs + alpha_device.reset(options.groups); + alpha_device.copy_from_host(ptr_alpha_host.data()); + beta_device.reset(options.groups); + beta_device.copy_from_host(ptr_beta_host.data()); + + initialize_block(block_A, seed + 2023); + initialize_block(block_B, seed + 2022); + initialize_block(block_C, seed + 2021); + // Per-group alpha and beta values - note these are not directly passed to + // kernel - the pointers (alpha_device/beta_device) are passed instead + block_alpha.copy_from_host(alpha_host.data()); + block_beta.copy_from_host(beta_host.data()); + } + + /// Populates a Gemm::Arguments structure from the given commandline options + typename Gemm::Arguments + args_from_options(const Options &options, + const cutlass::KernelHardwareInfo &hw_info, + bool host_problem_shapes_available = true) { + typename Gemm::Arguments arguments; + decltype(arguments.epilogue.thread) fusion_args; + + if (options.alpha != FLT_MAX && options.beta != FLT_MAX) { + // If both alpha/beta are provided (via cmd line args) and are scalar, + // i.e., same alpha/beta applies to all batches. 
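+      // In the scalar case the values are stored directly and every pointer
+      // array is left null; dAlpha/dBeta get a batch stride of 0 so all groups
+      // read the same alpha/beta. The per-group branch below instead passes the
+      // alpha_device/beta_device pointer arrays with a batch stride of 1.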
+ fusion_args.alpha = options.alpha; + fusion_args.beta = options.beta; + fusion_args.alpha_ptr = nullptr; + fusion_args.beta_ptr = nullptr; + fusion_args.alpha_ptr_array = nullptr; + fusion_args.beta_ptr_array = nullptr; + // Single alpha and beta for all groups + fusion_args.dAlpha = {cute::_0{}, cute::_0{}, 0}; + fusion_args.dBeta = {cute::_0{}, cute::_0{}, 0}; + } else { + // If pointers to alpha/beta are provided, i.e., alpha/beta can differ + // between batches/groups. + fusion_args.alpha = 0; + fusion_args.beta = 0; + fusion_args.alpha_ptr = nullptr; + fusion_args.beta_ptr = nullptr; + fusion_args.alpha_ptr_array = alpha_device.get(); + fusion_args.beta_ptr_array = beta_device.get(); + // One alpha and beta per each group + fusion_args.dAlpha = {cute::_0{}, cute::_0{}, 1}; + fusion_args.dBeta = {cute::_0{}, cute::_0{}, 1}; + } + using RasterOrderOptions = + typename cutlass::gemm::kernel::detail::PersistentTileSchedulerXeGroup< + ProblemShape>::RasterOrderOptions; + + // Per-GEMM problem shape info may only exist on the device. + if (host_problem_shapes_available) { + arguments = typename Gemm::Arguments{ + cutlass::gemm::GemmUniversalMode::kGrouped, + {options.groups, problem_sizes.get(), + options.problem_sizes_host.data()}, + {ptr_A.get(), stride_A.get(), ptr_B.get(), stride_B.get()}, + {fusion_args, ptr_C.get(), stride_C.get(), ptr_D.get(), + stride_D.get()}, + hw_info, + {1, RasterOrderOptions::AlongN}}; + } else { + arguments = typename Gemm::Arguments{ + cutlass::gemm::GemmUniversalMode::kGrouped, + {options.groups, problem_sizes.get(), nullptr}, + {ptr_A.get(), stride_A.get(), ptr_B.get(), stride_B.get()}, + {fusion_args, ptr_C.get(), stride_C.get(), ptr_D.get(), + stride_D.get()}, + hw_info, + {1, RasterOrderOptions::AlongN}}; + } + + return arguments; + } + + cutlass::Status run(const Options &options, + const cutlass::KernelHardwareInfo &hw_info, + bool host_problem_shapes_available = true) { + allocate(options); + initialize(options); + + Gemm gemm_op; + + auto arguments = + args_from_options(options, hw_info, host_problem_shapes_available); + + size_t workspace_size = Gemm::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + CUTLASS_CHECK(gemm_op.can_implement(arguments)); + + CUTLASS_CHECK(gemm_op.initialize(arguments, workspace.get())); + + // Run the GEMM + CUTLASS_CHECK(gemm_op.run()); + + syclcompat::wait(); + + // Verify that the result is correct + bool passed = verify(options); + std::cout << "Disposition: " << (passed ? 
"Passed" : "Failed") << std::endl; + + if (!passed) + return cutlass::Status::kErrorInternal; + + if (options.iterations > 0) { + GPU_Clock timer; + timer.start(); + for (int iter = 0; iter < options.iterations; ++iter) { + CUTLASS_CHECK(gemm_op.run()); + } + syclcompat::wait(); + + float cute_time = timer.seconds() * 1000; + double cute_average_time = double(cute_time) / double(options.iterations); + double gflops = options.gflops(cute_average_time / 1000.0, + options.problem_sizes_host); + + std::cout << " Problem Sizes, Alpha, Beta " << std::endl; + for (int32_t i = 0; i < options.groups; ++i) { + std::cout << " " << options.problem_sizes_host.at(i); + std::cout << ", " << alpha_host.at(i) << ", " << beta_host.at(i) + << std::endl; + } + std::cout << " Groups : " << options.groups << std::endl; + std::cout << " Avg runtime : " << cute_average_time << " ms" + << std::endl; + std::cout << " GFLOPS : " << gflops << std::endl; + } + + return cutlass::Status::kSuccess; + } +}; + +int main(int argc, const char **argv) { + // + // Parse options + // + + Options options; + + options.parse(argc, argv); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.error) { + std::cerr << "Aborting execution." << std::endl; + return -1; + } + + // + // Run examples + // + + // The KernelHardwareInfo struct holds the number of EUs on the GPU with a + // given device ID. This information is used by the underlying kernel. + cutlass::KernelHardwareInfo hw_info; + + // Change device_id to another value if you are running on a machine with + // multiple GPUs and wish to use a GPU other than that with device ID 0. + hw_info.sm_count = + cutlass::KernelHardwareInfo::query_device_multiprocessor_count( + hw_info.device_id); + + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::RowMajor; + using LayoutC = cutlass::layout::RowMajor; + using LayoutD = cutlass::layout::RowMajor; + + using GmemTiledCopyA = XE_2D_U16x32x32_LD_N; + using GmemTiledCopyB = XE_2D_U16x32x32_LD_V; + + // Workgroup-level tile + using TileShape = Shape<_256, _256, _32>; + + using TiledMma = + TiledMMA, + Layout, Stride<_4, _1, _0>>, + Tile, Stride<_1, _32, _8>>, + Layout, Stride<_1, _64, _16>>, _32>>; + + constexpr int PipelineStages = 2; + // Dispatch to grouped gemm algorithm + using GEMMDispatchPolicy = + cutlass::gemm::MainloopIntelXeXMX16Group; + using EpilogueDispatchPolicy = cutlass::epilogue::IntelXeXMX16Group; + + using EpilogueOp = + cutlass::epilogue::fusion::LinearCombination; + + using CollectiveEpilogue = + typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::IntelXe, cutlass::arch::OpClassTensorOp, TileShape, + Shape<_1, _1, _1>, cutlass::epilogue::collective::EpilogueTileAuto, + float, float, float, LayoutC, 1, ElementOutput, LayoutC, 1, + EpilogueDispatchPolicy, EpilogueOp>::CollectiveOp; + + // Mainloop + using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< + GEMMDispatchPolicy, TileShape, ElementA, + cutlass::gemm::TagToStrideA_t, ElementB, + cutlass::gemm::TagToStrideB_t, TiledMma, GmemTiledCopyA, void, + void, cute::identity, // A + GmemTiledCopyB, void, void, cute::identity // B + >; + + using GemmKernel = + cutlass::gemm::kernel::GemmUniversal; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + ExampleRunner runner; + + CUTLASS_CHECK(runner.run(options, hw_info)); + + return 0; +} diff --git a/examples/04_bmg_grouped_gemm/CMakeLists.txt b/examples/04_bmg_grouped_gemm/CMakeLists.txt index 
5640b750cf..347ebeda9b 100644
--- a/examples/04_bmg_grouped_gemm/CMakeLists.txt
+++ b/examples/04_bmg_grouped_gemm/CMakeLists.txt
@@ -36,7 +36,15 @@ cutlass_example_add_executable(
   TEST_GROUPS_2
   TEST_GROUPS_4
 )
+cutlass_example_add_executable(
+  04_bmg_grouped_gemm_bf16_output
+  04_bmg_grouped_gemm_bf16_output.cpp
+  TEST_COMMAND_OPTIONS
+  TEST_GROUPS_2
+  TEST_GROUPS_4
+)
 if(NOT DPCPP_SYCL_TARGET STREQUAL "spir64")
   # TODO(codeplay): Remove these once IGC VectorAliasThreshold issue is fixed
   target_link_options( 04_bmg_grouped_gemm PRIVATE -Xs "-options \"-igc_opts 'VectorAliasBBThreshold=10000'\"" )
+  target_link_options( 04_bmg_grouped_gemm_bf16_output PRIVATE -Xs "-options \"-igc_opts 'VectorAliasBBThreshold=10000'\"" )
 endif()
diff --git a/include/cutlass/epilogue/collective/builders/xe_builder.inl b/include/cutlass/epilogue/collective/builders/xe_builder.inl
index 809cede6f7..a4d8383a46 100644
--- a/include/cutlass/epilogue/collective/builders/xe_builder.inl
+++ b/include/cutlass/epilogue/collective/builders/xe_builder.inl
@@ -191,9 +191,8 @@ template <
   using SmemLayoutAtomD_ = void;
   using CopyOpR2S_ = void;
 
-  //TODO(Codeplay): Should FusionCallbacks use DispatchPolicy IntelXeGroupEpilogue for group gemm? That does not work.
   using FusionCallbacks = typename detail::FusionOpInfo::template FusionCallbacks<
-      IntelXeXMX16, TileShape_MNK, TileShape_MNK, CopyOpG2R>;
+      std::conditional_t, TileShape_MNK, TileShape_MNK, CopyOpG2R>;
 
   using CollectiveOp = cutlass::epilogue::collective::CollectiveEpilogue<
       DispatchPolicy,