Merged
Changes from all commits (27 commits)
f534d97
Port syclcompat to cutlass as cutlasscompat
leonling-lly Aug 21, 2025
e8f5ecb
poc unnamed
rolandschulz Aug 22, 2025
1ba6ecf
Fix unnamed lambda in sycl example kernels
leonling-lly Aug 26, 2025
88f2dd2
Fix cute ut
leonling-lly Sep 1, 2025
34ec590
Fix benchmark
leonling-lly Sep 2, 2025
f94cd10
Fix ambiguous `logical_group` in cutlasscompat
leonling-lly Sep 2, 2025
92ec221
Fix test_unit_cute_intel_xe
leonling-lly Sep 2, 2025
0414e13
Fix icpx failure on test_unit_flash_attention_prefill
leonling-lly Sep 10, 2025
f4e6153
Resolve conflicts
leonling-lly Sep 10, 2025
9a9fa2f
Add workflow for testing intel g++ separate compilation
leonling-lly Sep 10, 2025
ed57dbe
Fix CI failure
leonling-lly Sep 11, 2025
8166de4
Fix CI failure
leonling-lly Sep 11, 2025
765483b
Remove nightly test for g++ support
leonling-lly Sep 16, 2025
f63e4af
Update G++ support CI test name
leonling-lly Sep 16, 2025
d862a6f
Resolve merge conflicts
leonling-lly Sep 16, 2025
ae939b6
Print g++ version in env setup
leonling-lly Sep 17, 2025
74217e9
Use explicit g++-13 and fix clang++ failure
leonling-lly Sep 17, 2025
6da266d
Resolve conflicts
leonling-lly Sep 18, 2025
e7e5323
Resolve conflicts and add license header to changed files
leonling-lly Sep 17, 2025
7ccd224
Rename cutlasscompat to compat
rolandschulz Sep 19, 2025
56a9b09
Merge remote-tracking branch 'origin/main' into liyang/unnamed-poc
rolandschulz Sep 19, 2025
5251113
Merge branch 'intel:main' into liyang/unnamed-poc
leonling-lly Sep 22, 2025
ca5e99e
Fix duplicate -fsycl-host-compiler-options flags in flash attention d…
ratnampa Sep 22, 2025
86e7ec8
Address review comments
leonling-lly Sep 18, 2025
c0efa38
Fix typo
leonling-lly Sep 23, 2025
902f260
Merge branch 'main' into liyang/unnamed-poc
leonling-lly Sep 23, 2025
16a4d7f
update CI
ratnampa Sep 23, 2025
134 changes: 134 additions & 0 deletions .github/workflows/intel_test_gpp_host.yml
@@ -0,0 +1,134 @@
name: "SYCL Intel G++ Host Compilation Test"

on:
push:
branches: [ "main" ]
pull_request:
branches: [ "main" ]
merge_group:
branches: [ "main" ]
workflow_dispatch:
inputs:
DPCPP_VERSION:
description: "DPCPP version to use"
type: string

permissions: {}

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

jobs:
run-tests:
strategy:
matrix:
include:
- compiler: RELEASE
gpu: BMG
intel_graphics: ROLLING
sycl_target: intel_gpu_bmg_g21
runner: bmg108629-01
- compiler: RELEASE
gpu: PVC
intel_graphics: ROLLING
sycl_target: intel_gpu_pvc
runner: pvc146162-01


name: Run Intel ${{ matrix.compiler }} tests on ${{ matrix.gpu }} with intel-graphics ${{ matrix.intel_graphics }}
runs-on: ${{ matrix.runner }}
timeout-minutes: 120

steps:
- name: Checkout repository
uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
- name: Install Intel graphics drivers
uses: ./.github/actions/install-intel-graphics
with:
GPU: ${{ matrix.gpu }}
IGC: ${{ matrix.intel_graphics }}
- name: Install DPC++
uses: ./.github/actions/install-dpcpp
with:
DPCPP_RELEASE: ${{ matrix.compiler }}
DPCPP_VERSION: ${{ inputs.DPCPP_VERSION }}
GPU: ${{ matrix.gpu }}
IGC: ${{ matrix.intel_graphics }}
- name: Setup virtual environment
shell: bash
run: |
# Install cmake and ninja if not already available
if ! command -v cmake &> /dev/null || ! command -v ninja &> /dev/null; then
echo "Installing cmake and/or ninja..."
sudo apt update
sudo apt install -y cmake ninja-build
else
echo "cmake and ninja already available"
fi
. setvars.sh
export IGC_ExtraOCLOptions="-cl-intel-256-GRF-per-thread"
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file -gline-tables-only"
export ONEAPI_DEVICE_SELECTOR=level_zero:gpu
export IGC_VectorAliasBBThreshold=100000000000
# Persist environment variables to following steps
env >> $GITHUB_ENV
which $CXX
$CXX --version
g++-13 --version
sycl-ls
- name: Build
shell: bash
run: |
cmake -G Ninja \
-DCUTLASS_ENABLE_SYCL=ON \
-DDPCPP_SYCL_TARGET=${{ matrix.sycl_target }} \
-DCUTLASS_SYCL_RUNNING_CI=ON \
-DDPCPP_HOST_COMPILER=g++-13
cmake --build .

- name: Unit test
shell: bash
run: |
# ninja test_unit_cute_core # Assertion failure in include/cutlass/integer_subbyte.h:105
ninja test_unit_cute_intel_xe
ninja test_unit_cute_layout
ninja test_unit_cute_msvc_compilation

- name: Examples
shell: bash
run: |
cmake --build . --target test_examples -j 1

- name: Benchmarks
shell: bash
run: |
ninja cutlass_benchmarks

- name: Cleanup DPC++
if: always()
shell: bash
run: |
echo "Cleaning up DPC++ installation..."
# Remove DPCPP directory if it exists
DPCPP_PATH="${{ inputs.DPCPP_PATH || '~/dpcpp' }}"
DPCPP_PATH=$(eval echo $DPCPP_PATH) # Expand ~ to home directory
if [ -d "$DPCPP_PATH" ]; then
echo "Removing DPCPP directory: $DPCPP_PATH"
sudo rm -rf "$DPCPP_PATH"
fi
# For RELEASE installs, remove OneAPI packages
if [[ "${{ matrix.compiler }}" == "RELEASE" ]]; then
echo "Removing OneAPI packages..."
sudo apt remove -y intel-oneapi-runtime-libs intel-oneapi-compiler-dpcpp-cpp || true
sudo rm -f /etc/apt/sources.list.d/oneAPI.list
sudo rm -f /usr/share/keyrings/oneapi-archive-keyring.gpg
fi
# Clean up environment files
rm -f setvars.sh
# Clean up build artifacts
rm -rf build/ || true
# Reset environment variables that might interfere
unset CC CXX CPLUS_INCLUDE_PATH C_INCLUDE_PATH LD_LIBRARY_PATH
unset IGC_ExtraOCLOptions SYCL_PROGRAM_COMPILE_OPTIONS ONEAPI_DEVICE_SELECTOR IGC_VectorAliasBBThreshold
echo "DPC++ cleanup completed"
20 changes: 10 additions & 10 deletions applications/dual_gemm/collective/xe_dual_gemm_mma.hpp
@@ -74,20 +74,20 @@ struct DualGemmMma<MainloopIntelXeXMX16<Stages, Schedule>, TileShape_, ElementA_

  using MmaAtomShape = typename TiledMma::AtomShape_MNK;

- static constexpr auto BLK_M = get<0>(WorkgroupTileShape{});
- static constexpr auto BLK_N = get<1>(WorkgroupTileShape{});
- static constexpr auto BLK_K = get<2>(WorkgroupTileShape{});
+ static constexpr int BLK_M = get<0>(WorkgroupTileShape{});
+ static constexpr int BLK_N = get<1>(WorkgroupTileShape{});
+ static constexpr int BLK_K = get<2>(WorkgroupTileShape{});

- static constexpr auto ATOM_M = get<1>(typename TiledMma::ThrLayoutVMNK{}.shape());
- static constexpr auto ATOM_N = get<2>(typename TiledMma::ThrLayoutVMNK{}.shape());
- static constexpr auto ATOM_K = get<3>(typename TiledMma::ThrLayoutVMNK{}.shape());
+ static constexpr int ATOM_M = get<1>(typename TiledMma::ThrLayoutVMNK{}.shape());
+ static constexpr int ATOM_N = get<2>(typename TiledMma::ThrLayoutVMNK{}.shape());
+ static constexpr int ATOM_K = get<3>(typename TiledMma::ThrLayoutVMNK{}.shape());

- static constexpr auto SG_M = ceil_div(BLK_M, ATOM_M);
- static constexpr auto SG_N = ceil_div(BLK_N, ATOM_N);
- static constexpr auto SG_K = ceil_div(BLK_K, ATOM_K);
+ static constexpr int SG_M = ceil_div(BLK_M, ATOM_M);
+ static constexpr int SG_N = ceil_div(BLK_N, ATOM_N);
+ static constexpr int SG_K = ceil_div(BLK_K, ATOM_K);
  using SubgroupTileShape = Shape<decltype(SG_M), decltype(SG_N), decltype(SG_K)>;

- static constexpr auto Num_SGs = ATOM_N * ATOM_M * ATOM_K;
+ static constexpr int Num_SGs = ATOM_N * ATOM_M * ATOM_K;
  static constexpr uint32_t MaxThreadsPerBlock = size(TiledMma{});

  using traits_load_A = Copy_Traits<GmemTiledCopyA, StrideA>;
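The auto-to-int changes here (and echoed throughout the PR) are not cosmetic: `get<N>(...)` on a cute shape returns an integral-constant type such as `cute::C<256>`, so `static constexpr auto` bakes that type into the struct, and the g++ host pass and clang device pass must then agree on it exactly. Declaring `int` collapses the constant to a plain value whose type is the same under every compiler. A self-contained sketch, with `std::integral_constant` standing in for `cute::C`:

```cpp
#include <type_traits>

template <int N> using Int = std::integral_constant<int, N>;  // stand-in for cute::C<N>

struct TileDemo {
  // `auto` deduces the constant's *type*, Int<256>; that type then appears in
  // decltype()-based aliases and mangled symbol names, which must match
  // exactly between the g++ host pass and the clang device pass.
  static constexpr auto BLK_M_auto = Int<256>{};

  // `int` collapses the constant to a plain value with a fixed type.
  static constexpr int BLK_M_int = Int<256>{};
};

static_assert(TileDemo::BLK_M_auto == 256, "value preserved either way");
static_assert(TileDemo::BLK_M_int == 256, "value preserved either way");
```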
@@ -126,13 +126,13 @@ class FMHADecode {
  static constexpr int ATOM_N = CollectiveMainloop::ATOM_N;
  static constexpr int ATOM_K = CollectiveMainloop::ATOM_K;

- static constexpr auto Num_SGs = ATOM_N * ATOM_M * ATOM_K;
+ static constexpr int Num_SGs = ATOM_N * ATOM_M * ATOM_K;
  static constexpr int Vec = CollectiveMainloop::Vec; // 8
  static constexpr int FragsM = CollectiveMainloop::FragsM; // 1
  static constexpr int FragsN = CollectiveMainloop::FragsNS; // 4

  static constexpr int VSlicer = get<1>(TileShapeOutput{}) / (get<1>(TileShapePV{}) * ATOM_N);
- using AccumShape = decltype(make_shape(Int<Vec>{}, Int<FragsM>{}, Int<get<1>(TileShapePV{}) / get<1>(MmaAtomShape())>{}, Int<VSlicer>{}));
+ using AccumShape = decltype(make_shape(Int<Vec>{}, Int<FragsM>{}, get<1>(TileShapePV{}) / get<1>(MmaAtomShape()), Int<VSlicer>{}));

  static_assert(FragsM == 1, "Limit the seq_len_qo to 1 MMA Atom worth of data per work-group.");
@@ -60,7 +60,8 @@ foreach(name IN LISTS LIB_LIST)
  target_include_directories(${name} PRIVATE ${CUTLASS_APPLICATIONS_DIR})
  target_link_libraries(${name} PRIVATE CUTLASS cutlass_tools_util_includes benchmark::benchmark)
  add_onemkl_to_target(TARGET ${name})
- add_sycl_to_target(TARGET ${name})
+ # Add only SYCL include directories, not the full SYCL flags (to avoid duplication)
+ add_sycl_include_directories_to_target(${name})
endforeach()

cutlass_benchmark_add_executable(
@@ -619,7 +619,7 @@ template <class FMHADecodeConfiguration> struct BenchmarkRunnerFMHADecode {

#if !defined(SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY)
    using namespace compat::experimental;
-   auto event = launch<cutlass::device_kernel<FMHADecodeKernel>>(
+   auto event = launch<cutlass::device_kernel<FMHADecodeKernel>, FMHADecodeKernel>(
        launch_policy{sycl_grid, sycl_block, local_mem_size{static_cast<std::size_t>(smem_size)},
        kernel_properties{sycl_exp::sub_group_size<FMHADecodeKernel::DispatchPolicy::SubgroupSize>}},
        params);
@@ -631,7 +631,7 @@ template <class FMHADecodeConfiguration> struct BenchmarkRunnerFMHADecode {
      sycl::ext::oneapi::experimental::sub_group_size<FMHADecodeKernel::DispatchPolicy::SubgroupSize>
    };
    compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props};
-   auto event = compat::experimental::launch<cutlass::device_kernel<FMHADecodeKernel>>(policy, params);
+   auto event = compat::experimental::launch<cutlass::device_kernel<FMHADecodeKernel>, FMHADecodeKernel>(policy, params);
#endif

    EventManager::getInstance().addEvent(event);
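Every `launch` call in this PR gains a second template argument naming the kernel type. That argument presumably supplies the explicit SYCL kernel name: with `-fsycl-host-compiler=g++`, the unnamed-lambda kernel naming extension is off the table, so each kernel needs a name type that both compilation passes can mangle identically. A minimal sketch of the pattern (assumed shape only, not the actual compat::experimental API):

```cpp
#include <sycl/sycl.hpp>

// `KernelName` is an explicit SYCL kernel-name type. When the host pass is
// delegated to g++, kernels can no longer be named after their lambdas, so
// the caller passes a nameable type (here, the CUTLASS kernel type itself).
template <auto* KernelFn, typename KernelName, typename Params>
sycl::event launch(sycl::queue& q, sycl::nd_range<3> range, Params params) {
  return q.parallel_for<KernelName>(range, [=](sycl::nd_item<3>) {
    KernelFn(params);  // e.g. cutlass::device_kernel<GemmKernel>
  });
}
```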
@@ -491,7 +491,7 @@ template <class FMHAPrefillConfiguration> struct BenchmarkRunnerFMHA {
      sycl::ext::oneapi::experimental::sub_group_size<GemmKernel::DispatchPolicy::SubgroupSize>
    };
    compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props};
-   auto event = compat::experimental::launch<cutlass::device_kernel<GemmKernel>>(policy, params);
+   auto event = compat::experimental::launch<cutlass::device_kernel<GemmKernel>, GemmKernel>(policy, params);
#endif

    EventManager::getInstance().addEvent(event);
@@ -584,7 +584,7 @@ template <class FMHAPrefillConfiguration> struct BenchmarkRunnerFMHA {
      sycl::ext::oneapi::experimental::sub_group_size<GemmKernel::DispatchPolicy::SubgroupSize>
    };
    compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props};
-   auto event = compat::experimental::launch<cutlass::device_kernel<GemmKernel>>(policy, params);
+   auto event = compat::experimental::launch<cutlass::device_kernel<GemmKernel>, GemmKernel>(policy, params);
#endif

    EventManager::getInstance().addEvent(event);
4 changes: 4 additions & 0 deletions cmake/FindDPCPP.cmake
@@ -39,6 +39,10 @@ find_library(DPCPP_LIB_DIR NAMES sycl sycl6 PATHS "${DPCPP_BIN_DIR}/../lib")
add_library(DPCPP::DPCPP INTERFACE IMPORTED)

set(DPCPP_FLAGS "-fsycl;")
+if(DPCPP_HOST_COMPILER)
+  list(APPEND DPCPP_FLAGS "-fsycl-host-compiler=${DPCPP_HOST_COMPILER}")
+  list(APPEND DPCPP_FLAGS "-fsycl-host-compiler-options=-Wno-changes-meaning -D$<JOIN:$<TARGET_PROPERTY:COMPILE_DEFINITIONS>, -D> -I$<JOIN:$<TARGET_PROPERTY:INCLUDE_DIRECTORIES>, -I>")
+endif()
set(DPCPP_COMPILE_ONLY_FLAGS "")
set(DPCPP_LINK_ONLY_FLAGS "")
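`-fsycl-host-compiler` hands the host-side compile to the named compiler, and the two `$<JOIN:...>` generator expressions replay each target's compile definitions and include directories to it, since the host compiler otherwise sees none of them. `-Wno-changes-meaning` matters because GCC 13 (the g++-13 used in CI) enforces a name-reuse rule, controllable via `-Wchanges-meaning`, that clang does not diagnose. A hypothetical reduction of the pattern (not code from CUTLASS):

```cpp
// g++-13 diagnoses this under -Wchanges-meaning; clang accepts it silently,
// which is why the flag above suppresses the diagnostic for the host pass.
struct value_type {};

struct Container {
  value_type v;            // `value_type` here refers to ::value_type ...
  using value_type = int;  // ... so this redeclaration changes what the name
                           // means inside Container; g++ reports it.
};
```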
11 changes: 2 additions & 9 deletions examples/06_bmg_flash_attention/bmg_flash_attn_decode_runner.hpp
@@ -200,21 +200,14 @@ template <class FMHAKernel, bool isVarLen> struct ExampleRunner {
  };
  PagedKVParams paged_kv_cache;

- template <typename SrcT, typename DstT>
- void convert_fp8_to_fp16(const SrcT* d_src, DstT* d_dst, size_t size) {
-   compat::get_default_queue().parallel_for(size, [=](auto indx) {
-     d_dst[indx] = static_cast<DstT>(d_src[indx]);
-   }).wait();
- }
-
  template <typename T>
  static constexpr bool is_fp8_v = cute::is_any_of_v<T, cute::float_e5m2_t, cute::float_e4m3_t>;

  template <typename Tin> inline auto in_memory(cutlass::DeviceAllocation<Tin>& in) {
    using outType = cutlass::DeviceAllocation<cute::conditional_t<is_fp8_v<Tin>, half_t, Tin>>;
    if constexpr(is_fp8_v<Tin>) {
      cutlass::DeviceAllocation<half_t> out(in.size());
-     convert_fp8_to_fp16<Tin, half_t>(in.get(), out.get(), in.size());
+     convert_dtype<Tin, half_t, ExampleRunner>(in.get(), out.get(), in.size());
      return out;
    } else {
      return in;
@@ -651,7 +644,7 @@ template <class FMHAKernel, bool isVarLen> struct ExampleRunner {
      sycl::ext::oneapi::experimental::sub_group_size<FMHAKernel::DispatchPolicy::SubgroupSize>
    };
    compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props};
-   auto event = compat::experimental::launch<cutlass::device_kernel<FMHAKernel>>(policy, params);
+   auto event = compat::experimental::launch<cutlass::device_kernel<FMHAKernel>, FMHAKernel>(policy, params);
#endif

    EventManager::getInstance().addEvent(event);
@@ -627,7 +627,7 @@ template <class FMHAPrefillCachedKernel, bool isVarLen> struct ExampleRunner {
      sycl::ext::oneapi::experimental::sub_group_size<FMHAPrefillCachedKernel::DispatchPolicy::SubgroupSize>
    };
    compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props};
-   auto event = compat::experimental::launch<cutlass::device_kernel<FMHAPrefillCachedKernel>>(policy, params);
+   auto event = compat::experimental::launch<cutlass::device_kernel<FMHAPrefillCachedKernel>, FMHAPrefillCachedKernel>(policy, params);
#endif

    EventManager::getInstance().addEvent(event);
12 changes: 2 additions & 10 deletions examples/06_bmg_flash_attention/bmg_flash_attn_prefill_runner.hpp
@@ -171,22 +171,14 @@ template <class FMHAPrefillKernel, bool isVarLen> struct ExampleRunner {
  cutlass::DeviceAllocation<int> device_cumulative_seqlen_q;
  cutlass::DeviceAllocation<int> device_cumulative_seqlen_kv;

-
- template <typename SrcT, typename DstT>
- void convert_fp8_to_fp16(const SrcT* d_src, DstT* d_dst, size_t size) {
-   compat::get_default_queue().parallel_for(size, [=](auto indx) {
-     d_dst[indx] = static_cast<DstT>(d_src[indx]);
-   }).wait();
- }
-
  template <typename T>
  static constexpr bool is_fp8_v = cute::is_any_of_v<T, cute::float_e5m2_t, cute::float_e4m3_t>;

  template <typename Tin> inline auto in_memory(cutlass::DeviceAllocation<Tin>& in) {
    using outType = cute::conditional_t<is_fp8_v<Tin>, half_t, Tin>;
    if constexpr(is_fp8_v<Tin>) {
      cutlass::DeviceAllocation<outType> out(in.size());
-     convert_fp8_to_fp16<Tin, outType>(in.get(), out.get(), in.size());
+     convert_dtype<Tin, outType, ExampleRunner>(in.get(), out.get(), in.size());
      return out;
    } else {
      return in;
@@ -516,7 +508,7 @@ template <class FMHAPrefillKernel, bool isVarLen> struct ExampleRunner {
      sycl::ext::oneapi::experimental::sub_group_size<FMHAPrefillKernel::DispatchPolicy::SubgroupSize>
    };
    compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props};
-   auto event = compat::experimental::launch<cutlass::device_kernel<FMHAPrefillKernel>>(policy, params);
+   auto event = compat::experimental::launch<cutlass::device_kernel<FMHAPrefillKernel>, FMHAPrefillKernel>(policy, params);
#endif

    EventManager::getInstance().addEvent(event);
2 changes: 1 addition & 1 deletion examples/07_bmg_dual_gemm/07_bmg_dual_gemm.cpp
@@ -342,7 +342,7 @@ struct ExampleRunner {
      sycl::ext::oneapi::experimental::sub_group_size<GemmKernel::DispatchPolicy::SubgroupSize>
    };
    compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props};
-   auto event = compat::experimental::launch<cutlass::device_kernel<GemmKernel>>(policy, params);
+   auto event = compat::experimental::launch<cutlass::device_kernel<GemmKernel>, GemmKernel>(policy, params);
#endif

    EventManager::getInstance().addEvent(event);
4 changes: 2 additions & 2 deletions examples/08_bmg_gemm_f8/08_bmg_gemm_f8.cpp
@@ -181,12 +181,12 @@ struct ExampleRunner {
  cutlass::DeviceAllocation<half_t> block_B_fp16(block_B.size());

  // fp8 -> fp16
- convert_dtype<ElementA, half_t>(
+ convert_dtype<ElementA, half_t, ExampleRunner>(
    block_A.get(),
    block_A_fp16.get(),
    block_A.size()
  );
- convert_dtype<ElementB, half_t>(
+ convert_dtype<ElementB, half_t, ExampleRunner>(
    block_B.get(),
    block_B_fp16.get(),
    block_B.size()
10 changes: 2 additions & 8 deletions examples/08_bmg_gemm_f8/08_bmg_gemm_f8_scaling.cpp
@@ -212,12 +212,6 @@ struct ExampleRunner {
  //
  // Methods
  //
- template <typename SrcT, typename DstT>
- void convert_fp8_to_fp16(const SrcT* d_src, DstT* d_dst, size_t size) {
-   compat::get_default_queue().parallel_for(size, [=](auto indx) {
-     d_dst[indx] = static_cast<DstT>(d_src[indx]);
-   }).wait();
- }

  bool verify(const Options &options) {
    using GmemTiledCopyA = XE_2D_U16x32x32_LD_N;
@@ -367,12 +361,12 @@ struct ExampleRunner {
  initialize_block(block_B, seed + 2022);
  initialize_block(block_C, seed + 2021);

- convert_fp8_to_fp16<ElementA, half_t>(
+ convert_dtype<ElementA, half_t, ExampleRunner>(
    block_A.get(),
    block_A_dq.get(),
    block_A.size()
  );
- convert_fp8_to_fp16<ElementB, half_t>(
+ convert_dtype<ElementB, half_t, ExampleRunner>(
    block_B.get(),
    block_B_dq.get(),
    block_B.size()
4 changes: 2 additions & 2 deletions examples/09_bmg_grouped_gemm_f8/09_bmg_grouped_gemm_f8.cpp
@@ -268,12 +268,12 @@ struct ExampleRunner {
  cutlass::DeviceAllocation<half_t> block_B_fp16(block_B.size());

  // fp8 -> fp16
- convert_dtype<ElementType, half_t>(
+ convert_dtype<ElementType, half_t, ExampleRunner>(
    block_A.get(),
    block_A_fp16.get(),
    block_A.size()
  );
- convert_dtype<ElementType, half_t>(
+ convert_dtype<ElementType, half_t, ExampleRunner>(
    block_B.get(),
    block_B_fp16.get(),
    block_B.size()
6 changes: 4 additions & 2 deletions examples/common/sycl_common.hpp
@@ -43,9 +43,11 @@ bool is_close(T a, T b, float atol, float rtol) {
  return std::abs((float)a - (float)b) <= atol + rtol * std::abs((float)b);
}

-template <typename SrcT, typename DstT>
+template <class, class, class> class convert_dtype_name;
+
+template <typename SrcT, typename DstT, typename Runner>
void convert_dtype(const SrcT* d_src, DstT* d_dst, size_t size) {
-  compat::get_default_queue().parallel_for(size, [=](auto indx) {
+  compat::get_default_queue().parallel_for<convert_dtype_name<SrcT, DstT, Runner>>(size, [=](auto indx) {
    d_dst[indx] = static_cast<DstT>(d_src[indx]);
  }).wait();
}
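With unnamed lambdas unavailable to a g++ host pass, the kernel lambda in `convert_dtype` now gets an explicit name via the forward-declared `convert_dtype_name`, and the added `Runner` parameter appears to exist only to keep that name unique per calling context. A hypothetical caller (names chosen for illustration):

```cpp
struct DecodeRunnerTag;  // tag type only; never defined or instantiated

void dequantize_fp8(const cute::float_e4m3_t* src, cutlass::half_t* dst, size_t n) {
  // Instantiates the kernel name convert_dtype_name<float_e4m3_t, half_t, DecodeRunnerTag>,
  // a distinct, forward-declarable SYCL kernel name for this calling context.
  convert_dtype<cute::float_e4m3_t, cutlass::half_t, DecodeRunnerTag>(src, dst, n);
}
```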