Merged
Changes from all commits (27 commits)
f534d97
Port syclcompat to cutlass as cutlasscompat
leonling-lly Aug 21, 2025
e8f5ecb
poc unnamed
rolandschulz Aug 22, 2025
1ba6ecf
Fix unnamed lambda in sycl example kernels
leonling-lly Aug 26, 2025
88f2dd2
Fix cute ut
leonling-lly Sep 1, 2025
34ec590
Fix benchmark
leonling-lly Sep 2, 2025
f94cd10
Fix ambiguous `logical_group` in cutlasscompat
leonling-lly Sep 2, 2025
92ec221
Fix test_unit_cute_intel_xe
leonling-lly Sep 2, 2025
0414e13
Fix icpx failure on test_unit_flash_attention_prefill
leonling-lly Sep 10, 2025
f4e6153
Resolve conflicts
leonling-lly Sep 10, 2025
9a9fa2f
Add workflow for testing intel g++ separate compilation
leonling-lly Sep 10, 2025
ed57dbe
Fix CI failure
leonling-lly Sep 11, 2025
8166de4
Fix CI failure
leonling-lly Sep 11, 2025
765483b
Remove nightly test for g++ support
leonling-lly Sep 16, 2025
f63e4af
Update G++ support CI test name
leonling-lly Sep 16, 2025
d862a6f
Resolve merge conflicts
leonling-lly Sep 16, 2025
ae939b6
Print g++ version in env setup
leonling-lly Sep 17, 2025
74217e9
Use explicit g++-13 and fix clang++ failure
leonling-lly Sep 17, 2025
6da266d
Resolve conflicts
leonling-lly Sep 18, 2025
e7e5323
Resolve conflicts and add license header to changed files
leonling-lly Sep 17, 2025
7ccd224
Rename cutlasscompat to compat
rolandschulz Sep 19, 2025
56a9b09
Merge remote-tracking branch 'origin/main' into liyang/unnamed-poc
rolandschulz Sep 19, 2025
5251113
Merge branch 'intel:main' into liyang/unnamed-poc
leonling-lly Sep 22, 2025
ca5e99e
Fix duplicate -fsycl-host-compiler-options flags in flash attention d…
ratnampa Sep 22, 2025
86e7ec8
Address review comments
leonling-lly Sep 18, 2025
c0efa38
Fix typo
leonling-lly Sep 23, 2025
902f260
Merge branch 'main' into liyang/unnamed-poc
leonling-lly Sep 23, 2025
16a4d7f
update CI
ratnampa Sep 23, 2025
134 changes: 134 additions & 0 deletions .github/workflows/intel_test_gpp_host.yml
@@ -0,0 +1,134 @@
name: "SYCL Intel G++ Host Compilation Test"

on:
push:
branches: [ "main" ]
pull_request:
branches: [ "main" ]
merge_group:
branches: [ "main" ]
workflow_dispatch:
inputs:
DPCPP_VERSION:
description: "DPCPP version to use"
type: string

permissions: {}

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

jobs:
run-tests:
strategy:
matrix:
include:
- compiler: RELEASE
gpu: BMG
intel_graphics: ROLLING
sycl_target: intel_gpu_bmg_g21
runner: bmg108629-01
- compiler: RELEASE
gpu: PVC
intel_graphics: ROLLING
sycl_target: intel_gpu_pvc
runner: pvc146162-01


name: Run Intel ${{ matrix.compiler }} tests on ${{ matrix.gpu }} with intel-graphics ${{ matrix.intel_graphics }}
runs-on: ${{ matrix.runner }}
timeout-minutes: 120

steps:
- name: Checkout repository
uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
- name: Install Intel graphics drivers
uses: ./.github/actions/install-intel-graphics
with:
GPU: ${{ matrix.gpu }}
IGC: ${{ matrix.intel_graphics }}
- name: Install DPC++
uses: ./.github/actions/install-dpcpp
with:
DPCPP_RELEASE: ${{ matrix.compiler }}
DPCPP_VERSION: ${{ inputs.DPCPP_VERSION }}
GPU: ${{ matrix.gpu }}
IGC: ${{ matrix.intel_graphics }}
- name: Setup virtual environment
shell: bash
run: |
# Install cmake and ninja if not already available
if ! command -v cmake &> /dev/null || ! command -v ninja &> /dev/null; then
echo "Installing cmake and/or ninja..."
sudo apt update
sudo apt install -y cmake ninja-build
else
echo "cmake and ninja already available"
fi
. setvars.sh
export IGC_ExtraOCLOptions="-cl-intel-256-GRF-per-thread"
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file -gline-tables-only"
export ONEAPI_DEVICE_SELECTOR=level_zero:gpu
export IGC_VectorAliasBBThreshold=100000000000
# Persist environment variables to following steps
env >> $GITHUB_ENV
which $CXX
$CXX --version
g++-13 --version
sycl-ls
- name: Build
shell: bash
run: |
cmake -G Ninja \
-DCUTLASS_ENABLE_SYCL=ON \
-DDPCPP_SYCL_TARGET=${{ matrix.sycl_target }} \
-DCUTLASS_SYCL_RUNNING_CI=ON \
-DDPCPP_HOST_COMPILER=g++-13
cmake --build .

- name: Unit test
shell: bash
run: |
# ninja test_unit_cute_core # Assertion failure in include/cutlass/integer_subbyte.h:105
ninja test_unit_cute_intel_xe
ninja test_unit_cute_layout
ninja test_unit_cute_msvc_compilation

- name: Examples
shell: bash
run: |
cmake --build . --target test_examples -j 1

- name: Benchmarks
shell: bash
run: |
ninja cutlass_benchmarks

- name: Cleanup DPC++
if: always()
shell: bash
run: |
echo "Cleaning up DPC++ installation..."
# Remove DPCPP directory if it exists
DPCPP_PATH="${{ inputs.DPCPP_PATH || '~/dpcpp' }}"
DPCPP_PATH=$(eval echo $DPCPP_PATH) # Expand ~ to home directory
if [ -d "$DPCPP_PATH" ]; then
echo "Removing DPCPP directory: $DPCPP_PATH"
sudo rm -rf "$DPCPP_PATH"
fi
# For RELEASE installs, remove OneAPI packages
if [[ "${{ matrix.compiler }}" == "RELEASE" ]]; then
echo "Removing OneAPI packages..."
sudo apt remove -y intel-oneapi-runtime-libs intel-oneapi-compiler-dpcpp-cpp || true
sudo rm -f /etc/apt/sources.list.d/oneAPI.list
sudo rm -f /usr/share/keyrings/oneapi-archive-keyring.gpg
fi
# Clean up environment files
rm -f setvars.sh
# Clean up build artifacts
rm -rf build/ || true
# Reset environment variables that might interfere
unset CC CXX CPLUS_INCLUDE_PATH C_INCLUDE_PATH LD_LIBRARY_PATH
unset IGC_ExtraOCLOptions SYCL_PROGRAM_COMPILE_OPTIONS ONEAPI_DEVICE_SELECTOR IGC_VectorAliasBBThreshold
echo "DPC++ cleanup completed"
20 changes: 10 additions & 10 deletions applications/dual_gemm/collective/xe_dual_gemm_mma.hpp
@@ -74,20 +74,20 @@ struct DualGemmMma<MainloopIntelXeXMX16<Stages, Schedule>, TileShape_, ElementA_

  using MmaAtomShape = typename TiledMma::AtomShape_MNK;

- static constexpr auto BLK_M = get<0>(WorkgroupTileShape{});
- static constexpr auto BLK_N = get<1>(WorkgroupTileShape{});
- static constexpr auto BLK_K = get<2>(WorkgroupTileShape{});
+ static constexpr int BLK_M = get<0>(WorkgroupTileShape{});
+ static constexpr int BLK_N = get<1>(WorkgroupTileShape{});
+ static constexpr int BLK_K = get<2>(WorkgroupTileShape{});

- static constexpr auto ATOM_M = get<1>(typename TiledMma::ThrLayoutVMNK{}.shape());
- static constexpr auto ATOM_N = get<2>(typename TiledMma::ThrLayoutVMNK{}.shape());
- static constexpr auto ATOM_K = get<3>(typename TiledMma::ThrLayoutVMNK{}.shape());
+ static constexpr int ATOM_M = get<1>(typename TiledMma::ThrLayoutVMNK{}.shape());
+ static constexpr int ATOM_N = get<2>(typename TiledMma::ThrLayoutVMNK{}.shape());
+ static constexpr int ATOM_K = get<3>(typename TiledMma::ThrLayoutVMNK{}.shape());

- static constexpr auto SG_M = ceil_div(BLK_M, ATOM_M);
- static constexpr auto SG_N = ceil_div(BLK_N, ATOM_N);
- static constexpr auto SG_K = ceil_div(BLK_K, ATOM_K);
+ static constexpr int SG_M = ceil_div(BLK_M, ATOM_M);
+ static constexpr int SG_N = ceil_div(BLK_N, ATOM_N);
+ static constexpr int SG_K = ceil_div(BLK_K, ATOM_K);
  using SubgroupTileShape = Shape<decltype(SG_M), decltype(SG_N), decltype(SG_K)>;

- static constexpr auto Num_SGs = ATOM_N * ATOM_M * ATOM_K;
+ static constexpr int Num_SGs = ATOM_N * ATOM_M * ATOM_K;
  static constexpr uint32_t MaxThreadsPerBlock = size(TiledMma{});

  using traits_load_A = Copy_Traits<GmemTiledCopyA, StrideA>;
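The auto-to-int changes here (and echoed throughout the PR) are not cosmetic: `get<N>(...)` on a cute shape returns an integral-constant type such as `cute::C<256>`, so `static constexpr auto` bakes that type into the struct, and the g++ host pass and clang device pass must then agree on it exactly. Declaring `int` collapses the constant to a plain value whose type is the same under every compiler. A self-contained sketch, with `std::integral_constant` standing in for `cute::C`:

```cpp
#include <type_traits>

template <int N> using Int = std::integral_constant<int, N>;  // stand-in for cute::C<N>

struct TileDemo {
  // `auto` deduces the constant's *type*, Int<256>; that type then appears in
  // decltype()-based aliases and mangled symbol names, which must match
  // exactly between the g++ host pass and the clang device pass.
  static constexpr auto BLK_M_auto = Int<256>{};

  // `int` collapses the constant to a plain value with a fixed type.
  static constexpr int BLK_M_int = Int<256>{};
};

static_assert(TileDemo::BLK_M_auto == 256, "value preserved either way");
static_assert(TileDemo::BLK_M_int == 256, "value preserved either way");
```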
@@ -126,13 +126,13 @@ class FMHADecode {
  static constexpr int ATOM_N = CollectiveMainloop::ATOM_N;
  static constexpr int ATOM_K = CollectiveMainloop::ATOM_K;

- static constexpr auto Num_SGs = ATOM_N * ATOM_M * ATOM_K;
+ static constexpr int Num_SGs = ATOM_N * ATOM_M * ATOM_K;
  static constexpr int Vec = CollectiveMainloop::Vec; // 8
  static constexpr int FragsM = CollectiveMainloop::FragsM; // 1
  static constexpr int FragsN = CollectiveMainloop::FragsNS; // 4

  static constexpr int VSlicer = get<1>(TileShapeOutput{}) / (get<1>(TileShapePV{}) * ATOM_N);
- using AccumShape = decltype(make_shape(Int<Vec>{}, Int<FragsM>{}, Int<get<1>(TileShapePV{}) / get<1>(MmaAtomShape())>{}, Int<VSlicer>{}));
+ using AccumShape = decltype(make_shape(Int<Vec>{}, Int<FragsM>{}, get<1>(TileShapePV{}) / get<1>(MmaAtomShape()), Int<VSlicer>{}));

  static_assert(FragsM == 1, "Limit the seq_len_qo to 1 MMA Atom worth of data per work-group.");
@@ -60,7 +60,8 @@ foreach(name IN LISTS LIB_LIST)
  target_include_directories(${name} PRIVATE ${CUTLASS_APPLICATIONS_DIR})
  target_link_libraries(${name} PRIVATE CUTLASS cutlass_tools_util_includes benchmark::benchmark)
  add_onemkl_to_target(TARGET ${name})
- add_sycl_to_target(TARGET ${name})
+ # Add only SYCL include directories, not the full SYCL flags (to avoid duplication)
+ add_sycl_include_directories_to_target(${name})
endforeach()

cutlass_benchmark_add_executable(
@@ -619,7 +619,7 @@ template <class FMHADecodeConfiguration> struct BenchmarkRunnerFMHADecode {

#if !defined(SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY)
    using namespace compat::experimental;
-   auto event = launch<cutlass::device_kernel<FMHADecodeKernel>>(
+   auto event = launch<cutlass::device_kernel<FMHADecodeKernel>, FMHADecodeKernel>(
        launch_policy{sycl_grid, sycl_block, local_mem_size{static_cast<std::size_t>(smem_size)},
        kernel_properties{sycl_exp::sub_group_size<FMHADecodeKernel::DispatchPolicy::SubgroupSize>}},
        params);
@@ -631,7 +631,7 @@ template <class FMHADecodeConfiguration> struct BenchmarkRunnerFMHADecode {
      sycl::ext::oneapi::experimental::sub_group_size<FMHADecodeKernel::DispatchPolicy::SubgroupSize>
    };
    compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props};
-   auto event = compat::experimental::launch<cutlass::device_kernel<FMHADecodeKernel>>(policy, params);
+   auto event = compat::experimental::launch<cutlass::device_kernel<FMHADecodeKernel>, FMHADecodeKernel>(policy, params);
#endif

    EventManager::getInstance().addEvent(event);
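Every `launch` call in this PR gains a second template argument naming the kernel type. That argument presumably supplies the explicit SYCL kernel name: with `-fsycl-host-compiler=g++`, the unnamed-lambda kernel naming extension is off the table, so each kernel needs a name type that both compilation passes can mangle identically. A minimal sketch of the pattern (assumed shape only, not the actual compat::experimental API):

```cpp
#include <sycl/sycl.hpp>

// `KernelName` is an explicit SYCL kernel-name type. When the host pass is
// delegated to g++, kernels can no longer be named after their lambdas, so
// the caller passes a nameable type (here, the CUTLASS kernel type itself).
template <auto* KernelFn, typename KernelName, typename Params>
sycl::event launch(sycl::queue& q, sycl::nd_range<3> range, Params params) {
  return q.parallel_for<KernelName>(range, [=](sycl::nd_item<3>) {
    KernelFn(params);  // e.g. cutlass::device_kernel<GemmKernel>
  });
}
```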
@@ -491,7 +491,7 @@ template <class FMHAPrefillConfiguration> struct BenchmarkRunnerFMHA {
      sycl::ext::oneapi::experimental::sub_group_size<GemmKernel::DispatchPolicy::SubgroupSize>
    };
    compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props};
-   auto event = compat::experimental::launch<cutlass::device_kernel<GemmKernel>>(policy, params);
+   auto event = compat::experimental::launch<cutlass::device_kernel<GemmKernel>, GemmKernel>(policy, params);
#endif

    EventManager::getInstance().addEvent(event);
@@ -584,7 +584,7 @@ template <class FMHAPrefillConfiguration> struct BenchmarkRunnerFMHA {
      sycl::ext::oneapi::experimental::sub_group_size<GemmKernel::DispatchPolicy::SubgroupSize>
    };
    compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props};
-   auto event = compat::experimental::launch<cutlass::device_kernel<GemmKernel>>(policy, params);
+   auto event = compat::experimental::launch<cutlass::device_kernel<GemmKernel>, GemmKernel>(policy, params);
#endif

    EventManager::getInstance().addEvent(event);
4 changes: 4 additions & 0 deletions cmake/FindDPCPP.cmake
@@ -39,6 +39,10 @@ find_library(DPCPP_LIB_DIR NAMES sycl sycl6 PATHS "${DPCPP_BIN_DIR}/../lib")
add_library(DPCPP::DPCPP INTERFACE IMPORTED)

set(DPCPP_FLAGS "-fsycl;")
+if(DPCPP_HOST_COMPILER)
+  list(APPEND DPCPP_FLAGS "-fsycl-host-compiler=${DPCPP_HOST_COMPILER}")
+  list(APPEND DPCPP_FLAGS "-fsycl-host-compiler-options=-Wno-changes-meaning -D$<JOIN:$<TARGET_PROPERTY:COMPILE_DEFINITIONS>, -D> -I$<JOIN:$<TARGET_PROPERTY:INCLUDE_DIRECTORIES>, -I>")
+endif()
set(DPCPP_COMPILE_ONLY_FLAGS "")
set(DPCPP_LINK_ONLY_FLAGS "")
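`-fsycl-host-compiler` hands the host-side compile to the named compiler, and the two `$<JOIN:...>` generator expressions replay each target's compile definitions and include directories to it, since the host compiler otherwise sees none of them. `-Wno-changes-meaning` matters because GCC 13 (the g++-13 used in CI) enforces a name-reuse rule, controllable via `-Wchanges-meaning`, that clang does not diagnose. A hypothetical reduction of the pattern (not code from CUTLASS):

```cpp
// g++-13 diagnoses this under -Wchanges-meaning; clang accepts it silently,
// which is why the flag above suppresses the diagnostic for the host pass.
struct value_type {};

struct Container {
  value_type v;            // `value_type` here refers to ::value_type ...
  using value_type = int;  // ... so this redeclaration changes what the name
                           // means inside Container; g++ reports it.
};
```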
11 changes: 2 additions & 9 deletions examples/06_bmg_flash_attention/bmg_flash_attn_decode_runner.hpp
@@ -200,21 +200,14 @@ template <class FMHAKernel, bool isVarLen> struct ExampleRunner {
  };
  PagedKVParams paged_kv_cache;

- template <typename SrcT, typename DstT>
- void convert_fp8_to_fp16(const SrcT* d_src, DstT* d_dst, size_t size) {
-   compat::get_default_queue().parallel_for(size, [=](auto indx) {
-     d_dst[indx] = static_cast<DstT>(d_src[indx]);
-   }).wait();
- }
-
  template <typename T>
  static constexpr bool is_fp8_v = cute::is_any_of_v<T, cute::float_e5m2_t, cute::float_e4m3_t>;

  template <typename Tin> inline auto in_memory(cutlass::DeviceAllocation<Tin>& in) {
    using outType = cutlass::DeviceAllocation<cute::conditional_t<is_fp8_v<Tin>, half_t, Tin>>;
    if constexpr(is_fp8_v<Tin>) {
      cutlass::DeviceAllocation<half_t> out(in.size());
-     convert_fp8_to_fp16<Tin, half_t>(in.get(), out.get(), in.size());
+     convert_dtype<Tin, half_t, ExampleRunner>(in.get(), out.get(), in.size());
      return out;
    } else {
      return in;
@@ -651,7 +644,7 @@ template <class FMHAKernel, bool isVarLen> struct ExampleRunner {
      sycl::ext::oneapi::experimental::sub_group_size<FMHAKernel::DispatchPolicy::SubgroupSize>
    };
    compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props};
-   auto event = compat::experimental::launch<cutlass::device_kernel<FMHAKernel>>(policy, params);
+   auto event = compat::experimental::launch<cutlass::device_kernel<FMHAKernel>, FMHAKernel>(policy, params);
#endif

    EventManager::getInstance().addEvent(event);
@@ -627,7 +627,7 @@ template <class FMHAPrefillCachedKernel, bool isVarLen> struct ExampleRunner {
      sycl::ext::oneapi::experimental::sub_group_size<FMHAPrefillCachedKernel::DispatchPolicy::SubgroupSize>
    };
    compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props};
-   auto event = compat::experimental::launch<cutlass::device_kernel<FMHAPrefillCachedKernel>>(policy, params);
+   auto event = compat::experimental::launch<cutlass::device_kernel<FMHAPrefillCachedKernel>, FMHAPrefillCachedKernel>(policy, params);
#endif

    EventManager::getInstance().addEvent(event);
12 changes: 2 additions & 10 deletions examples/06_bmg_flash_attention/bmg_flash_attn_prefill_runner.hpp
@@ -171,22 +171,14 @@ template <class FMHAPrefillKernel, bool isVarLen> struct ExampleRunner {
  cutlass::DeviceAllocation<int> device_cumulative_seqlen_q;
  cutlass::DeviceAllocation<int> device_cumulative_seqlen_kv;

-
- template <typename SrcT, typename DstT>
- void convert_fp8_to_fp16(const SrcT* d_src, DstT* d_dst, size_t size) {
-   compat::get_default_queue().parallel_for(size, [=](auto indx) {
-     d_dst[indx] = static_cast<DstT>(d_src[indx]);
-   }).wait();
- }
-
  template <typename T>
  static constexpr bool is_fp8_v = cute::is_any_of_v<T, cute::float_e5m2_t, cute::float_e4m3_t>;

  template <typename Tin> inline auto in_memory(cutlass::DeviceAllocation<Tin>& in) {
    using outType = cute::conditional_t<is_fp8_v<Tin>, half_t, Tin>;
    if constexpr(is_fp8_v<Tin>) {
      cutlass::DeviceAllocation<outType> out(in.size());
-     convert_fp8_to_fp16<Tin, outType>(in.get(), out.get(), in.size());
+     convert_dtype<Tin, outType, ExampleRunner>(in.get(), out.get(), in.size());
      return out;
    } else {
      return in;
@@ -516,7 +508,7 @@ template <class FMHAPrefillKernel, bool isVarLen> struct ExampleRunner {
      sycl::ext::oneapi::experimental::sub_group_size<FMHAPrefillKernel::DispatchPolicy::SubgroupSize>
    };
    compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props};
-   auto event = compat::experimental::launch<cutlass::device_kernel<FMHAPrefillKernel>>(policy, params);
+   auto event = compat::experimental::launch<cutlass::device_kernel<FMHAPrefillKernel>, FMHAPrefillKernel>(policy, params);
#endif

    EventManager::getInstance().addEvent(event);
2 changes: 1 addition & 1 deletion examples/07_bmg_dual_gemm/07_bmg_dual_gemm.cpp
@@ -342,7 +342,7 @@ struct ExampleRunner {
      sycl::ext::oneapi::experimental::sub_group_size<GemmKernel::DispatchPolicy::SubgroupSize>
    };
    compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props};
-   auto event = compat::experimental::launch<cutlass::device_kernel<GemmKernel>>(policy, params);
+   auto event = compat::experimental::launch<cutlass::device_kernel<GemmKernel>, GemmKernel>(policy, params);
#endif

    EventManager::getInstance().addEvent(event);
4 changes: 2 additions & 2 deletions examples/08_bmg_gemm_f8/08_bmg_gemm_f8.cpp
@@ -181,12 +181,12 @@ struct ExampleRunner {
  cutlass::DeviceAllocation<half_t> block_B_fp16(block_B.size());

  // fp8 -> fp16
- convert_dtype<ElementA, half_t>(
+ convert_dtype<ElementA, half_t, ExampleRunner>(
    block_A.get(),
    block_A_fp16.get(),
    block_A.size()
  );
- convert_dtype<ElementB, half_t>(
+ convert_dtype<ElementB, half_t, ExampleRunner>(
    block_B.get(),
    block_B_fp16.get(),
    block_B.size()
10 changes: 2 additions & 8 deletions examples/08_bmg_gemm_f8/08_bmg_gemm_f8_scaling.cpp
@@ -212,12 +212,6 @@ struct ExampleRunner {
  //
  // Methods
  //
- template <typename SrcT, typename DstT>
- void convert_fp8_to_fp16(const SrcT* d_src, DstT* d_dst, size_t size) {
-   compat::get_default_queue().parallel_for(size, [=](auto indx) {
-     d_dst[indx] = static_cast<DstT>(d_src[indx]);
-   }).wait();
- }

  bool verify(const Options &options) {
    using GmemTiledCopyA = XE_2D_U16x32x32_LD_N;
@@ -367,12 +361,12 @@ struct ExampleRunner {
  initialize_block(block_B, seed + 2022);
  initialize_block(block_C, seed + 2021);

- convert_fp8_to_fp16<ElementA, half_t>(
+ convert_dtype<ElementA, half_t, ExampleRunner>(
    block_A.get(),
    block_A_dq.get(),
    block_A.size()
  );
- convert_fp8_to_fp16<ElementB, half_t>(
+ convert_dtype<ElementB, half_t, ExampleRunner>(
    block_B.get(),
    block_B_dq.get(),
    block_B.size()
4 changes: 2 additions & 2 deletions examples/09_bmg_grouped_gemm_f8/09_bmg_grouped_gemm_f8.cpp
@@ -268,12 +268,12 @@ struct ExampleRunner {
  cutlass::DeviceAllocation<half_t> block_B_fp16(block_B.size());

  // fp8 -> fp16
- convert_dtype<ElementType, half_t>(
+ convert_dtype<ElementType, half_t, ExampleRunner>(
    block_A.get(),
    block_A_fp16.get(),
    block_A.size()
  );
- convert_dtype<ElementType, half_t>(
+ convert_dtype<ElementType, half_t, ExampleRunner>(
    block_B.get(),
    block_B_fp16.get(),
    block_B.size()
6 changes: 4 additions & 2 deletions examples/common/sycl_common.hpp
@@ -43,9 +43,11 @@ bool is_close(T a, T b, float atol, float rtol) {
  return std::abs((float)a - (float)b) <= atol + rtol * std::abs((float)b);
}

-template <typename SrcT, typename DstT>
+template <class, class, class> class convert_dtype_name;
+
+template <typename SrcT, typename DstT, typename Runner>
void convert_dtype(const SrcT* d_src, DstT* d_dst, size_t size) {
-  compat::get_default_queue().parallel_for(size, [=](auto indx) {
+  compat::get_default_queue().parallel_for<convert_dtype_name<SrcT, DstT, Runner>>(size, [=](auto indx) {
    d_dst[indx] = static_cast<DstT>(d_src[indx]);
  }).wait();
}
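With unnamed lambdas unavailable to a g++ host pass, the kernel lambda in `convert_dtype` now gets an explicit name via the forward-declared `convert_dtype_name`, and the added `Runner` parameter appears to exist only to keep that name unique per calling context. A hypothetical caller (names chosen for illustration):

```cpp
struct DecodeRunnerTag;  // tag type only; never defined or instantiated

void dequantize_fp8(const cute::float_e4m3_t* src, cutlass::half_t* dst, size_t n) {
  // Instantiates the kernel name convert_dtype_name<float_e4m3_t, half_t, DecodeRunnerTag>,
  // a distinct, forward-declarable SYCL kernel name for this calling context.
  convert_dtype<cute::float_e4m3_t, cutlass::half_t, DecodeRunnerTag>(src, dst, n);
}
```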