Merge remote-tracking branch 'origin/sycl-develop' into top_k_softmax

codeplaysoftware · Dec 16, 2024 · 6810808 · 6810808
2 parents 1f30048 + 0f258ae
commit 6810808
Show file tree

Hide file tree

Showing 44 changed files with 3,569 additions and 107 deletions.
diff --git a/.github/workflows/cuda_test.yml b/.github/workflows/cuda_test.yml
@@ -9,6 +9,10 @@ on:
 
 permissions: {}
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
 jobs:
   run-tests:
     name: Run cuda tests

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -13,6 +13,10 @@ on:
 
 permissions: {}
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
 jobs:
   run-tests:
     name: Run tests
@@ -71,13 +75,4 @@ jobs:
         shell: bash
         run: |
           export LD_LIBRARY_PATH=~/dpcpp/lib/:$LD_LIBRARY_PATH
-          echo Run sgemm_1
-          ./examples/cute/tutorial/sgemm_1
-          echo Run sgemm_2
-          ./examples/cute/tutorial/sgemm_2
-          echo Run sgemm_sm70
-          ./examples/cute/tutorial/sgemm_sm70
-          echo Run sgemm_sm80
-          ./examples/cute/tutorial/sgemm_sm80
-          echo Run tiled_copy
-          ./examples/cute/tutorial/tiled_copy
+          cmake --build . --target test_examples -j 24
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -94,6 +94,10 @@ endif()
 set(CUTLASS_ENABLE_SYCL OFF CACHE BOOL "Enable SYCL")
 set(CUTLASS_SYCL_PROFILING_ENABLED OFF CACHE BOOL "Use SYCL events to calculate device execution time")
 
+set(CUTLASS_SYCL_SWITCH_WG OFF CACHE BOOL "Enable SWITCH WG and for GEMM on Intel PVC during benchmarking")
+if(CUTLASS_SYCL_SWITCH_WG)
+  add_compile_definitions(CUTLASS_SYCL_SWITCH_WG)
+endif()
 if (CUTLASS_ENABLE_SYCL)
   set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
 

diff --git a/benchmarks/pvc/benchmarks.hpp b/benchmarks/pvc/benchmarks.hpp
@@ -34,15 +34,62 @@
 #include "../benchmark_runner.hpp"
 #include "gemm_configuration.hpp"
 
-using PvcGemmBF16BF16FP32_RRR = cutlass::gemm::device::GemmConfiguration<
+using MMAAtom = MMA_Atom<XE_8x16x16_F32BF16BF16F32_TT>;
+using PvcGemmBF16BF16FP32_RRR_1 = cutlass::gemm::device::GemmConfiguration<
         cutlass::arch::IntelPVC,
         cutlass::bfloat16_t, cutlass::layout::RowMajor,
         cutlass::bfloat16_t, cutlass::layout::RowMajor,
         float, cutlass::layout::RowMajor,
-        float>;
+        float, Shape<_256, _256, _32>,
+        TiledMMA<MMAAtom, Layout<Shape<_8,_4,_1>>>,
+        XE_2D_U16x32x32_LD_N, XE_2D_U16x32x32_LD_V>;
 
-CUTLASS_CREATE_GEMM_BENCHMARK(PvcGemmBF16BF16FP32_RRR);
+using PvcGemmBF16BF16FP32_RRR_2 = cutlass::gemm::device::GemmConfiguration<
+        cutlass::arch::IntelPVC,
+        cutlass::bfloat16_t, cutlass::layout::RowMajor,
+        cutlass::bfloat16_t, cutlass::layout::RowMajor,
+        float, cutlass::layout::RowMajor,
+        float, Shape<_128, _512, _32>,
+        TiledMMA<MMAAtom, Layout<Shape<_4,_8,_1>>>,
+        XE_2D_U16x32x32_LD_N, XE_2D_U16x32x32_LD_V>;
+
+using PvcGemmBF16BF16FP32_RRR_3 = cutlass::gemm::device::GemmConfiguration<
+        cutlass::arch::IntelPVC,
+        cutlass::bfloat16_t, cutlass::layout::RowMajor,
+        cutlass::bfloat16_t, cutlass::layout::RowMajor,
+        float, cutlass::layout::RowMajor,
+                float, Shape<_256, _128, _32>,
+        TiledMMA<MMAAtom, Layout<Shape<_8,_4,_1>>>,
+        XE_2D_U16x32x32_LD_N, XE_2D_U16x32x32_LD_V>;
+
+using PvcGemmBF16BF16FP32_RRR_4 = cutlass::gemm::device::GemmConfiguration<
+        cutlass::arch::IntelPVC,
+        cutlass::bfloat16_t, cutlass::layout::RowMajor,
+        cutlass::bfloat16_t, cutlass::layout::RowMajor,
+        float, cutlass::layout::RowMajor,
+        float, Shape<_128, _256, _16>,
+        TiledMMA<MMAAtom, Layout<Shape<_4,_8,_1>>>,
+        XE_2D_U16x32x16_LD_N, XE_2D_U16x16x32_LD_V>;
+
+using PvcGemmBF16BF16FP32_RRR_5 = cutlass::gemm::device::GemmConfiguration<
+        cutlass::arch::IntelPVC,
+        cutlass::bfloat16_t, cutlass::layout::RowMajor,
+        cutlass::bfloat16_t, cutlass::layout::RowMajor,
+        float, cutlass::layout::RowMajor,
+        float, Shape<_8, _128, _32>,
+        TiledMMA<MMAAtom, Layout<Shape<_1,_4,_1>>>,
+        XE_2D_U16x8x32_LD_N, XE_2D_U16x32x32_LD_V>;
+
+CUTLASS_CREATE_GEMM_BENCHMARK(PvcGemmBF16BF16FP32_RRR_1);
+CUTLASS_CREATE_GEMM_BENCHMARK(PvcGemmBF16BF16FP32_RRR_2);
+CUTLASS_CREATE_GEMM_BENCHMARK(PvcGemmBF16BF16FP32_RRR_3);
+CUTLASS_CREATE_GEMM_BENCHMARK(PvcGemmBF16BF16FP32_RRR_4);
+CUTLASS_CREATE_GEMM_BENCHMARK(PvcGemmBF16BF16FP32_RRR_5);
 
 static void register_benchmarks() {
-  CUTLASS_BENCHMARK(PvcGemmBF16BF16FP32_RRR);
+  CUTLASS_BENCHMARK(PvcGemmBF16BF16FP32_RRR_1);
+  CUTLASS_BENCHMARK(PvcGemmBF16BF16FP32_RRR_2);
+  CUTLASS_BENCHMARK(PvcGemmBF16BF16FP32_RRR_3);
+  CUTLASS_BENCHMARK(PvcGemmBF16BF16FP32_RRR_4);
+  CUTLASS_BENCHMARK(PvcGemmBF16BF16FP32_RRR_5);
 }
diff --git a/benchmarks/pvc/gemm_configuration.hpp b/benchmarks/pvc/gemm_configuration.hpp
@@ -57,7 +57,9 @@ template<
   class ElementA, class LayoutA,
   class ElementB, class LayoutB,
   class ElementC, class LayoutC,
-  class ElementAccumulator>
+  class ElementAccumulator,
+  class TileShape, class TiledMma,
+  class GmemTiledCopyA, class GmemTiledCopyB>
 struct GemmConfiguration {
   static_assert(sizeof(ElementA) == 0, "No valid GemmConfiguration configuration exists.");
 };
@@ -66,47 +68,16 @@ struct GemmConfiguration {
 
 // bfloat16
 
-namespace detail {
-
-template<typename Element, typename Layout>
-struct Gemm_OperandA;
-
-template<typename Element, typename Layout>
-struct Gemm_OperandB;
-
-template<>
-struct Gemm_OperandA<bfloat16_t, layout::RowMajor> {
-  using GmemTiledCopy = XE_2D_U16x32x32_LD_N;
-};
-
-template<>
-struct Gemm_OperandB<bfloat16_t, layout::RowMajor> {
-  using GmemTiledCopy = XE_2D_U16x32x32_LD_V;
-};
-
-} // namespace details
-
-template<typename LayoutA, typename LayoutB, typename LayoutC>
+template<typename LayoutA, typename LayoutB, typename LayoutC,
+  class TileShape, class TiledMma, class GmemTiledCopyA, class GmemTiledCopyB>
 struct GemmConfiguration<
       arch::IntelPVC,
       bfloat16_t, LayoutA,
       bfloat16_t, LayoutB,
       float, LayoutC,
-      float> {
-  using TileShape = Shape<_256, _256, _32>;
-  using DispatchPolicy = MainloopIntelPVC<3>;;
-  using TiledMma = TiledMMA<
-    MMA_Atom<XE_8x16x16_F32BF16BF16F32_TT>,
-    Layout<Shape<_8,_4,_1>>,
-    Tile<_64,_64,_32>>;
-
-  // A
-  using OperandA = detail::Gemm_OperandA<bfloat16_t, LayoutA>;
-  using GmemTiledCopyA = typename OperandA::GmemTiledCopy;
-
-  // B
-  using OperandB = detail::Gemm_OperandB<bfloat16_t, LayoutB>;
-  using GmemTiledCopyB = typename OperandB::GmemTiledCopy;
+      float, TileShape, TiledMma,
+      GmemTiledCopyA, GmemTiledCopyB> {
+  using DispatchPolicy = MainloopIntelPVC<3>;
 
   // Mainloop
   using CollectiveMainloop = collective::CollectiveMma<

diff --git a/benchmarks/pvc/input.in b/benchmarks/pvc/input.in
@@ -1,17 +1,23 @@
 # BFloat16 benchmarks
-PvcGemmBF16BF16FP32_RRR --bm_name=bf16_bf16_fp32 --l=1 --m=512 --k=8192 --n=8192
-PvcGemmBF16BF16FP32_RRR --bm_name=bf16_bf16_fp32 --l=1 --m=512 --k=8192 --n=32768
-PvcGemmBF16BF16FP32_RRR --bm_name=bf16_bf16_fp32 --l=1 --m=512 --k=32768 --n=8192
-PvcGemmBF16BF16FP32_RRR --bm_name=bf16_bf16_fp32 --l=1 --m=16384 --k=8192 --n=1024
-PvcGemmBF16BF16FP32_RRR --bm_name=bf16_bf16_fp32 --l=1 --m=16384 --k=1024 --n=8192
-PvcGemmBF16BF16FP32_RRR --bm_name=bf16_bf16_fp32 --l=1 --m=16384 --k=8192 --n=4096
-PvcGemmBF16BF16FP32_RRR --bm_name=bf16_bf16_fp32 --l=1 --m=16384 --k=4096 --n=8192
-PvcGemmBF16BF16FP32_RRR --bm_name=bf16_bf16_fp32 --l=1 --m=4096 --k=16384 --n=8192
-PvcGemmBF16BF16FP32_RRR --bm_name=bf16_bf16_fp32 --l=1 --m=8192 --k=16384 --n=4096
-PvcGemmBF16BF16FP32_RRR --bm_name=bf16_bf16_fp32 --l=1 --m=1024 --k=16384 --n=8192
-PvcGemmBF16BF16FP32_RRR --bm_name=bf16_bf16_fp32 --l=1 --m=8192 --k=16384 --n=1024
-PvcGemmBF16BF16FP32_RRR --bm_name=bf16_bf16_fp32 --l=4096 --m=8 --k=128 --n=16384
-PvcGemmBF16BF16FP32_RRR --bm_name=bf16_bf16_fp32 --l=4096 --m=8 --k=16384 --n=128
-PvcGemmBF16BF16FP32_RRR --bm_name=bf16_bf16_fp32 --l=4 --m=32768 --k=128 --n=4096
-PvcGemmBF16BF16FP32_RRR --bm_name=bf16_bf16_fp32 --l=4 --m=32768 --k=4096 --n=128
-PvcGemmBF16BF16FP32_RRR --bm_name=bf16_bf16_fp32 --l=32 --m=4096 --k=4096 --n=128
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=1 --m=4096 --k=4096 --n=4096
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=1 --m=8192 --k=8192 --n=8192
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=1 --m=1 --k=5120 --n=13824
+PvcGemmBF16BF16FP32_RRR_2 --bm_name=bf16_bf16_fp32 --l=1 --m=1024 --k=28672 --n=8192
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=1 --m=3072 --k=4096 --n=3072
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=1 --m=4 --k=4096 --n=12288
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=1 --m=512 --k=8192 --n=8192
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=1 --m=512 --k=8192 --n=32768
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=1 --m=512 --k=32768 --n=8192
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=1 --m=16384 --k=8192 --n=1024
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=1 --m=16384 --k=1024 --n=8192
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=1 --m=16384 --k=8192 --n=4096
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=1 --m=16384 --k=4096 --n=8192
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=1 --m=4096 --k=16384 --n=8192
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=1 --m=8192 --k=16384 --n=4096
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=1 --m=1024 --k=16384 --n=8192
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=1 --m=8192 --k=16384 --n=1024
+PvcGemmBF16BF16FP32_RRR_4 --bm_name=bf16_bf16_fp32 --l=4096 --m=8 --k=128 --n=16384
+PvcGemmBF16BF16FP32_RRR_5 --bm_name=bf16_bf16_fp32 --l=4096 --m=8 --k=16384 --n=128
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=4 --m=32768 --k=128 --n=4096
+PvcGemmBF16BF16FP32_RRR_1 --bm_name=bf16_bf16_fp32 --l=4 --m=32768 --k=4096 --n=128
+PvcGemmBF16BF16FP32_RRR_3 --bm_name=bf16_bf16_fp32 --l=32 --m=4096 --k=4096 --n=128
diff --git a/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt b/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt
@@ -34,7 +34,9 @@ if (NOT CUTLASS_ENABLE_SYCL)
     )
 endif()
 
-cutlass_example_add_executable(
-  14_ampere_tf32_tensorop_gemm_cute
-  ampere_tf32_tensorop_gemm_cute.cu
-)
+if (NOT SYCL_INTEL_TARGET)
+  cutlass_example_add_executable(
+    14_ampere_tf32_tensorop_gemm_cute
+    ampere_tf32_tensorop_gemm_cute.cu
+  )
+endif()
diff --git a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cpp b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cpp
diff --git a/examples/35_gemm_softmax/CMakeLists.txt b/examples/35_gemm_softmax/CMakeLists.txt
@@ -29,8 +29,14 @@
 
 
 
+if (NOT CUTLASS_ENABLE_SYCL)
 cutlass_example_add_executable(
   35_gemm_softmax
   gemm_softmax.cu
   )
-
+else()
+cutlass_example_add_executable(
+  35_gemm_online_softmax
+  gemm_online_softmax.cpp
+  )
+endif()