Enable CUTE APIs (Copy, MMA etc.) for Intel GPU (PVC) (#131)

Add Cute components for Intel XE --------- Co-authored-by: jiyang1011 <[email protected]>
codeplaysoftware · Oct 22, 2024 · e0c3ceb · e0c3ceb
1 parent 321c531
commit e0c3ceb
Show file tree

Hide file tree

Showing 30 changed files with 8,433 additions and 894 deletions.
diff --git a/benchmarks/pvc/gemm_configuration.hpp b/benchmarks/pvc/gemm_configuration.hpp
@@ -76,12 +76,12 @@ struct Gemm_OperandB;
 
 template<>
 struct Gemm_OperandA<bfloat16_t, layout::RowMajor> {
-  using GmemTiledCopy = XE_2D_U16x8x16x4x2_LD_N;
+  using GmemTiledCopy = XE_2D_U16x8x16_LD_N;
 };
 
 template<>
 struct Gemm_OperandB<bfloat16_t, layout::RowMajor> {
-  using GmemTiledCopy = XE_2D_U16x16x16x2x2_V;
+  using GmemTiledCopy = XE_2D_U16x16x16_LD_V;
 };
 
 } // namespace details
@@ -93,12 +93,12 @@ struct GemmConfiguration<
       bfloat16_t, LayoutB,
       float, LayoutC,
       float> {
-  using TileShape = Shape<_256, _256, _32>;
+  using TileShape = Shape<_256, _256, _16>;
   using DispatchPolicy = MainloopIntelPVC<3>;;
   using TiledMma = TiledMMA<
     MMA_Atom<XE_8x16x16_F32BF16BF16F32_TT>,
-    Layout<Shape<_1,_1,_1>>,
-    Tile<_32,_64,_32>>;
+    Layout<Shape<_1,_8,_1>>,
+    Tile<_64,_128,_16>>;
 
   // A
   using OperandA = detail::Gemm_OperandA<bfloat16_t, LayoutA>;
@@ -132,9 +132,9 @@ struct GemmConfiguration<
         float,
         TagToStrideC_t<LayoutC>,
         FusionCallBacks,
-        XE_2D_U32x8x16x1x1_LD_N,
+        XE_2D_U32x8x16_LD_N,
         void, void,
-        XE_2D_U32x8x16x1x1_ST_N,
+        XE_2D_U32x8x16_ST_N,
         void, void>;
 
   using GemmKernel = kernel::GemmUniversal<

diff --git a/examples/sycl/pvc/pvc_gemm.cpp b/examples/sycl/pvc/pvc_gemm.cpp
@@ -306,15 +306,15 @@ int main(int argc, const char** argv)
   using LayoutC = cutlass::layout::RowMajor;
   using LayoutD = cutlass::layout::RowMajor;
 
-  using GmemTiledCopyA = XE_2D_U16x8x16x4x2_LD_N;
-  using GmemTiledCopyB = XE_2D_U16x16x16x2x2_V;
+  using GmemTiledCopyA = XE_2D_U16x8x16_LD_N;
+  using GmemTiledCopyB = XE_2D_U16x16x16_LD_V;
 
   // Workgroup-level tile
-  using TileShape = Shape<_256, _256, _32>;
+  using TileShape = Shape<_256, _128, _16>;
 
   using TiledMma = TiledMMA<MMA_Atom<XE_8x16x16_F32BF16BF16F32_TT>,
-          Layout<Shape<_1,_1,_1>>,
-          Tile<_32,_64,_32>>; // Subgroup level-tile
+          Layout<Shape<_8,_2,_1>>,
+          Tile<_64,_32,_16>>; // Subgroup level-tile
 
   constexpr int PipelineStages = 3;
   using GEMMDispatchPolicy = cutlass::gemm::MainloopIntelPVC<PipelineStages>;
@@ -333,9 +333,9 @@ int main(int argc, const char** argv)
           ElementOutput,
           cutlass::gemm::TagToStrideC_t<LayoutD>,
           FusionCallBacks,
-          XE_2D_U32x8x16x1x1_LD_N,
+          XE_2D_U32x8x16_LD_N,
           void, void,
-          XE_2D_U32x8x16x1x1_ST_N,
+          XE_2D_U32x8x16_ST_N,
           void, void>;
 
 // Mainloop