From 75174e952844d9fb8eaa5c60b4e7ebe982f9033f Mon Sep 17 00:00:00 2001
From: Jiaxingla <jiaxing.wen@intel.com>
Date: Thu, 18 Apr 2024 01:54:32 -0700
Subject: [PATCH] direct big tile, got 280Tflops

---
 build.sh                                      |   5 +-
 examples/cute/tutorial/pvc_sycl/pvc_sycl.cpp  | 101 +++++++++++-------
 .../tutorial/pvc_sycl/pvc_sycl_builtins.hpp   |  10 +-
 include/cute/util/sycl_vec.hpp                |  14 +--
 4 files changed, 80 insertions(+), 50 deletions(-)
diff --git a/build.sh b/build.sh
index f143f34165..971246a623 100644
--- a/build.sh
+++ b/build.sh
@@ -5,9 +5,8 @@ rm -rf $target
 export CPATH=$sycl_compiler_path:$sycl_compiler_path/include/:$sycl_compiler_path/include/sycl/:/opt/intel/oneapi/mkl/2024.1/include/
 export LIBRARY_PATH=/opt/intel/oneapi/mkl/2024.1/lib/
 export LD_LIBRARY_PATH=/opt/intel/oneapi/mkl/2024.1/lib/:${sycl_compiler_path}/lib/
-#export IGC_EnableVISANoSchedule=1
+export IGC_EnableVISANoSchedule=1
 export IGC_ShaderDumpEnable=1
 export IGC_DumpToCustomDir=./mm_dumps_prefetch_coop
-#export IGC_VATemp=1
+export IGC_VATemp=1
 cmake .. -G Ninja -DCMAKE_CUDA_HOST_COMPILER=${sycl_compiler_path}/bin/clang++ -DCMAKE_CUDA_COMPILER=$cuda_path/bin/nvcc -DCUTLASS_ENABLE_SYCL=ON -DDPCPP_SYCL_TARGET=intel_gpu_pvc -DCMAKE_CXX_COMPILER=${sycl_compiler_path}/bin/clang++ -DCMAKE_CXX_FLAGS=" -DITEM_SIZE_X=4 -DITEM_SIZE_Y=32 -DSG_SIZE_X=64 -DSG_SIZE_Y=ITEM_SIZE_Y -DWG_SIZE_X=256 -DWG_SIZE_Y=256 -DKK=2 -DPREFETCH_DEFAULT -lmkl_intel_lp64 -lmkl_sequential -lmkl_core" && ninja -v $target && ONEAPI_DEVICE_SELECTOR=level_zero:gpu $target
-
diff --git a/examples/cute/tutorial/pvc_sycl/pvc_sycl.cpp b/examples/cute/tutorial/pvc_sycl/pvc_sycl.cpp
index 0321467e69..daf2975c00 100644
--- a/examples/cute/tutorial/pvc_sycl/pvc_sycl.cpp
+++ b/examples/cute/tutorial/pvc_sycl/pvc_sycl.cpp
@@ -30,17 +30,31 @@ using dtype_acc = float;
 bool identityData = false;
 bool fixedData = false;
 bool validate = true;
-int testIterations = 1;
+int testIterations = 10;
 dtype_acc threshold = 0.01f;
 size_t matrixSize = 4096;
 #define B_VNNI
 
-#define WARMUP_ITERATIONS 10
+#define WARMUP_ITERATIONS 100
 
 #if !defined(PREFETCH_DISTANCE)
 #define PREFETCH_DISTANCE 1
 #endif
 
+#ifdef __SYCL_DEVICE_ONLY__
+#define SYCL_DEVICE_BUILTIN(x) SYCL_EXTERNAL extern "C" x
+#else
+#define SYCL_DEVICE_BUILTIN(x)                                                 \
+  inline x { assert(false); }
+#endif
+
+SYCL_DEVICE_BUILTIN(ushort64 __builtin_IB_subgroup_block_read_flat_u16_m32k16v2(
+    long baseoffset, int width_minus_one, int height_minus_one,
+    int pitch_minus_one, int2_ coord));
+SYCL_DEVICE_BUILTIN(uint16 __builtin_IB_subgroup_block_read_flat_u32_m16k16v1(
+    long baseoffset, int width_minus_one, int height_minus_one,
+    int pitch_minus_one, int2_ coord));
+
 std::string makeTestName(const std::string &func, int tM, int tN, int tK,
                          int MM, int NN, size_t M, size_t N, size_t K) {
   std::ostringstream ret;
@@ -187,38 +201,15 @@ static void go_dpas_blockread_vnni_tiled(sycl::queue queue, dtype_acc *c_vec,
               auto B = b;
               auto C = c_vec;
 
-              Tensor tAr = make_tensor<ushort>(Shape<_8, Int<MM>, Int<KK>>{});
-              Tensor tBr = make_tensor<uint>(Shape<_8, Int<NN>, Int<KK>>{});
-              Tensor tCr =
-                  make_tensor<dtype_acc>(Shape<_8, Int<MM>, Int<NN>>{});
-
-              auto A_copy = make_xe_2d_copy<XE_2D_LOAD>(
-                  make_tensor(make_gmem_ptr(A), make_shape(M, K)));
-              auto B_copy = make_xe_2d_copy<XE_2D_LOAD>(
-                  make_tensor(make_gmem_ptr(B), make_shape(K, N)));
-              auto C_copy = make_xe_2d_copy<XE_2D_SAVE>(
-                  make_tensor(make_gmem_ptr(C), make_shape(M, N)));
-              // TODO: - decide on how to deal with vector types
-              //       - create layouts with tiling/partitioning
-
-              Tensor tAi = make_tensor(
-                  make_inttuple_iter(m, 0),
-                  make_layout(make_shape(_1{}, Int<MM>{}, K),
-                              make_stride(_1{}, tM * E<0>{}, E<1>{})));
-              Tensor tBi = make_tensor(
-                  make_inttuple_iter(0, n),
-                  make_layout(make_shape(_1{}, K, Int<NN>{}),
-                              make_stride(_1{}, E<0>{}, tN * E<1>{})));
-              Tensor tCi = make_tensor(
-                  make_inttuple_iter(m, n),
-                  make_layout(Shape<_1, Int<MM>, Int<NN>>{},
-                              make_stride(_1{}, tM * E<0>{}, tN * E<1>{})));
-              TiledMMA<MMA_Atom<XE_8x16x16_BF16BF16F32F32_NN>,
-                       Layout<Shape<_1, _1, _1>>>
-                  tiled_mma;
-
+              float8 sum[NN][MM];
+              for (int mm = 0; mm < MM; mm++) {
+                for (int nn = 0; nn < NN; nn++) {
+                  sum[nn][mm] = 0;
+                }
+              }
               int prefetch_k = 0;
 #ifdef PREFETCH_DEFAULT
+              //   if (k % ((PREFETCH_DISTANCE)*tK) == 0) {
               for (int p = 0; p < PREFETCH_DISTANCE; p++) {
 #ifdef B_VNNI
                 HELPER_NAME(btile_block_prefetch_vnni, 4, 4)
@@ -231,14 +222,34 @@ static void go_dpas_blockread_vnni_tiled(sycl::queue queue, dtype_acc *c_vec,
                 ((ushort *)A, tM, M, K, m, prefetch_k);
                 prefetch_k += tK * KK;
               }
+          //  }
 #endif
 
               for (int k = 0; k < K; k += tK * KK) {
-                for (int kk = 0; kk < KK; kk++) {
-                  copy(A_copy, tAi(_, _, k + kk * tK), tAr(_, _, kk));
-                  copy(B_copy, tBi(_, (k + kk * tK) / 2, _), tBr(_, _, kk));
+                short8 aData[2][4];
+                int8 bData[4][2];
+
+                ushort64 tmpA =
+                    __builtin_IB_subgroup_block_read_flat_u16_m32k16v2(
+                        (long)A, K * sizeof(ushort) - 1, M - 1,
+                        K * sizeof(ushort) - 1, int2_{k, m});
+                aData[0][0] = sycl::bit_cast<short8>(tmpA.lo.lo.lo);
+                aData[0][1] = sycl::bit_cast<short8>(tmpA.lo.lo.hi);
+                aData[0][2] = sycl::bit_cast<short8>(tmpA.lo.hi.lo);
+                aData[0][3] = sycl::bit_cast<short8>(tmpA.lo.hi.hi);
+                aData[1][0] = sycl::bit_cast<short8>(tmpA.hi.lo.lo);
+                aData[1][1] = sycl::bit_cast<short8>(tmpA.hi.lo.hi);
+                aData[1][2] = sycl::bit_cast<short8>(tmpA.hi.hi.lo);
+                aData[1][3] = sycl::bit_cast<short8>(tmpA.hi.hi.hi);
+
+                for (int i = 0; i < NN; i++) {
+                  uint16 tmpB =
+                      __builtin_IB_subgroup_block_read_flat_u32_m16k16v1(
+                          (long)B, N * sizeof(uint) - 1, K - 1,
+                          N * sizeof(uint) - 1, int2_{n + i * tN, k / 2});
+                  bData[i][0] = sycl::bit_cast<int8>(tmpB.lo);
+                  bData[i][1] = sycl::bit_cast<int8>(tmpB.hi);
                 }
-
 #ifdef PREFETCH_DEFAULT
                 //   if (k % ((PREFETCH_DISTANCE)*tK) == 0) {
                 for (int p = 0; p < PREFETCH_DISTANCE; p++) {
@@ -255,10 +266,24 @@ static void go_dpas_blockread_vnni_tiled(sycl::queue queue, dtype_acc *c_vec,
                 }
             //  }
 #endif
-                gemm(tiled_mma, tAr, tBr, tCr);
+                for (int kk = 0; kk < KK; kk++) {
+                  for (int nn = 0; nn < NN; nn++) {
+                    for (int mm = 0; mm < MM; mm++) {
+                      sum[nn][mm] = intel_sub_group_bf16_bf16_matrix_mad_k16(
+                          aData[kk][mm], bData[nn][kk], sum[nn][mm]);
+                    }
+                  }
+                }
               }
 
-              copy(C_copy, tCr, tCi);
+              for (int mm = 0; mm < MM; mm++) {
+                for (int nn = 0; nn < NN; nn++) {
+                  __builtin_IB_subgroup_block_write_flat_u32_m8k16v1(
+                      (long)C, N * sizeof(float) - 1, M - 1,
+                      N * sizeof(float) - 1, int2_{n + nn * tN, m + mm * tM},
+                      sycl::bit_cast<uint8>(sum[nn][mm]));
+                }
+              }
             });
       });
 
diff --git a/examples/cute/tutorial/pvc_sycl/pvc_sycl_builtins.hpp b/examples/cute/tutorial/pvc_sycl/pvc_sycl_builtins.hpp
index 0f50a4b7f9..5df3d35809 100644
--- a/examples/cute/tutorial/pvc_sycl/pvc_sycl_builtins.hpp
+++ b/examples/cute/tutorial/pvc_sycl/pvc_sycl_builtins.hpp
@@ -132,10 +132,16 @@ enum LSC_LDCC {
 };
 
 typedef ushort __attribute__((ext_vector_type(32))) ushort32;
-typedef ushort __attribute__((ext_vector_type(64))) ushort64;
-
 typedef uint __attribute__((ext_vector_type(32))) uint32;
 
+typedef ushort __attribute__((ext_vector_type(64))) ushort64;
+typedef uint __attribute__((ext_vector_type(16))) uint16;
+typedef uint __attribute__((ext_vector_type(8))) uint8;
+typedef int __attribute__((ext_vector_type(8))) int8;
+typedef ushort __attribute__((ext_vector_type(8))) ushort8;
+typedef short __attribute__((ext_vector_type(8))) short8;
+typedef float __attribute__((ext_vector_type(8))) float8;
+
 #ifdef __SYCL_DEVICE_ONLY__
 #define SYCL_DEVICE_BUILTIN(x) SYCL_EXTERNAL extern "C" x
 #else
diff --git a/include/cute/util/sycl_vec.hpp b/include/cute/util/sycl_vec.hpp
index 264c95f629..2f523db692 100644
--- a/include/cute/util/sycl_vec.hpp
+++ b/include/cute/util/sycl_vec.hpp
@@ -9,12 +9,12 @@ template<class T, int N> using vector_t = typename sycl::vec<T,N>::vector_t;
 template<class T, int N> using vector_t = sycl::vec<T,N>;
 #endif
 
-using float8 = vector_t<float, 8>;
-using short8 = vector_t<short, 8>;
-using ushort8 = vector_t<ushort, 8>;
+// using float8 = vector_t<float, 8>;
+// using short8 = vector_t<short, 8>;
+// using ushort8 = vector_t<ushort, 8>;
 using int2_ = vector_t<int, 2>; //conflicts with vector_types
-using int8 = vector_t<int, 8>;
-using uint8 = vector_t<uint, 8>;
-using ushort16 = vector_t<ushort, 16>;
-using uint16 = vector_t<uint, 16>;
+// using int8 = vector_t<int, 8>;
+// using uint8 = vector_t<uint, 8>;
+// using ushort16 = vector_t<ushort, 16>;
+// using uint16 = vector_t<uint, 16>;