From 75174e952844d9fb8eaa5c60b4e7ebe982f9033f Mon Sep 17 00:00:00 2001 From: Jiaxingla Date: Thu, 18 Apr 2024 01:54:32 -0700 Subject: [PATCH] direct big tile, got 280Tflops --- build.sh | 5 +- examples/cute/tutorial/pvc_sycl/pvc_sycl.cpp | 101 +++++++++++------- .../tutorial/pvc_sycl/pvc_sycl_builtins.hpp | 10 +- include/cute/util/sycl_vec.hpp | 14 +-- 4 files changed, 80 insertions(+), 50 deletions(-) diff --git a/build.sh b/build.sh index f143f34165..971246a623 100644 --- a/build.sh +++ b/build.sh @@ -5,9 +5,8 @@ rm -rf $target export CPATH=$sycl_compiler_path:$sycl_compiler_path/include/:$sycl_compiler_path/include/sycl/:/opt/intel/oneapi/mkl/2024.1/include/ export LIBRARY_PATH=/opt/intel/oneapi/mkl/2024.1/lib/ export LD_LIBRARY_PATH=/opt/intel/oneapi/mkl/2024.1/lib/:${sycl_compiler_path}/lib/ -#export IGC_EnableVISANoSchedule=1 +export IGC_EnableVISANoSchedule=1 export IGC_ShaderDumpEnable=1 export IGC_DumpToCustomDir=./mm_dumps_prefetch_coop -#export IGC_VATemp=1 +export IGC_VATemp=1 cmake .. -G Ninja -DCMAKE_CUDA_HOST_COMPILER=${sycl_compiler_path}/bin/clang++ -DCMAKE_CUDA_COMPILER=$cuda_path/bin/nvcc -DCUTLASS_ENABLE_SYCL=ON -DDPCPP_SYCL_TARGET=intel_gpu_pvc -DCMAKE_CXX_COMPILER=${sycl_compiler_path}/bin/clang++ -DCMAKE_CXX_FLAGS=" -DITEM_SIZE_X=4 -DITEM_SIZE_Y=32 -DSG_SIZE_X=64 -DSG_SIZE_Y=ITEM_SIZE_Y -DWG_SIZE_X=256 -DWG_SIZE_Y=256 -DKK=2 -DPREFETCH_DEFAULT -lmkl_intel_lp64 -lmkl_sequential -lmkl_core" && ninja -v $target && ONEAPI_DEVICE_SELECTOR=level_zero:gpu $target - diff --git a/examples/cute/tutorial/pvc_sycl/pvc_sycl.cpp b/examples/cute/tutorial/pvc_sycl/pvc_sycl.cpp index 0321467e69..daf2975c00 100644 --- a/examples/cute/tutorial/pvc_sycl/pvc_sycl.cpp +++ b/examples/cute/tutorial/pvc_sycl/pvc_sycl.cpp @@ -30,17 +30,31 @@ using dtype_acc = float; bool identityData = false; bool fixedData = false; bool validate = true; -int testIterations = 1; +int testIterations = 10; dtype_acc threshold = 0.01f; size_t matrixSize = 4096; #define B_VNNI -#define WARMUP_ITERATIONS 10 +#define WARMUP_ITERATIONS 100 #if !defined(PREFETCH_DISTANCE) #define PREFETCH_DISTANCE 1 #endif +#ifdef __SYCL_DEVICE_ONLY__ +#define SYCL_DEVICE_BUILTIN(x) SYCL_EXTERNAL extern "C" x +#else +#define SYCL_DEVICE_BUILTIN(x) \ + inline x { assert(false); } +#endif + +SYCL_DEVICE_BUILTIN(ushort64 __builtin_IB_subgroup_block_read_flat_u16_m32k16v2( + long baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, int2_ coord)); +SYCL_DEVICE_BUILTIN(uint16 __builtin_IB_subgroup_block_read_flat_u32_m16k16v1( + long baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, int2_ coord)); + std::string makeTestName(const std::string &func, int tM, int tN, int tK, int MM, int NN, size_t M, size_t N, size_t K) { std::ostringstream ret; @@ -187,38 +201,15 @@ static void go_dpas_blockread_vnni_tiled(sycl::queue queue, dtype_acc *c_vec, auto B = b; auto C = c_vec; - Tensor tAr = make_tensor(Shape<_8, Int, Int>{}); - Tensor tBr = make_tensor(Shape<_8, Int, Int>{}); - Tensor tCr = - make_tensor(Shape<_8, Int, Int>{}); - - auto A_copy = make_xe_2d_copy( - make_tensor(make_gmem_ptr(A), make_shape(M, K))); - auto B_copy = make_xe_2d_copy( - make_tensor(make_gmem_ptr(B), make_shape(K, N))); - auto C_copy = make_xe_2d_copy( - make_tensor(make_gmem_ptr(C), make_shape(M, N))); - // TODO: - decide on how to deal with vector types - // - create layouts with tiling/partitioning - - Tensor tAi = make_tensor( - make_inttuple_iter(m, 0), - make_layout(make_shape(_1{}, Int{}, K), - make_stride(_1{}, tM * E<0>{}, E<1>{}))); - Tensor tBi = make_tensor( - make_inttuple_iter(0, n), - make_layout(make_shape(_1{}, K, Int{}), - make_stride(_1{}, E<0>{}, tN * E<1>{}))); - Tensor tCi = make_tensor( - make_inttuple_iter(m, n), - make_layout(Shape<_1, Int, Int>{}, - make_stride(_1{}, tM * E<0>{}, tN * E<1>{}))); - TiledMMA, - Layout>> - tiled_mma; - + float8 sum[NN][MM]; + for (int mm = 0; mm < MM; mm++) { + for (int nn = 0; nn < NN; nn++) { + sum[nn][mm] = 0; + } + } int prefetch_k = 0; #ifdef PREFETCH_DEFAULT + // if (k % ((PREFETCH_DISTANCE)*tK) == 0) { for (int p = 0; p < PREFETCH_DISTANCE; p++) { #ifdef B_VNNI HELPER_NAME(btile_block_prefetch_vnni, 4, 4) @@ -231,14 +222,34 @@ static void go_dpas_blockread_vnni_tiled(sycl::queue queue, dtype_acc *c_vec, ((ushort *)A, tM, M, K, m, prefetch_k); prefetch_k += tK * KK; } + // } #endif for (int k = 0; k < K; k += tK * KK) { - for (int kk = 0; kk < KK; kk++) { - copy(A_copy, tAi(_, _, k + kk * tK), tAr(_, _, kk)); - copy(B_copy, tBi(_, (k + kk * tK) / 2, _), tBr(_, _, kk)); + short8 aData[2][4]; + int8 bData[4][2]; + + ushort64 tmpA = + __builtin_IB_subgroup_block_read_flat_u16_m32k16v2( + (long)A, K * sizeof(ushort) - 1, M - 1, + K * sizeof(ushort) - 1, int2_{k, m}); + aData[0][0] = sycl::bit_cast(tmpA.lo.lo.lo); + aData[0][1] = sycl::bit_cast(tmpA.lo.lo.hi); + aData[0][2] = sycl::bit_cast(tmpA.lo.hi.lo); + aData[0][3] = sycl::bit_cast(tmpA.lo.hi.hi); + aData[1][0] = sycl::bit_cast(tmpA.hi.lo.lo); + aData[1][1] = sycl::bit_cast(tmpA.hi.lo.hi); + aData[1][2] = sycl::bit_cast(tmpA.hi.hi.lo); + aData[1][3] = sycl::bit_cast(tmpA.hi.hi.hi); + + for (int i = 0; i < NN; i++) { + uint16 tmpB = + __builtin_IB_subgroup_block_read_flat_u32_m16k16v1( + (long)B, N * sizeof(uint) - 1, K - 1, + N * sizeof(uint) - 1, int2_{n + i * tN, k / 2}); + bData[i][0] = sycl::bit_cast(tmpB.lo); + bData[i][1] = sycl::bit_cast(tmpB.hi); } - #ifdef PREFETCH_DEFAULT // if (k % ((PREFETCH_DISTANCE)*tK) == 0) { for (int p = 0; p < PREFETCH_DISTANCE; p++) { @@ -255,10 +266,24 @@ static void go_dpas_blockread_vnni_tiled(sycl::queue queue, dtype_acc *c_vec, } // } #endif - gemm(tiled_mma, tAr, tBr, tCr); + for (int kk = 0; kk < KK; kk++) { + for (int nn = 0; nn < NN; nn++) { + for (int mm = 0; mm < MM; mm++) { + sum[nn][mm] = intel_sub_group_bf16_bf16_matrix_mad_k16( + aData[kk][mm], bData[nn][kk], sum[nn][mm]); + } + } + } } - copy(C_copy, tCr, tCi); + for (int mm = 0; mm < MM; mm++) { + for (int nn = 0; nn < NN; nn++) { + __builtin_IB_subgroup_block_write_flat_u32_m8k16v1( + (long)C, N * sizeof(float) - 1, M - 1, + N * sizeof(float) - 1, int2_{n + nn * tN, m + mm * tM}, + sycl::bit_cast(sum[nn][mm])); + } + } }); }); diff --git a/examples/cute/tutorial/pvc_sycl/pvc_sycl_builtins.hpp b/examples/cute/tutorial/pvc_sycl/pvc_sycl_builtins.hpp index 0f50a4b7f9..5df3d35809 100644 --- a/examples/cute/tutorial/pvc_sycl/pvc_sycl_builtins.hpp +++ b/examples/cute/tutorial/pvc_sycl/pvc_sycl_builtins.hpp @@ -132,10 +132,16 @@ enum LSC_LDCC { }; typedef ushort __attribute__((ext_vector_type(32))) ushort32; -typedef ushort __attribute__((ext_vector_type(64))) ushort64; - typedef uint __attribute__((ext_vector_type(32))) uint32; +typedef ushort __attribute__((ext_vector_type(64))) ushort64; +typedef uint __attribute__((ext_vector_type(16))) uint16; +typedef uint __attribute__((ext_vector_type(8))) uint8; +typedef int __attribute__((ext_vector_type(8))) int8; +typedef ushort __attribute__((ext_vector_type(8))) ushort8; +typedef short __attribute__((ext_vector_type(8))) short8; +typedef float __attribute__((ext_vector_type(8))) float8; + #ifdef __SYCL_DEVICE_ONLY__ #define SYCL_DEVICE_BUILTIN(x) SYCL_EXTERNAL extern "C" x #else diff --git a/include/cute/util/sycl_vec.hpp b/include/cute/util/sycl_vec.hpp index 264c95f629..2f523db692 100644 --- a/include/cute/util/sycl_vec.hpp +++ b/include/cute/util/sycl_vec.hpp @@ -9,12 +9,12 @@ template using vector_t = typename sycl::vec::vector_t; template using vector_t = sycl::vec; #endif -using float8 = vector_t; -using short8 = vector_t; -using ushort8 = vector_t; +// using float8 = vector_t; +// using short8 = vector_t; +// using ushort8 = vector_t; using int2_ = vector_t; //conflicts with vector_types -using int8 = vector_t; -using uint8 = vector_t; -using ushort16 = vector_t; -using uint16 = vector_t; +// using int8 = vector_t; +// using uint8 = vector_t; +// using ushort16 = vector_t; +// using uint16 = vector_t;