Merge pull request #6 from taozha2/zt/jiaxing
direct big tile, got 280 TFLOPS
jiyang1011 authored Apr 18, 2024
2 parents 82bc596 + 75174e9 commit f33c723
Showing 4 changed files with 80 additions and 50 deletions.
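
As a quick sanity check on the quoted throughput (my own arithmetic, assuming the 280 TFLOPS figure refers to the example's default matrixSize = 4096 bf16 GEMM):

    // Back-of-envelope check -- not part of the commit; assumes M = N = K = 4096.
    #include <cstdio>
    int main() {
      const double flop = 2.0 * 4096 * 4096 * 4096; // ~1.374e11 FLOP per GEMM
      const double seconds = flop / 280.0e12;       // at 280 TFLOP/s
      std::printf("%.3f ms per GEMM\n", seconds * 1e3); // prints ~0.491 ms
    }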
5 changes: 2 additions & 3 deletions build.sh
@@ -5,9 +5,8 @@ rm -rf $target
 export CPATH=$sycl_compiler_path:$sycl_compiler_path/include/:$sycl_compiler_path/include/sycl/:/opt/intel/oneapi/mkl/2024.1/include/
 export LIBRARY_PATH=/opt/intel/oneapi/mkl/2024.1/lib/
 export LD_LIBRARY_PATH=/opt/intel/oneapi/mkl/2024.1/lib/:${sycl_compiler_path}/lib/
-#export IGC_EnableVISANoSchedule=1
+export IGC_EnableVISANoSchedule=1
 export IGC_ShaderDumpEnable=1
 export IGC_DumpToCustomDir=./mm_dumps_prefetch_coop
-#export IGC_VATemp=1
+export IGC_VATemp=1
 cmake .. -G Ninja -DCMAKE_CUDA_HOST_COMPILER=${sycl_compiler_path}/bin/clang++ -DCMAKE_CUDA_COMPILER=$cuda_path/bin/nvcc -DCUTLASS_ENABLE_SYCL=ON -DDPCPP_SYCL_TARGET=intel_gpu_pvc -DCMAKE_CXX_COMPILER=${sycl_compiler_path}/bin/clang++ -DCMAKE_CXX_FLAGS=" -DITEM_SIZE_X=4 -DITEM_SIZE_Y=32 -DSG_SIZE_X=64 -DSG_SIZE_Y=ITEM_SIZE_Y -DWG_SIZE_X=256 -DWG_SIZE_Y=256 -DKK=2 -DPREFETCH_DEFAULT -lmkl_intel_lp64 -lmkl_sequential -lmkl_core" && ninja -v $target && ONEAPI_DEVICE_SELECTOR=level_zero:gpu $target
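
The -D flags above carry the new "big tile" shape. A minimal sketch of my reading of those names (they are not documented in this commit, so treat the interpretation as an assumption):

    // Assumed meaning of the tile-size flags -- illustrative only.
    constexpr int WG_X = 256, WG_Y = 256; // C tile per work-group (WG_SIZE_X/Y)
    constexpr int SG_X = 64,  SG_Y = 32;  // C tile per sub-group (SG_SIZE_X/Y)
    constexpr int KK   = 2;               // k-slices unrolled per main-loop step
    static_assert(WG_X % SG_X == 0 && WG_Y % SG_Y == 0,
                  "sub-group tiles must divide the work-group tile");
    constexpr int sg_per_wg = (WG_X / SG_X) * (WG_Y / SG_Y); // 4 * 8 = 32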

101 changes: 63 additions & 38 deletions examples/cute/tutorial/pvc_sycl/pvc_sycl.cpp
@@ -30,17 +30,31 @@ using dtype_acc = float;
 bool identityData = false;
 bool fixedData = false;
 bool validate = true;
-int testIterations = 1;
+int testIterations = 10;
 dtype_acc threshold = 0.01f;
 size_t matrixSize = 4096;
 #define B_VNNI

-#define WARMUP_ITERATIONS 10
+#define WARMUP_ITERATIONS 100

 #if !defined(PREFETCH_DISTANCE)
 #define PREFETCH_DISTANCE 1
 #endif

+#ifdef __SYCL_DEVICE_ONLY__
+#define SYCL_DEVICE_BUILTIN(x) SYCL_EXTERNAL extern "C" x
+#else
+#define SYCL_DEVICE_BUILTIN(x) \
+  inline x { assert(false); }
+#endif
+
+SYCL_DEVICE_BUILTIN(ushort64 __builtin_IB_subgroup_block_read_flat_u16_m32k16v2(
+    long baseoffset, int width_minus_one, int height_minus_one,
+    int pitch_minus_one, int2_ coord));
+SYCL_DEVICE_BUILTIN(uint16 __builtin_IB_subgroup_block_read_flat_u32_m16k16v1(
+    long baseoffset, int width_minus_one, int height_minus_one,
+    int pitch_minus_one, int2_ coord));

 std::string makeTestName(const std::string &func, int tM, int tN, int tK,
                          int MM, int NN, size_t M, size_t N, size_t K) {
   std::ostringstream ret;
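
The two SYCL_DEVICE_BUILTIN declarations added above are the raw IGC 2D block-load intrinsics that replace the CuTe copy atoms removed below. Their parameter convention, inferred from the call sites later in this diff rather than from IGC documentation, appears to be: flat base address; surface width and pitch, each in bytes minus one; surface height in rows minus one; and an (x, y) block origin in elements. A hedged sketch of a call, mirroring the kernel below:

    // m32k16v2 reads a 32-row x 16-column block of u16, with v2 = two
    // adjacent blocks along x; the 1024 elements land spread across the
    // sub-group, 64 per work-item (hence the ushort64 return type).
    ushort64 a_blk = __builtin_IB_subgroup_block_read_flat_u16_m32k16v2(
        (long)A,                  // base address of the bf16 A matrix
        K * sizeof(ushort) - 1,   // row width in bytes, minus one
        M - 1,                    // height in rows, minus one
        K * sizeof(ushort) - 1,   // pitch in bytes, minus one (dense rows)
        int2_{k, m});             // block origin: element column k, row m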
@@ -187,38 +201,15 @@ static void go_dpas_blockread_vnni_tiled(sycl::queue queue, dtype_acc *c_vec,
       auto B = b;
       auto C = c_vec;

-      Tensor tAr = make_tensor<ushort>(Shape<_8, Int<MM>, Int<KK>>{});
-      Tensor tBr = make_tensor<uint>(Shape<_8, Int<NN>, Int<KK>>{});
-      Tensor tCr = make_tensor<dtype_acc>(Shape<_8, Int<MM>, Int<NN>>{});
-
-      auto A_copy = make_xe_2d_copy<XE_2D_LOAD>(
-          make_tensor(make_gmem_ptr(A), make_shape(M, K)));
-      auto B_copy = make_xe_2d_copy<XE_2D_LOAD>(
-          make_tensor(make_gmem_ptr(B), make_shape(K, N)));
-      auto C_copy = make_xe_2d_copy<XE_2D_SAVE>(
-          make_tensor(make_gmem_ptr(C), make_shape(M, N)));
-      // TODO: - decide on how to deal with vector types
-      //       - create layouts with tiling/partitioning
-
-      Tensor tAi = make_tensor(
-          make_inttuple_iter(m, 0),
-          make_layout(make_shape(_1{}, Int<MM>{}, K),
-                      make_stride(_1{}, tM * E<0>{}, E<1>{})));
-      Tensor tBi = make_tensor(
-          make_inttuple_iter(0, n),
-          make_layout(make_shape(_1{}, K, Int<NN>{}),
-                      make_stride(_1{}, E<0>{}, tN * E<1>{})));
-      Tensor tCi = make_tensor(
-          make_inttuple_iter(m, n),
-          make_layout(Shape<_1, Int<MM>, Int<NN>>{},
-                      make_stride(_1{}, tM * E<0>{}, tN * E<1>{})));
-      TiledMMA<MMA_Atom<XE_8x16x16_BF16BF16F32F32_NN>,
-               Layout<Shape<_1, _1, _1>>>
-          tiled_mma;
-
+      float8 sum[NN][MM];
+      for (int mm = 0; mm < MM; mm++) {
+        for (int nn = 0; nn < NN; nn++) {
+          sum[nn][mm] = 0;
+        }
+      }
       int prefetch_k = 0;
 #ifdef PREFETCH_DEFAULT
       // if (k % ((PREFETCH_DISTANCE)*tK) == 0) {
       for (int p = 0; p < PREFETCH_DISTANCE; p++) {
 #ifdef B_VNNI
         HELPER_NAME(btile_block_prefetch_vnni, 4, 4)
@@ -231,14 +222,34 @@ static void go_dpas_blockread_vnni_tiled(sycl::queue queue, dtype_acc *c_vec,
             ((ushort *)A, tM, M, K, m, prefetch_k);
         prefetch_k += tK * KK;
       }
       // }
 #endif

       for (int k = 0; k < K; k += tK * KK) {
-        for (int kk = 0; kk < KK; kk++) {
-          copy(A_copy, tAi(_, _, k + kk * tK), tAr(_, _, kk));
-          copy(B_copy, tBi(_, (k + kk * tK) / 2, _), tBr(_, _, kk));
+        short8 aData[2][4];
+        int8 bData[4][2];
+
+        ushort64 tmpA = __builtin_IB_subgroup_block_read_flat_u16_m32k16v2(
+            (long)A, K * sizeof(ushort) - 1, M - 1,
+            K * sizeof(ushort) - 1, int2_{k, m});
+        aData[0][0] = sycl::bit_cast<short8>(tmpA.lo.lo.lo);
+        aData[0][1] = sycl::bit_cast<short8>(tmpA.lo.lo.hi);
+        aData[0][2] = sycl::bit_cast<short8>(tmpA.lo.hi.lo);
+        aData[0][3] = sycl::bit_cast<short8>(tmpA.lo.hi.hi);
+        aData[1][0] = sycl::bit_cast<short8>(tmpA.hi.lo.lo);
+        aData[1][1] = sycl::bit_cast<short8>(tmpA.hi.lo.hi);
+        aData[1][2] = sycl::bit_cast<short8>(tmpA.hi.hi.lo);
+        aData[1][3] = sycl::bit_cast<short8>(tmpA.hi.hi.hi);
+
+        for (int i = 0; i < NN; i++) {
+          uint16 tmpB = __builtin_IB_subgroup_block_read_flat_u32_m16k16v1(
+              (long)B, N * sizeof(uint) - 1, K - 1,
+              N * sizeof(uint) - 1, int2_{n + i * tN, k / 2});
+          bData[i][0] = sycl::bit_cast<int8>(tmpB.lo);
+          bData[i][1] = sycl::bit_cast<int8>(tmpB.hi);
+        }
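
My reading of the operand unpacking above (an assumption based on the swizzles and the dpas indexing, not on documentation):

    // aData: tmpA.lo / tmpA.hi select the two 16-column k-slices (kk = 0, 1,
    // the v2 pair); each further .lo/.hi level halves the rows, yielding four
    // 8-row sub-tiles (mm = 0..3), so aData[kk][mm] is one 8x16 bf16 A operand.
    // bData: B is VNNI-packed -- each u32 holds two bf16 values from adjacent
    // k rows -- hence the u32-typed load and the y coordinate k / 2;
    // bData[nn][kk] is one packed 16x16 B operand.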

 #ifdef PREFETCH_DEFAULT
         // if (k % ((PREFETCH_DISTANCE)*tK) == 0) {
         for (int p = 0; p < PREFETCH_DISTANCE; p++) {
@@ -255,10 +266,24 @@ static void go_dpas_blockread_vnni_tiled(sycl::queue queue, dtype_acc *c_vec,
         }
         // }
 #endif
-        gemm(tiled_mma, tAr, tBr, tCr);
+        for (int kk = 0; kk < KK; kk++) {
+          for (int nn = 0; nn < NN; nn++) {
+            for (int mm = 0; mm < MM; mm++) {
+              sum[nn][mm] = intel_sub_group_bf16_bf16_matrix_mad_k16(
+                  aData[kk][mm], bData[nn][kk], sum[nn][mm]);
+            }
+          }
+        }
       }
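
For reference, a hedged summary of what the triple loop above computes per sub-group (consistent with the XE_8x16x16_BF16BF16F32F32 atom the removed CuTe path used, but treat the exact shapes as my inference):

    // Each intel_sub_group_bf16_bf16_matrix_mad_k16 call is one dpas op:
    //   C(8x16, f32) += A(8x16, bf16) * B(16x16, bf16, VNNI-packed)
    // KK * NN * MM = 2 * 4 * 4 = 32 dpas ops per k-step, accumulating a
    // (MM * 8) x (NN * 16) = 32 x 64 C tile held in sum[NN][MM].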

-      copy(C_copy, tCr, tCi);
+      for (int mm = 0; mm < MM; mm++) {
+        for (int nn = 0; nn < NN; nn++) {
+          __builtin_IB_subgroup_block_write_flat_u32_m8k16v1(
+              (long)C, N * sizeof(float) - 1, M - 1,
+              N * sizeof(float) - 1, int2_{n + nn * tN, m + mm * tM},
+              sycl::bit_cast<uint8>(sum[nn][mm]));
+        }
+      }
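
The epilogue likewise swaps the CuTe copy for direct 2D block writes, one per 8x16 f32 accumulator tile, reusing the width/height/pitch-minus-one convention of the loads (again my inference from the call sites):

    // MM * NN = 16 block writes per sub-group cover its 32 x 64 C tile;
    // int2_{n + nn * tN, m + mm * tM} is the (x, y) element origin of the
    // 8-row x 16-column tile being stored.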
     });
   });

10 changes: 8 additions & 2 deletions examples/cute/tutorial/pvc_sycl/pvc_sycl_builtins.hpp
@@ -132,10 +132,16 @@ enum LSC_LDCC {
 };

 typedef ushort __attribute__((ext_vector_type(32))) ushort32;
-typedef ushort __attribute__((ext_vector_type(64))) ushort64;

 typedef uint __attribute__((ext_vector_type(32))) uint32;

+typedef ushort __attribute__((ext_vector_type(64))) ushort64;
+typedef uint __attribute__((ext_vector_type(16))) uint16;
+typedef uint __attribute__((ext_vector_type(8))) uint8;
+typedef int __attribute__((ext_vector_type(8))) int8;
+typedef ushort __attribute__((ext_vector_type(8))) ushort8;
+typedef short __attribute__((ext_vector_type(8))) short8;
+typedef float __attribute__((ext_vector_type(8))) float8;
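
These clang ext_vector_type typedefs are what give the kernel its OpenCL-style .lo/.hi swizzles, and sycl::bit_cast reinterprets between same-sized vectors. A minimal usage sketch (illustrative only, not from this commit):

    ushort64 v{};                                  // e.g. one block-load result
    ushort8 lane_slice = v.lo.lo.lo;               // halve three times: 64 -> 8
    short8 a = sycl::bit_cast<short8>(lane_slice); // reinterpret for dpas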

 #ifdef __SYCL_DEVICE_ONLY__
 #define SYCL_DEVICE_BUILTIN(x) SYCL_EXTERNAL extern "C" x
 #else
14 changes: 7 additions & 7 deletions include/cute/util/sycl_vec.hpp
@@ -9,12 +9,12 @@ template<class T, int N> using vector_t = typename sycl::vec<T,N>::vector_t;
 template<class T, int N> using vector_t = sycl::vec<T,N>;
 #endif

-using float8 = vector_t<float, 8>;
-using short8 = vector_t<short, 8>;
-using ushort8 = vector_t<ushort, 8>;
+// using float8 = vector_t<float, 8>;
+// using short8 = vector_t<short, 8>;
+// using ushort8 = vector_t<ushort, 8>;
 using int2_ = vector_t<int, 2>; // conflicts with vector_types
-using int8 = vector_t<int, 8>;
-using uint8 = vector_t<uint, 8>;
-using ushort16 = vector_t<ushort, 16>;
-using uint16 = vector_t<uint, 16>;
+// using int8 = vector_t<int, 8>;
+// using uint8 = vector_t<uint, 8>;
+// using ushort16 = vector_t<ushort, 16>;
+// using uint16 = vector_t<uint, 16>;
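
Commenting these aliases out, rather than deleting them, presumably avoids a clash with the raw typedefs now defined in pvc_sycl_builtins.hpp. Note the host/device split at the top of this file:

    // Device pass: vector_t<T, N> == sycl::vec<T, N>::vector_t -- a raw clang
    // vector with .lo/.hi swizzles, as the kernel's unpacking requires.
    // Host pass:   vector_t<T, N> == sycl::vec<T, N> -- no such swizzles.

The raw ext_vector_type typedefs behave the same way in both compilation passes; int2_ stays live because the block-read builtin signatures still take it.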
