Merge branch 'intel_gpu_backend_enabling' into add_pvc_example
taozha2 authored Apr 17, 2024
2 parents 352a581 + 26af71f commit 4b4fa9a
Showing 3 changed files with 119 additions and 51 deletions.
2 changes: 2 additions & 0 deletions include/cute/atom/copy_atom.hpp
@@ -769,6 +769,8 @@ print_latex_copy(LayoutS const& S, ThrIDS const& TS, // (m,n) -> (tid,vid) and
#include <cute/atom/copy_traits_sm90_tma.hpp>
#endif

+#if defined(CUTLASS_ENABLE_SYCL)
 #include <cute/atom/copy_traits_xe.hpp>
+#endif

////////////////////////////////////////////////////////////////////////////////////////////////////
139 changes: 94 additions & 45 deletions include/cutlass/cutlass.h
@@ -115,205 +115,254 @@ static const int NumThreadsPerQuadPair = NumThreadsPerQuad * 2;
////////////////////////////////////////////////////////////////////////////////////////////////////

 CUTLASS_HOST_DEVICE uint ThreadIdxX() {
-#if defined(CUTLASS_ENABLE_SYCL)
+#if defined(__CUDA_ARCH__)
+  return threadIdx.x;
+#elif defined(__SYCL_DEVICE_ONLY__)
   return syclcompat::local_id::x();
 #else
-  return threadIdx.x;
+  return 0;
 #endif
 }

 CUTLASS_HOST_DEVICE uint ThreadIdxY() {
-#if defined(CUTLASS_ENABLE_SYCL)
+#if defined(__CUDA_ARCH__)
+  return threadIdx.y;
+#elif defined(__SYCL_DEVICE_ONLY__)
   return syclcompat::local_id::y();
 #else
-  return threadIdx.y;
+  return 0;
 #endif
 }

 CUTLASS_HOST_DEVICE uint ThreadIdxZ() {
-#if defined(CUTLASS_ENABLE_SYCL)
+#if defined(__CUDA_ARCH__)
+  return threadIdx.z;
+#elif defined(__SYCL_DEVICE_ONLY__)
   return syclcompat::local_id::z();
 #else
-  return threadIdx.z;
+  return 0;
 #endif
 }

 CUTLASS_HOST_DEVICE uint BlockIdxX() {
-#if defined(CUTLASS_ENABLE_SYCL)
+#if defined(__CUDA_ARCH__)
+  return blockIdx.x;
+#elif defined(__SYCL_DEVICE_ONLY__)
   return syclcompat::work_group_id::x();
 #else
-  return blockIdx.x;
+  return 0;
 #endif
 }

 CUTLASS_HOST_DEVICE uint BlockIdxY() {
-#if defined(CUTLASS_ENABLE_SYCL)
+#if defined(__CUDA_ARCH__)
+  return blockIdx.y;
+#elif defined(__SYCL_DEVICE_ONLY__)
   return syclcompat::work_group_id::y();
 #else
-  return blockIdx.y;
+  return 0;
 #endif
 }

 CUTLASS_HOST_DEVICE uint BlockIdxZ() {
-#if defined(CUTLASS_ENABLE_SYCL)
+#if defined(__CUDA_ARCH__)
+  return blockIdx.z;
+#elif defined(__SYCL_DEVICE_ONLY__)
   return syclcompat::work_group_id::z();
 #else
-  return blockIdx.z;
+  return 0;
 #endif
 }

 CUTLASS_HOST_DEVICE uint BlockDimX() {
-#if defined(CUTLASS_ENABLE_SYCL)
+#if defined(__CUDA_ARCH__)
+  return blockDim.x;
+#elif defined(__SYCL_DEVICE_ONLY__)
   return syclcompat::work_group_range::x();
 #else
-  return blockDim.x;
+  return 0;
 #endif
 }

 CUTLASS_HOST_DEVICE uint BlockDimY() {
-#if defined(CUTLASS_ENABLE_SYCL)
+#if defined(__CUDA_ARCH__)
+  return blockDim.y;
+#elif defined(__SYCL_DEVICE_ONLY__)
   return syclcompat::work_group_range::y();
 #else
-  return blockDim.y;
+  return 0;
 #endif
 }

 CUTLASS_HOST_DEVICE uint BlockDimZ() {
-#if defined(CUTLASS_ENABLE_SYCL)
+#if defined(__CUDA_ARCH__)
+  return blockDim.z;
+#elif defined(__SYCL_DEVICE_ONLY__)
   return syclcompat::work_group_range::z();
 #else
-  return blockDim.z;
+  return 0;
 #endif
 }

 CUTLASS_HOST_DEVICE uint GridDimX() {
-#if defined(CUTLASS_ENABLE_SYCL)
+#if defined(__CUDA_ARCH__)
+  return gridDim.x;
+#elif defined(__SYCL_DEVICE_ONLY__)
   return syclcompat::global_range::x();
 #else
-  return gridDim.x;
+  return 0;
 #endif
 }

 CUTLASS_HOST_DEVICE uint GridDimY() {
-#if defined(CUTLASS_ENABLE_SYCL)
+#if defined(__CUDA_ARCH__)
+  return gridDim.y;
+#elif defined(__SYCL_DEVICE_ONLY__)
   return syclcompat::global_range::y();
 #else
-  return gridDim.y;
+  return 0;
 #endif
 }

 CUTLASS_HOST_DEVICE uint GridDimZ() {
-#if defined(CUTLASS_ENABLE_SYCL)
+#if defined(__CUDA_ARCH__)
+  return gridDim.z;
+#elif defined(__SYCL_DEVICE_ONLY__)
   return syclcompat::global_range::z();
 #else
-  return gridDim.z;
+  return 0;
 #endif
 }
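
The twelve wrappers above exist so that kernel code can ask for thread and block coordinates without naming the CUDA built-ins directly. A minimal usage sketch, not part of this commit, of a helper built purely on these wrappers (the name GlobalThreadIdxX is my own):

CUTLASS_HOST_DEVICE uint GlobalThreadIdxX() {
  // Linear x-coordinate of the calling thread across the whole launch,
  // assuming the usual CUDA meaning of block index, block size and thread index.
  return BlockIdxX() * BlockDimX() + ThreadIdxX();
}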

// syncthreads

 CUTLASS_DEVICE void syncthreads() {
-#if defined(CUTLASS_ENABLE_SYCL)
+#if defined(__CUDA_ARCH__)
+  __syncthreads();
+#elif defined(__SYCL_DEVICE_ONLY__)
   syclcompat::wg_barrier();
-#else
-  __syncthreads();
 #endif
 }

 CUTLASS_DEVICE int syncthreads_and(int cond) {
-#if defined(CUTLASS_ENABLE_SYCL)
+#if defined(__CUDA_ARCH__)
+  return __syncthreads_and(cond);
+#elif defined(__SYCL_DEVICE_ONLY__)
   // TODO: Add SYCL equivalent function
   assert(false);
 #else
-  return __syncthreads_and(cond);
+  return 0;
 #endif
 }

 CUTLASS_DEVICE void syncwarp() {
-#if defined(CUTLASS_ENABLE_SYCL)
+#if defined(__CUDA_ARCH__)
+  __syncwarp();
+#elif defined(__SYCL_DEVICE_ONLY__)
   // TODO: Add SYCL equivalent function
   assert(false);
-#else
-  __syncwarp();
 #endif
 }

 CUTLASS_DEVICE void threadfence() {
-#if defined(CUTLASS_ENABLE_SYCL)
+#if defined(__CUDA_ARCH__)
+  __threadfence();
+#elif defined(__SYCL_DEVICE_ONLY__)
   // TODO: Add SYCL equivalent function
   assert(false);
-#else
-  __threadfence();
 #endif
 }
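
syncthreads() above is the block-wide barrier used whenever threads exchange data through shared memory. A minimal sketch of that pattern, not part of this commit and written in CUDA terms; the 128-element tile and the function name are illustrative only:

CUTLASS_DEVICE void reverse_block_of_128(float* out, float const* in) {
  __shared__ float stage[128];   // one element per thread, illustrative size
  uint tid = ThreadIdxX();
  stage[tid] = in[tid];          // every thread writes its own slot
  syncthreads();                 // make all writes visible block-wide
  out[tid] = stage[127 - tid];   // now safe to read another thread's slot
}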

// byte perm

 CUTLASS_DEVICE
 uint byte_perm(uint x, uint y, uint s) {
-#if defined(CUTLASS_ENABLE_SYCL)
+#if defined(__CUDA_ARCH__)
+  return __byte_perm(x, y, s);
+#elif defined(__SYCL_DEVICE_ONLY__)
   // TODO: Add SYCL equivalent function
   assert(false);
+  return 0;
 #else
-  return __byte_perm(x, y, s);
+  return 0;
 #endif
 }
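
On the CUDA path byte_perm forwards to __byte_perm, which treats the pair (y, x) as an eight-byte source (bytes 0-3 taken from x, bytes 4-7 from y) and builds the result by letting each of the four low nibbles of the selector s pick one source byte. A worked example of my own, not from the diff:

// Selector 0x5410 picks source bytes 0, 1, 4, 5 (lowest nibble fills the lowest result byte).
uint r = byte_perm(0x33221100u, 0x77665544u, 0x5410u);   // r == 0x55441100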

// shfl

 CUTLASS_DEVICE
 uint shfl_up_sync(const unsigned mask, const uint var, const int delta, const int width = NumThreadsPerWarp) {
-#if defined(CUTLASS_ENABLE_SYCL)
+#if defined(__CUDA_ARCH__)
+  return __shfl_up_sync(mask, var, delta, width);
+#elif defined(__SYCL_DEVICE_ONLY__)
   // TODO: Add SYCL equivalent function
   assert(false);
+  return 0;
 #else
-  return __shfl_up_sync(mask, var, delta, width);
+  return 0;
 #endif
 }

 CUTLASS_DEVICE
 uint shfl_down_sync(const unsigned mask, const uint var, const int delta, const int width = NumThreadsPerWarp) {
-#if defined(CUTLASS_ENABLE_SYCL)
+#if defined(__CUDA_ARCH__)
+  return __shfl_down_sync(mask, var, delta, width);
+#elif defined(__SYCL_DEVICE_ONLY__)
   // TODO: Add SYCL equivalent function
   assert(false);
+  return 0;
 #else
-  return __shfl_down_sync(mask, var, delta, width);
+  return 0;
 #endif
 }

 CUTLASS_DEVICE
 uint shfl_sync(const unsigned mask, const uint var, const int delta, const int width = NumThreadsPerWarp) {
-#if defined(CUTLASS_ENABLE_SYCL)
+#if defined(__CUDA_ARCH__)
+  return __shfl_sync(mask, var, delta, width);
+#elif defined(__SYCL_DEVICE_ONLY__)
   // TODO: Add SYCL equivalent function
   assert(false);
+  return 0;
 #else
-  return __shfl_sync(mask, var, delta, width);
+  return 0;
 #endif
 }
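
On the CUDA path these map to warp shuffles; the classic use is a warp-level reduction in which each step folds the upper half of the active lanes onto the lower half. A minimal sketch, not part of this commit (the helper name is my own):

CUTLASS_DEVICE uint warp_reduce_sum(uint value) {
  // After the loop, lane 0 holds the sum of `value` over all NumThreadsPerWarp lanes.
  for (int offset = NumThreadsPerWarp / 2; offset > 0; offset /= 2) {
    value += shfl_down_sync(0xffffffffu, value, offset);
  }
  return value;
}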

// math

 template <typename T>
 CUTLASS_DEVICE T hfma2(const T a, const T b, const T c) {
-#if defined(CUTLASS_ENABLE_SYCL)
+#if defined(__CUDA_ARCH__)
+  return hfma2(a, b, c);
+#elif defined(__SYCL_DEVICE_ONLY__)
   // TODO: Add SYCL equivalent function
   assert(false);
+  return T(0);
 #else
-  return hfma2(a, b, c);
+  return T(0);
 #endif
 }

// atomic

 #if defined(CUTLASS_ENABLE_SYCL)
 CUTLASS_DEVICE int atomicAdd(int *address, int val) {
+#if defined(__SYCL_DEVICE_ONLY__)
   // TODO: Add SYCL equivalent function
   assert(false);
+#else
+  return 0;
+#endif
 }

 CUTLASS_DEVICE int atomicCAS(int *address, int compare, int val) {
+#if defined(__SYCL_DEVICE_ONLY__)
   // TODO: Add SYCL equivalent function
   assert(false);
+#else
+  return 0;
+#endif
 }
 #endif
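
The SYCL branches of atomicAdd and atomicCAS are still stubs. One way they could eventually be filled in, which is an assumption on my part rather than anything this commit does, is SYCL 2020's sycl::atomic_ref (assuming <sycl/sycl.hpp> is available in device code):

CUTLASS_DEVICE int atomicAdd_sycl_sketch(int *address, int val) {
  // Hypothetical replacement for the assert(false) stub above; returns the old value,
  // matching the CUDA atomicAdd contract.
  sycl::atomic_ref<int, sycl::memory_order::relaxed, sycl::memory_scope::device,
                   sycl::access::address_space::global_space> ref(*address);
  return ref.fetch_add(val);
}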

29 changes: 23 additions & 6 deletions tools/util/include/cutlass/util/GPU_Clock.hpp
@@ -31,14 +31,16 @@

 #pragma once

-#include <cuda_runtime.h>
-
-#ifdef CUTLASS_ENABLE_SYCL
+#if defined(CUTLASS_ENABLE_SYCL)
 #include <syclcompat.hpp>
+#include <chrono>
+#else
+#include <cuda_runtime.h>
 #endif

 struct GPU_Clock
 {
+#if !defined(CUTLASS_ENABLE_SYCL)
   GPU_Clock() {
     cudaEventCreate(&start_);
     cudaEventCreate(&stop_);
@@ -49,29 +51,44 @@ struct GPU_Clock
     cudaEventDestroy(start_);
     cudaEventDestroy(stop_);
   }
+#endif

   void start() {
 #if defined(CUTLASS_ENABLE_SYCL)
     syclcompat::get_default_queue().wait();
-#endif
     start_ = std::chrono::high_resolution_clock::now();
+#else
+    cudaEventRecord(start_);
+#endif
   }

   float milliseconds() {
 #if defined(CUTLASS_ENABLE_SYCL)
     syclcompat::get_default_queue().wait();
-#endif
     auto stop = std::chrono::high_resolution_clock::now();
     std::chrono::duration<float, std::milli> time = stop - start_;
     return time.count();
+#else
+    cudaEventRecord(stop_);
+    cudaEventSynchronize(stop_);
+    float time;
+    cudaEventElapsedTime(&time, start_, stop_);
+    return time;
+#endif
   }

float seconds() {
return milliseconds() * float(1e-3);
}

 private:
-  cudaEvent_t start_, stop_;
+#if defined(CUTLASS_ENABLE_SYCL)
+  typedef std::chrono::nanoseconds duration;
+  typedef std::chrono::high_resolution_clock high_resolution_clock;
+  typedef std::chrono::time_point<high_resolution_clock, duration> time_point;

+  time_point start_ = std::chrono::high_resolution_clock::now();
+#else
+  cudaEvent_t start_, stop_;
+#endif
};
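
For reference, a usage sketch of my own showing how GPU_Clock is typically driven. On the SYCL path both start() and milliseconds() drain the default queue first, so the measured interval covers all work submitted between the two calls; on the CUDA path the same interval is measured with cudaEvent records:

GPU_Clock timer;
timer.start();
// ... launch the kernel(s) being measured ...
float elapsed_ms = timer.milliseconds();   // blocks until the work has finished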
