diff --git a/include/cutlass/cutlass.h b/include/cutlass/cutlass.h index fdfe1fc57..2bd5b6d89 100644 --- a/include/cutlass/cutlass.h +++ b/include/cutlass/cutlass.h @@ -115,135 +115,161 @@ static const int NumThreadsPerQuadPair = NumThreadsPerQuad * 2; //////////////////////////////////////////////////////////////////////////////////////////////////// CUTLASS_HOST_DEVICE uint ThreadIdxX() { -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(__CUDA_ARCH__) + return threadIdx.x; +#elif defined(__SYCL_Device_ONLY__) return syclcompat::local_id::x(); #else - return threadIdx.x; + return 0; #endif } CUTLASS_HOST_DEVICE uint ThreadIdxY() { -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(__CUDA_ARCH__) + return threadIdx.y; +#elif defined(__SYCL_Device_ONLY__) return syclcompat::local_id::y(); #else - return threadIdx.y; + return 0; #endif } CUTLASS_HOST_DEVICE uint ThreadIdxZ() { -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(__CUDA_ARCH__) + return threadIdx.z; +#elif defined(__SYCL_Device_ONLY__) return syclcompat::local_id::z(); #else - return threadIdx.z; + return 0; #endif } CUTLASS_HOST_DEVICE uint BlockIdxX() { -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(__CUDA_ARCH__) + return blockIdx.x; +#elif defined(__SYCL_Device_ONLY__) return syclcompat::work_group_id::x(); #else - return blockIdx.x; + return 0; #endif } CUTLASS_HOST_DEVICE uint BlockIdxY() { -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(__CUDA_ARCH__) + return blockIdx.y; +#elif defined(__SYCL_Device_ONLY__) return syclcompat::work_group_id::y(); #else - return blockIdx.y; + return 0; #endif } CUTLASS_HOST_DEVICE uint BlockIdxZ() { -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(__CUDA_ARCH__) + return blockIdx.z; +#elif defined(__SYCL_Device_ONLY__) return syclcompat::work_group_id::z(); #else - return blockIdx.z; + return 0; #endif } CUTLASS_HOST_DEVICE uint BlockDimX() { -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(__CUDA_ARCH__) + return blockDim.x; +#elif defined(__SYCL_Device_ONLY__) return syclcompat::work_group_range::x(); #else - return blockDim.x; + return 0; #endif } CUTLASS_HOST_DEVICE uint BlockDimY() { -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(__CUDA_ARCH__) + return blockDim.y; +#elif defined(__SYCL_Device_ONLY__) return syclcompat::work_group_range::y(); #else - return blockDim.y; + return 0; #endif } CUTLASS_HOST_DEVICE uint BlockDimZ() { -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(__CUDA_ARCH__) + return blockDim.z; +#elif defined(__SYCL_Device_ONLY__) return syclcompat::work_group_range::z(); #else - return blockDim.z; + return 0; #endif } CUTLASS_HOST_DEVICE uint GridDimX() { -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(__CUDA_ARCH__) + return gridDim.x; +#elif defined(__SYCL_Device_ONLY__) return syclcompat::global_range::x(); #else - return gridDim.x; + return 0; #endif } CUTLASS_HOST_DEVICE uint GridDimY() { -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(__CUDA_ARCH__) + return gridDim.y; +#elif defined(__SYCL_Device_ONLY__) return syclcompat::global_range::y(); #else - return gridDim.y; + return 0; #endif } CUTLASS_HOST_DEVICE uint GridDimZ() { -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(__CUDA_ARCH__) + return gridDim.z; +#elif defined(__SYCL_Device_ONLY__) return syclcompat::global_range::z(); #else - return gridDim.z; + return 0; #endif } // syncthreads CUTLASS_DEVICE void syncthreads() { -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(__CUDA_ARCH__) + __syncthreads(); +#elif defined(__SYCL_Device_ONLY__) syclcompat::wg_barrier(); -#else - __syncthreads(); #endif } CUTLASS_DEVICE int syncthreads_and(int cond) { -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(__CUDA_ARCH__) + return __syncthreads_and(cond); +#elif defined(__SYCL_Device_ONLY__) // TODO: Add SYCL equivalent function assert(false); #else - return __syncthreads_and(cond); + return 0; #endif } CUTLASS_DEVICE void syncwarp() { -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(__CUDA_ARCH__) + __syncwarp(); +#elif defined(__SYCL_Device_ONLY__) // TODO: Add SYCL equivalent function assert(false); -#else - __syncwarp(); #endif } CUTLASS_DEVICE void threadfence() { -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(__CUDA_ARCH__) + __threadfence(); +#elif defined(__SYCL_Device_ONLY__) // TODO: Add SYCL equivalent function assert(false); -#else - __threadfence(); #endif } @@ -251,11 +277,14 @@ CUTLASS_DEVICE void threadfence() { CUTLASS_DEVICE uint byte_perm(uint x, uint y, uint s) { -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(__CUDA_ARCH__) + return __byte_perm(x, y, s); +#elif defined(__SYCL_Device_ONLY__) // TODO: Add SYCL equivalent function assert(false); + return 0; #else - return __byte_perm(x, y, s); + return 0; #endif } @@ -263,31 +292,40 @@ uint byte_perm(uint x, uint y, uint s) { CUTLASS_DEVICE uint shfl_up_sync(const unsigned mask, const uint var, const int delta, const int width = NumThreadsPerWarp) { -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(__CUDA_ARCH__) + return __shfl_up_sync(mask, var, delta, width); +#elif defined(__SYCL_Device_ONLY__) // TODO: Add SYCL equivalent function assert(false); + return 0; #else - return __shfl_up_sync(mask, var, delta, width); + return 0; #endif } CUTLASS_DEVICE uint shfl_down_sync(const unsigned mask, const uint var, const int delta, const int width = NumThreadsPerWarp) { -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(__CUDA_ARCH__) + return __shfl_down_sync(mask, var, delta, width); +#elif defined(__SYCL_Device_ONLY__) // TODO: Add SYCL equivalent function assert(false); + return 0; #else - return __shfl_down_sync(mask, var, delta, width); + return 0; #endif } CUTLASS_DEVICE uint shfl_sync(const unsigned mask, const uint var, const int delta, const int width = NumThreadsPerWarp) { -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(__CUDA_ARCH__) + return __shfl_sync(mask, var, delta, width); +#elif defined(__SYCL_Device_ONLY__) // TODO: Add SYCL equivalent function assert(false); + return 0; #else - return __shfl_sync(mask, var, delta, width); + return 0; #endif } @@ -295,11 +333,14 @@ uint shfl_sync(const unsigned mask, const uint var, const int delta, const int w template CUTLASS_DEVICE T hfma2(const T a, const T b, const T c) { -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(__CUDA_ARCH__) + return hfma2(a, b, c); +#elif defined(__SYCL_Device_ONLY__) // TODO: Add SYCL equivalent function assert(false); + return T(0); #else - return hfma2(a, b, c); + return T(0); #endif } @@ -307,13 +348,21 @@ CUTLASS_DEVICE T hfma2(const T a, const T b, const T c) { #if defined(CUTLASS_ENABLE_SYCL) CUTLASS_DEVICE int atomicAdd(int *address, int val) { +#if defined(__SYCL_Device_ONLY__) // TODO: Add SYCL equivalent function assert(false); +#else + return 0; +#endif } CUTLASS_DEVICE int atomicCAS(int *address, int compare, int val) { +#if defined(__SYCL_Device_ONLY__) // TODO: Add SYCL equivalent function assert(false); +#else + return 0; +#endif } #endif