From 521d420e350d2255e6f0221235cfa724d0900419 Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Mon, 29 Jul 2024 14:17:03 +0100 Subject: [PATCH] Migrate example to sycl --- .../CMakeLists.txt | 17 +++--- .../ampere_tf32_tensorop_gemm_cute.cu | 52 ++++++++++++++++++- examples/common/helper.h | 36 +++++++++++-- include/cutlass/tfloat32.h | 4 +- 4 files changed, 90 insertions(+), 19 deletions(-) diff --git a/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt b/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt index e6bb308b47..876afdfba7 100644 --- a/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt +++ b/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt @@ -27,19 +27,14 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -if (CUTLASS_ENABLE_SYCL) - cutlass_example_add_executable( - 14_ampere_tf32_tensorop_gemm_cute - ampere_tf32_tensorop_gemm_cute.cu - ) -else() +if (NOT CUTLASS_ENABLE_SYCL) cutlass_example_add_executable( 14_ampere_tf32_tensorop_gemm ampere_tf32_tensorop_gemm.cu ) - - cutlass_example_add_executable( - 14_ampere_tf32_tensorop_gemm_cute - ampere_tf32_tensorop_gemm_cute.cu - ) endif() + +cutlass_example_add_executable( + 14_ampere_tf32_tensorop_gemm_cute + ampere_tf32_tensorop_gemm_cute.cu +) diff --git a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cu b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cu index 0916300a85..839f2026b3 100644 --- a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cu +++ b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cu @@ -55,6 +55,10 @@ */ +#if defined(CUTLASS_ENABLE_SYCL) +#define SYCLCOMPAT_PROFILING_ENABLED +#endif + #include #include "cutlass/cutlass.h" @@ -70,8 +74,9 @@ #include "cutlass/util/packed_stride.hpp" #include "cutlass/util/reference/device/gemm_complex.h" #include "cutlass/util/reference/device/tensor_compare.h" +#if !defined(CUTLASS_ENABLE_SYCL) #include "cutlass/util/reference/device/tensor_fill.h" - +#endif #include "helper.h" ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -184,9 +189,46 @@ bool initialize_block( scope_min = -8; } +#if defined(CUTLASS_ENABLE_SYCL) + using FloatType = typename std::conditional< + (sizeof(Element) > 4), + double, + float>::type; + + using IntType = typename std::conditional< + (sizeof(Element) > 4), + int64_t, + int>::type; + + srand(seed); + Element range = static_cast(scope_max - scope_min); + Element max = static_cast(scope_max); + int int_scale = 0; + + Element float_scale_up = FloatType(IntType(2) << int_scale); // scale up to clamp low order bits + Element float_scale_down = FloatType(1) / FloatType(IntType(2) << int_scale); + + // Random values are cast to integer after scaling by a power of two to facilitate error + // testing + auto const size = block.size(); + auto h_vector = std::vector(size); + for (int j = 0; j < size; ++j) { + FloatType rnd = rand() / double(RAND_MAX); + rnd = max - range * rnd; + + if (int_scale >= 0) { + rnd = FloatType(IntType(std::llround(rnd * float_scale_up))); + h_vector[j] = Element(IntType(rnd * float_scale_down)); + } + else { + h_vector[j] = Element(rnd); + } + } + syclcompat::memcpy(block.get(), h_vector.data(), size); +#else cutlass::reference::device::BlockFillRandomUniform( block.get(), block.size(), seed, scope_max, scope_min, 0); - +#endif return true; } @@ -267,12 +309,16 @@ struct ExampleRunner { M * N // batch_stride_D ); +#if defined(CUTLASS_ENABLE_SYCL) + syclcompat::wait_and_throw(); +#else cudaError_t result = cudaDeviceSynchronize(); if (result != cudaSuccess) { std::cerr << "Reference kernel failed. Last CUDA error: " << cudaGetErrorString(result) << std::endl; return false; } +#endif // Check if output from CUTLASS kernel and reference kernel are equal or not bool passed = cutlass::reference::device::BlockCompareEqual(block_ref_D.get(), block_D.get(), block_D.size()); @@ -367,6 +413,7 @@ int main(int argc, char const **args) { // in CUDA 11.0. // // CUTLASS must be compiled with CUDA 11.0 Toolkit to run these examples. +#if !defined(CUTLASS_ENABLE_SYCL) if (!(__CUDACC_VER_MAJOR__ >= 11)) { std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl; return 0; @@ -385,6 +432,7 @@ int main(int argc, char const **args) { << std::endl; return 0; } +#endif // // Parse options diff --git a/examples/common/helper.h b/examples/common/helper.h index 926d867f2b..b8f0068995 100644 --- a/examples/common/helper.h +++ b/examples/common/helper.h @@ -63,51 +63,79 @@ } -#if !defined(CUTLASS_ENABLE_SYCL) /** * GPU timer for recording the elapsed time across kernel(s) launched in GPU stream */ struct GpuTimer { - cudaStream_t _stream_id; +#if defined(CUTLASS_ENABLE_SYCL) + using cudaStream_t = int; + SyclEvent _start; + SyclEvent _stop; +#else cudaEvent_t _start; cudaEvent_t _stop; +#endif + cudaStream_t _stream_id; /// Constructor GpuTimer() : _stream_id(0) { +#if defined(CUTLASS_ENABLE_SYCL) + _start = SyclEvent{}; + _stop = SyclEvent{}; +#else CUDA_CHECK(cudaEventCreate(&_start)); CUDA_CHECK(cudaEventCreate(&_stop)); +#endif } /// Destructor ~GpuTimer() { +#if defined(CUTLASS_ENABLE_SYCL) + syclEventDestroy(_start); + syclEventDestroy(_stop); +#else CUDA_CHECK(cudaEventDestroy(_start)); CUDA_CHECK(cudaEventDestroy(_stop)); +#endif } /// Start the timer for a given stream (defaults to the default stream) void start(cudaStream_t stream_id = 0) { _stream_id = stream_id; +#if defined(CUTLASS_ENABLE_SYCL) + syclEventRecord(_start); +#else CUDA_CHECK(cudaEventRecord(_start, _stream_id)); +#endif } /// Stop the timer void stop() { +#if defined(CUTLASS_ENABLE_SYCL) + syclEventRecord(_stop); +#else CUDA_CHECK(cudaEventRecord(_stop, _stream_id)); +#endif } /// Return the elapsed time (in milliseconds) float elapsed_millis() { +#if defined(CUTLASS_ENABLE_SYCL) + float elapsed = 0.0; + syclEventSynchronize(_start, _stop); + syclEventElapsedTime(&elapsed, _start, _stop); + return elapsed; +#else float elapsed = 0.0; CUDA_CHECK(cudaEventSynchronize(_stop)); CUDA_CHECK(cudaEventElapsedTime(&elapsed, _start, _stop)); return elapsed; +#endif } }; - -#endif diff --git a/include/cutlass/tfloat32.h b/include/cutlass/tfloat32.h index 2666d921c1..a95bc82c31 100644 --- a/include/cutlass/tfloat32.h +++ b/include/cutlass/tfloat32.h @@ -107,7 +107,7 @@ struct alignas(4) tfloat32_t { // explicit tfloat32_t(int x) { tfloat32_t(int x) { float flt = static_cast(x); - #if defined(__CUDA_ARCH__) + #if defined(__CUDA_ARCH__) || defined(CUTLASS_ENABLE_SYCL) storage = reinterpret_cast(flt); #else std::memcpy(&storage, &flt, sizeof(storage)); @@ -122,7 +122,7 @@ struct alignas(4) tfloat32_t { // of the mantissa. unsigned bits = (storage & ~0x1fffu); - #if defined(__CUDA_ARCH__) + #if defined(__CUDA_ARCH__) || defined(CUTLASS_ENABLE_SYCL) return reinterpret_cast(bits); #else float flt;