From 521d420e350d2255e6f0221235cfa724d0900419 Mon Sep 17 00:00:00 2001
From: Alejandro Acosta <alejandro.acosta@codeplay.com>
Date: Mon, 29 Jul 2024 14:17:03 +0100
Subject: [PATCH] Migrate example to sycl

---
 .../CMakeLists.txt                            | 17 +++---
 .../ampere_tf32_tensorop_gemm_cute.cu         | 52 ++++++++++++++++++-
 examples/common/helper.h                      | 36 +++++++++++--
 include/cutlass/tfloat32.h                    |  4 +-
 4 files changed, 90 insertions(+), 19 deletions(-)
diff --git a/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt b/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt
index e6bb308b47..876afdfba7 100644
--- a/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt
+++ b/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt
@@ -27,19 +27,14 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-if (CUTLASS_ENABLE_SYCL)
-  cutlass_example_add_executable(
-    14_ampere_tf32_tensorop_gemm_cute
-    ampere_tf32_tensorop_gemm_cute.cu
-    )
-else()
+if (NOT CUTLASS_ENABLE_SYCL)
   cutlass_example_add_executable(
     14_ampere_tf32_tensorop_gemm
     ampere_tf32_tensorop_gemm.cu
     )
-
-  cutlass_example_add_executable(
-    14_ampere_tf32_tensorop_gemm_cute
-    ampere_tf32_tensorop_gemm_cute.cu
-    )
 endif()
+
+cutlass_example_add_executable(
+  14_ampere_tf32_tensorop_gemm_cute
+  ampere_tf32_tensorop_gemm_cute.cu
+)
diff --git a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cu b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cu
index 0916300a85..839f2026b3 100644
--- a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cu
+++ b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cu
@@ -55,6 +55,10 @@
 
 */
 
+#if defined(CUTLASS_ENABLE_SYCL)
+#define SYCLCOMPAT_PROFILING_ENABLED
+#endif
+
 #include <iostream>
 
 #include "cutlass/cutlass.h"
@@ -70,8 +74,9 @@
 #include "cutlass/util/packed_stride.hpp"
 #include "cutlass/util/reference/device/gemm_complex.h"
 #include "cutlass/util/reference/device/tensor_compare.h"
+#if !defined(CUTLASS_ENABLE_SYCL)
 #include "cutlass/util/reference/device/tensor_fill.h"
-
+#endif
 #include "helper.h"
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
@@ -184,9 +189,46 @@ bool initialize_block(
     scope_min = -8;
   }
 
+#if defined(CUTLASS_ENABLE_SYCL)
+  using FloatType = typename std::conditional<
+  (sizeof(Element) > 4),
+  double,
+  float>::type;
+
+  using IntType = typename std::conditional<
+    (sizeof(Element) > 4),
+    int64_t,
+    int>::type;
+
+  srand(seed);
+  Element range = static_cast<FloatType>(scope_max - scope_min);
+  Element max = static_cast<FloatType>(scope_max);
+  int int_scale = 0;
+
+  Element float_scale_up = FloatType(IntType(2) << int_scale); // scale up to clamp low order bits
+  Element float_scale_down = FloatType(1) / FloatType(IntType(2) << int_scale);
+
+  // Random values are cast to integer after scaling by a power of two to facilitate error
+  // testing
+  auto const size = block.size();
+  auto h_vector = std::vector<Element>(size);
+  for (int j = 0; j < size; ++j) {
+    FloatType rnd = rand() / double(RAND_MAX);
+    rnd = max - range * rnd;
+
+    if (int_scale >= 0) {
+      rnd = FloatType(IntType(std::llround(rnd * float_scale_up)));
+      h_vector[j] = Element(IntType(rnd * float_scale_down));
+    }
+    else {
+      h_vector[j] = Element(rnd);
+    }
+  }
+  syclcompat::memcpy<Element>(block.get(), h_vector.data(), size);
+#else
   cutlass::reference::device::BlockFillRandomUniform(
           block.get(), block.size(), seed, scope_max, scope_min, 0);
-
+#endif
   return true;
 }
 
@@ -267,12 +309,16 @@ struct ExampleRunner {
             M * N  // batch_stride_D
     );
 
+#if defined(CUTLASS_ENABLE_SYCL)
+    syclcompat::wait_and_throw();
+#else
     cudaError_t result = cudaDeviceSynchronize();
     if (result != cudaSuccess) {
       std::cerr << "Reference kernel failed. Last CUDA error: "
                 << cudaGetErrorString(result) << std::endl;
       return false;
     }
+#endif
 
     // Check if output from CUTLASS kernel and reference kernel are equal or not
     bool passed = cutlass::reference::device::BlockCompareEqual(block_ref_D.get(), block_D.get(), block_D.size());
@@ -367,6 +413,7 @@ int main(int argc, char const **args) {
   // in CUDA 11.0.
   //
   // CUTLASS must be compiled with CUDA 11.0 Toolkit to run these examples.
+#if !defined(CUTLASS_ENABLE_SYCL)
   if (!(__CUDACC_VER_MAJOR__ >= 11)) {
     std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl;
     return 0;
@@ -385,6 +432,7 @@ int main(int argc, char const **args) {
               << std::endl;
     return 0;
   }
+#endif
 
   //
   // Parse options
diff --git a/examples/common/helper.h b/examples/common/helper.h
index 926d867f2b..b8f0068995 100644
--- a/examples/common/helper.h
+++ b/examples/common/helper.h
@@ -63,51 +63,79 @@
   }
 
 
-#if !defined(CUTLASS_ENABLE_SYCL)
 /**
  * GPU timer for recording the elapsed time across kernel(s) launched in GPU stream
  */
 struct GpuTimer
 {
-    cudaStream_t _stream_id;
+#if defined(CUTLASS_ENABLE_SYCL)
+    using cudaStream_t = int;
+    SyclEvent _start;
+    SyclEvent _stop;
+#else
     cudaEvent_t _start;
     cudaEvent_t _stop;
+#endif
+    cudaStream_t _stream_id;
 
     /// Constructor
     GpuTimer() : _stream_id(0)
     {
+#if defined(CUTLASS_ENABLE_SYCL)
+        _start = SyclEvent{};
+        _stop = SyclEvent{};
+#else
         CUDA_CHECK(cudaEventCreate(&_start));
         CUDA_CHECK(cudaEventCreate(&_stop));
+#endif
     }
 
     /// Destructor
     ~GpuTimer()
     {
+#if defined(CUTLASS_ENABLE_SYCL)
+        syclEventDestroy(_start);
+        syclEventDestroy(_stop);
+#else
         CUDA_CHECK(cudaEventDestroy(_start));
         CUDA_CHECK(cudaEventDestroy(_stop));
+#endif
     }
 
     /// Start the timer for a given stream (defaults to the default stream)
     void start(cudaStream_t stream_id = 0)
     {
         _stream_id = stream_id;
+#if defined(CUTLASS_ENABLE_SYCL)
+        syclEventRecord(_start);
+#else
         CUDA_CHECK(cudaEventRecord(_start, _stream_id));
+#endif
     }
 
     /// Stop the timer
     void stop()
     {
+#if defined(CUTLASS_ENABLE_SYCL)
+        syclEventRecord(_stop);
+#else
         CUDA_CHECK(cudaEventRecord(_stop, _stream_id));
+#endif
     }
 
     /// Return the elapsed time (in milliseconds)
     float elapsed_millis()
     {
+#if defined(CUTLASS_ENABLE_SYCL)
+        float elapsed = 0.0;
+        syclEventSynchronize(_start, _stop);
+        syclEventElapsedTime(&elapsed, _start, _stop);
+        return elapsed;
+#else
         float elapsed = 0.0;
         CUDA_CHECK(cudaEventSynchronize(_stop));
         CUDA_CHECK(cudaEventElapsedTime(&elapsed, _start, _stop));
         return elapsed;
+#endif
     }
 };
-
-#endif
diff --git a/include/cutlass/tfloat32.h b/include/cutlass/tfloat32.h
index 2666d921c1..a95bc82c31 100644
--- a/include/cutlass/tfloat32.h
+++ b/include/cutlass/tfloat32.h
@@ -107,7 +107,7 @@ struct alignas(4) tfloat32_t {
 //  explicit tfloat32_t(int x) {
   tfloat32_t(int x) {
     float flt = static_cast<float>(x);
-    #if defined(__CUDA_ARCH__)
+    #if defined(__CUDA_ARCH__) || defined(CUTLASS_ENABLE_SYCL)
     storage = reinterpret_cast<uint32_t const &>(flt);
     #else
     std::memcpy(&storage, &flt, sizeof(storage));
@@ -122,7 +122,7 @@ struct alignas(4) tfloat32_t {
     // of the mantissa.
     unsigned bits = (storage & ~0x1fffu);
 
-    #if defined(__CUDA_ARCH__)    
+    #if defined(__CUDA_ARCH__) || defined(CUTLASS_ENABLE_SYCL)
     return reinterpret_cast<float const &>(bits);
     #else
     float flt;