Skip to content

Commit

Permalink
Enable Cute Unit tests (#113)
Browse files Browse the repository at this point in the history
Enables Cute unit tests.

---------

Co-authored-by: Alejandro Acosta <[email protected]>
  • Loading branch information
AD2605 and aacostadiaz authored Oct 15, 2024
1 parent d4f99c9 commit 37566bb
Show file tree
Hide file tree
Showing 19 changed files with 1,313 additions and 364 deletions.
Empty file.
2 changes: 1 addition & 1 deletion include/cute/arch/mma_sm75.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
#include <cute/arch/mma.hpp>

// Config
#if ((__CUDACC_VER_MAJOR__ > 10) || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))
#if ((__CUDACC_VER_MAJOR__ > 10) || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2)) || defined(SYCL_NVIDIA_TARGET)
# define CUTE_ARCH_MMA_SM75_SUPPORTED
# if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750)) || \
(defined(__SYCL_CUDA_ARCH__) && (__SYCL_CUDA_ARCH__ >= 750))
Expand Down
1 change: 1 addition & 0 deletions test/unit/common/cutlass_unit_test.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,4 +103,5 @@ int CutlassUnitTestProblemCount();
#include <cutlass/numeric_types.h>
#include <cutlass/trace.h>

#include "util.hpp"
/////////////////////////////////////////////////////////////////////////////////////////////////
84 changes: 61 additions & 23 deletions test/unit/common/filter_architecture.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@

#include "cutlass_unit_test.h"

#if defined(CUTLASS_ENABLE_SYCL)
#include <sycl/sycl.hpp>
#endif

#if !defined(CUTLASS_ENABLE_SYCL)

/////////////////////////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -81,31 +85,10 @@ std::ostream &operator<<(std::ostream &out, cudaDeviceProp const &deviceProperti

/// Sets flags for Unit test
void FilterArchitecture() {
#if !defined(CUTLASS_ENABLE_SYCL)
// Default flags can be overwritten by --gtest_filter from commandline

int const kMaxDevice = 999;

cudaError_t err;

int cudaDeviceId;
err = cudaGetDevice(&cudaDeviceId);
if (cudaSuccess != err) {
std::cerr << "*** Error: Could not detect active GPU device ID"
<< " [" << cudaGetErrorString(err) << "]" << std::endl;
exit(1);
}

cudaDeviceProp deviceProperties;
err = cudaGetDeviceProperties(&deviceProperties, cudaDeviceId);
if (cudaSuccess != err) {
std::cerr << "*** Error: Could not get device properties for GPU " << cudaDeviceId << " ["
<< cudaGetErrorString(err) << "]" << std::endl;
exit(1);
}

int deviceMajorMinor = deviceProperties.major * 10 + deviceProperties.minor;

// Defines text filters for each GEMM kernel based on minimum supported compute capability
struct {

Expand All @@ -127,11 +110,66 @@ void FilterArchitecture() {
{ "SM80*", 80, kMaxDevice},
{ "SM89*", 89, 89},
{ "SM90*", 90, 90},
{ "IntelPVC", 0, 0},
{ 0, 0, false }
};


// Set negative test filters
#if defined(CUTLASS_ENABLE_SYCL)
using namespace sycl::ext::oneapi::experimental;

// We might be adding PVC unit tests someday
std::map<architecture, int> arch_map {
{architecture::nvidia_gpu_sm_50, 50},
{architecture::nvidia_gpu_sm_52, 52},
{architecture::nvidia_gpu_sm_53, 53},
{architecture::nvidia_gpu_sm_60, 60},
{architecture::nvidia_gpu_sm_61, 61},
{architecture::nvidia_gpu_sm_62, 62},
{architecture::nvidia_gpu_sm_70, 70},
{architecture::nvidia_gpu_sm_72, 72},
{architecture::nvidia_gpu_sm_75, 75},
{architecture::nvidia_gpu_sm_80, 80},
{architecture::nvidia_gpu_sm_86, 86},
{architecture::nvidia_gpu_sm_89, 89},
{architecture::nvidia_gpu_sm_90, 90},
{architecture::nvidia_gpu_sm_90a, 90},
{architecture::intel_gpu_pvc, 0}
};
auto device_architecture =
syclcompat::get_default_queue().get_device().get_info<info::device::architecture>();
if (device_architecture == architecture::unknown) {
throw std::runtime_error("Encountered Unknown architecture.");
}

if(auto search_result = arch_map.find(device_architecture); search_result == arch_map.end()) {
throw std::runtime_error("Detected Architecture is not supported.");
}

const int deviceMajorMinor = arch_map[device_architecture];

#else
cudaError_t err;

int cudaDeviceId;
err = cudaGetDevice(&cudaDeviceId);
if (cudaSuccess != err) {
std::cerr << "*** Error: Could not detect active GPU device ID"
<< " [" << cudaGetErrorString(err) << "]" << std::endl;
exit(1);
}

cudaDeviceProp deviceProperties;
err = cudaGetDeviceProperties(&deviceProperties, cudaDeviceId);
if (cudaSuccess != err) {
std::cerr << "*** Error: Could not get device properties for GPU " << cudaDeviceId << " ["
<< cudaGetErrorString(err) << "]" << std::endl;
exit(1);
}

int deviceMajorMinor = deviceProperties.major * 10 + deviceProperties.minor;
#endif

std::stringstream ss;
ss << "-";
for (int i = 0, j = 0; test_filters[i].filter; ++i) {
Expand All @@ -144,7 +182,7 @@ void FilterArchitecture() {
}

::testing::GTEST_FLAG(filter) = ss.str();
#endif
// Set negative test filters
}

/////////////////////////////////////////////////////////////////////////////////////////////////
Expand Down
174 changes: 174 additions & 0 deletions test/unit/common/util.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/

#if defined(CUTLASS_ENABLE_SYCL)
#include <syclcompat/syclcompat.hpp>

#include <vector>
#else
#if defined(__CUDACC__)
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#endif
#endif

#if defined(CUTLASS_ENABLE_SYCL)
// Move to SYCLCompat ?
namespace cutlass {

namespace kernel {
template<typename T>
void memset(T* ptr, T init_val, std::size_t num_elements) {
  // One work-item per element; items launched past the end simply do nothing,
  // so the grid may be rounded up to a multiple of the work-group size.
  if (auto const idx = syclcompat::global_id::x(); idx < num_elements) {
    ptr[idx] = init_val;
  }
}
}

// Forward declaration so host_vector can declare device <-> host conversions.
template <typename T>
class device_vector;

/// Minimal host-side analogue of thrust::host_vector, backed by std::vector.
/// Provides element access plus conversions to/from device_vector (defined
/// out-of-line once device_vector is complete).
template <typename T>
class host_vector {
public:
  /// Allocates num_elements value-initialized elements.
  host_vector(std::size_t num_elements) : vec(num_elements) {}

  /// Allocates num_elements elements, each copy-constructed from init_val.
  /// (Directly constructs the vector instead of the previous
  /// move-of-a-temporary into a default-constructed member.)
  host_vector(std::size_t num_elements, T init_val) : vec(num_elements, init_val) {}

  T* data() { return vec.data(); }
  const T* data() const { return vec.data(); }
  T& operator[](std::size_t index) { return vec[index]; }
  const T& operator[](std::size_t index) const { return vec[index]; }
  std::size_t size() const { return vec.size(); }

  // Device <-> host conversions; definitions follow device_vector.
  host_vector<T>& operator=(device_vector<T>);
  host_vector(device_vector<T>);

private:
  std::vector<T> vec;  // owning host storage
};

/// Minimal device-side analogue of thrust::device_vector for SYCL builds.
/// Owns device memory through a reference-counted shared_ptr, so copying a
/// device_vector aliases the SAME device allocation (shallow copy).
/// NOTE(review): this differs from thrust::device_vector's deep-copy
/// semantics — confirm no test relies on copies being independent.
template <typename T>
class device_vector {
public:
  /// Allocates num_elements elements of device memory; contents are left
  /// uninitialized.
  device_vector(std::size_t num_elements) {
    n_elements = num_elements;
    dev_ptr = make_shared(num_elements);
  }

  /// Allocates num_elements elements and fills them with init_value using a
  /// device kernel, then blocks until the fill completes.
  device_vector(std::size_t num_elements, T init_value) {
    n_elements = num_elements;
    dev_ptr = make_shared(num_elements);
    // NOTE(review): the launch uses a range of num_elements with a work-group
    // size of 32 — presumably syclcompat rounds the grid up when num_elements
    // is not a multiple of 32 (the kernel guards out-of-range items); verify,
    // otherwise non-multiple-of-32 sizes may be rejected or under-filled.
    syclcompat::launch<kernel::memset<T>>(sycl::range<1>(num_elements),
        sycl::range<1>(32), dev_ptr.get(), init_value, num_elements);
    syclcompat::wait_and_throw();
  }

  // Host <-> device conversions; defined out-of-line below.
  device_vector<T>& operator=(host_vector<T> host_vec);
  device_vector(host_vector<T>);

  /// Raw device pointer to the managed allocation (non-owning view).
  T* data() { return dev_ptr.get(); }

  std::size_t size() const {return n_elements; }

private:
  // Allocates device memory or throws on failure.
  // NOTE(review): syclcompat::malloc<T> appears to take an ELEMENT count (its
  // template parameter suggests so); if that is the case, passing
  // size * sizeof(T) over-allocates by a factor of sizeof(T) — confirm whether
  // the argument is bytes or elements.
  T* safe_malloc(std::size_t size) {
    T* ptr = syclcompat::malloc<T>(size * sizeof(T));
    if(!ptr) {
      throw std::runtime_error("Allocation Failed.");
    }
    return ptr;
  }
  // Wraps a fresh allocation in a shared_ptr whose deleter drains outstanding
  // device work before freeing, so in-flight kernels never touch freed memory.
  std::shared_ptr<T> make_shared(std::size_t size) {
    return std::shared_ptr<T>(safe_malloc(size), [=](T* ptr) {
      if (ptr != nullptr) {
        syclcompat::wait_and_throw();
        syclcompat::free(ptr);
      }
    });
  }
  std::shared_ptr<T> dev_ptr;   // shared ownership of the device allocation
  std::size_t n_elements;       // element count of the allocation
};

/// Copies device contents into this host vector.
/// @param device_vec source device buffer (shared handle, passed by value per
///        the in-class declaration)
/// @return *this, resized to device_vec.size() and holding its contents
template<typename T>
host_vector<T>& host_vector<T>::operator=(device_vector<T> device_vec) {
  // Ensure all outstanding device work has finished before reading.
  syclcompat::wait_and_throw();
  // Resize in place and copy directly — the previous implementation staged
  // through a temporary host_vector and then copy-assigned it to *this,
  // performing one full extra host-side copy.
  vec.resize(device_vec.size());
  syclcompat::memcpy(vec.data(), device_vec.data(),
                     device_vec.size() * sizeof(T));
  return *this;
}

/// Constructs a host vector holding a copy of device_vec's contents.
/// @param device_vec source device buffer (shared handle, passed by value per
///        the in-class declaration)
template<typename T>
host_vector<T>::host_vector(device_vector<T> device_vec)
    : host_vector(device_vec.size()) {  // delegate: allocate correctly sized storage
  // Ensure all outstanding device work has finished, then copy straight into
  // our own storage — the previous implementation built a temporary
  // host_vector and copy-assigned it to *this, costing one full extra copy.
  syclcompat::wait_and_throw();
  syclcompat::memcpy(data(), device_vec.data(), device_vec.size() * sizeof(T));
}

/// Copies host contents into this device vector.
/// @param host_vec source host data (passed by value per the in-class
///        declaration)
/// @return *this, backed by a freshly allocated device buffer of
///         host_vec.size() elements holding host_vec's contents
template<typename T>
device_vector<T>& device_vector<T>::operator=(host_vector<T> host_vec) {
  // Allocate a fresh buffer rather than constructing a temporary
  // device_vector and copy-assigning it (the previous implementation's extra
  // object and shared_ptr churn). Always reallocating preserves the original
  // semantics: other device_vectors sharing the old buffer are unaffected.
  n_elements = host_vec.size();
  dev_ptr = make_shared(n_elements);
  syclcompat::memcpy(dev_ptr.get(), host_vec.data(), sizeof(T) * n_elements);
  // Block until the copy has landed before returning.
  syclcompat::wait_and_throw();
  return *this;
}

/// Constructs a device vector holding a copy of host_vec's contents.
/// @param host_vec source host data (passed by value per the in-class
///        declaration)
template<typename T>
device_vector<T>::device_vector(host_vector<T> host_vec)
    : device_vector(host_vec.size()) {  // delegate: allocate the device buffer
  // Copy host contents into our own allocation — the previous implementation
  // built a second device_vector and copy-assigned it to *this, an
  // unnecessary temporary object.
  syclcompat::memcpy(data(), host_vec.data(), sizeof(T) * host_vec.size());
  syclcompat::wait_and_throw();
}

} // namespace cutlass
#endif

// Select the vector aliases used by the unit tests: the cutlass SYCL
// replacements when building with SYCL enabled, otherwise the thrust
// originals (which are only available when compiling with nvcc, i.e. when
// __CUDACC__ is defined).
#if defined(CUTLASS_ENABLE_SYCL)
template<typename T>
using host_vector = cutlass::host_vector<T>;
template<typename T>
using device_vector = cutlass::device_vector<T>;
#else
#if defined(__CUDACC__)
template<typename T>
using host_vector = thrust::host_vector<T>;
template<typename T>
using device_vector = thrust::device_vector<T>;
#endif
#endif
44 changes: 32 additions & 12 deletions test/unit/cute/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -60,41 +60,61 @@ if (NOT CUTLASS_ENABLE_SYCL)
)

else()

add_subdirectory(core)
add_subdirectory(layout)

if (SYCL_NVIDIA_TARGET)
# Enable Cute tests for the Nvidia backend as part of #113
add_subdirectory(msvc_compilation)

if(SYCL_NVIDIA_TARGET)

add_subdirectory(volta)
add_subdirectory(turing)
add_subdirectory(ampere)
# add_subdirectory(hopper)  # Hopper tests will be enabled once DPCPP support is complete

endif()

if(SYCL_NVIDIA_TARGET)
add_custom_target(
cutlass_test_unit_cute
DEPENDS
cutlass_test_unit_cute_layout
cutlass_test_unit_cute_core
)

cutlass_test_unit_cute_volta
cutlass_test_unit_cute_turing
cutlass_test_unit_cute_ampere
#cutlass_test_unit_cute_hopper
cutlass_test_unit_cute_msvc_compilation
)

add_custom_target(
test_unit_cute
DEPENDS
test_unit_cute_layout
test_unit_cute_core
)
endif()
if (SYCL_INTEL_TARGET)

test_unit_cute_volta
test_unit_cute_ampere
test_unit_cute_turing
#test_unit_cute_hopper
test_unit_cute_msvc_compilation
)
else()
add_custom_target(
cutlass_test_unit_cute
DEPENDS
cutlass_test_unit_cute_layout
cutlass_test_unit_cute_core
)
cutlass_test_unit_cute_msvc_compilation
)

add_custom_target(
test_unit_cute
DEPENDS
test_unit_cute_layout
test_unit_cute_core
)
test_unit_cute_msvc_compilation
#Intel Tests
)

endif()
endif()
Loading

0 comments on commit 37566bb

Please sign in to comment.