Skip to content

Commit

Permalink
Enable Cute Unit tests (#113)
Browse files Browse the repository at this point in the history
Enables Cute unit tests.

---------

Co-authored-by: Alejandro Acosta <[email protected]>
  • Loading branch information
AD2605 and aacostadiaz authored Oct 15, 2024
1 parent d4f99c9 commit 37566bb
Show file tree
Hide file tree
Showing 19 changed files with 1,313 additions and 364 deletions.
Empty file.
2 changes: 1 addition & 1 deletion include/cute/arch/mma_sm75.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
#include <cute/arch/mma.hpp>

// Config
#if ((__CUDACC_VER_MAJOR__ > 10) || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))
#if ((__CUDACC_VER_MAJOR__ > 10) || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2)) || defined(SYCL_NVIDIA_TARGET)
# define CUTE_ARCH_MMA_SM75_SUPPORTED
# if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750)) || \
(defined(__SYCL_CUDA_ARCH__) && (__SYCL_CUDA_ARCH__ >= 750))
Expand Down
1 change: 1 addition & 0 deletions test/unit/common/cutlass_unit_test.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,4 +103,5 @@ int CutlassUnitTestProblemCount();
#include <cutlass/numeric_types.h>
#include <cutlass/trace.h>

#include "util.hpp"
/////////////////////////////////////////////////////////////////////////////////////////////////
84 changes: 61 additions & 23 deletions test/unit/common/filter_architecture.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@

#include "cutlass_unit_test.h"

#if defined(CUTLASS_ENABLE_SYCL)
#include <sycl/sycl.hpp>
#endif

#if !defined(CUTLASS_ENABLE_SYCL)

/////////////////////////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -81,31 +85,10 @@ std::ostream &operator<<(std::ostream &out, cudaDeviceProp const &deviceProperti

/// Sets flags for Unit test
void FilterArchitecture() {
#if !defined(CUTLASS_ENABLE_SYCL)
// Default flags can be overwritten by --gtest_filter from commandline

int const kMaxDevice = 999;

cudaError_t err;

int cudaDeviceId;
err = cudaGetDevice(&cudaDeviceId);
if (cudaSuccess != err) {
std::cerr << "*** Error: Could not detect active GPU device ID"
<< " [" << cudaGetErrorString(err) << "]" << std::endl;
exit(1);
}

cudaDeviceProp deviceProperties;
err = cudaGetDeviceProperties(&deviceProperties, cudaDeviceId);
if (cudaSuccess != err) {
std::cerr << "*** Error: Could not get device properties for GPU " << cudaDeviceId << " ["
<< cudaGetErrorString(err) << "]" << std::endl;
exit(1);
}

int deviceMajorMinor = deviceProperties.major * 10 + deviceProperties.minor;

// Defines text filters for each GEMM kernel based on minimum supported compute capability
struct {

Expand All @@ -127,11 +110,66 @@ void FilterArchitecture() {
{ "SM80*", 80, kMaxDevice},
{ "SM89*", 89, 89},
{ "SM90*", 90, 90},
{ "IntelPVC", 0, 0},
{ 0, 0, false }
};


// Set negative test filters
#if defined(CUTLASS_ENABLE_SYCL)
using namespace sycl::ext::oneapi::experimental;

// We might be adding PVC unit tests someday
std::map<architecture, int> arch_map {
{architecture::nvidia_gpu_sm_50, 50},
{architecture::nvidia_gpu_sm_52, 52},
{architecture::nvidia_gpu_sm_53, 53},
{architecture::nvidia_gpu_sm_60, 60},
{architecture::nvidia_gpu_sm_61, 61},
{architecture::nvidia_gpu_sm_62, 62},
{architecture::nvidia_gpu_sm_70, 70},
{architecture::nvidia_gpu_sm_72, 72},
{architecture::nvidia_gpu_sm_75, 75},
{architecture::nvidia_gpu_sm_80, 80},
{architecture::nvidia_gpu_sm_86, 86},
{architecture::nvidia_gpu_sm_89, 89},
{architecture::nvidia_gpu_sm_90, 90},
{architecture::nvidia_gpu_sm_90a, 90},
{architecture::intel_gpu_pvc, 0}
};
auto device_architecture =
syclcompat::get_default_queue().get_device().get_info<info::device::architecture>();
if (device_architecture == architecture::unknown) {
throw std::runtime_error("Encountered Unknown architecture.");
}

if(auto search_result = arch_map.find(device_architecture); search_result == arch_map.end()) {
throw std::runtime_error("Detected Architecture is not supported.");
}

const int deviceMajorMinor = arch_map[device_architecture];

#else
cudaError_t err;

int cudaDeviceId;
err = cudaGetDevice(&cudaDeviceId);
if (cudaSuccess != err) {
std::cerr << "*** Error: Could not detect active GPU device ID"
<< " [" << cudaGetErrorString(err) << "]" << std::endl;
exit(1);
}

cudaDeviceProp deviceProperties;
err = cudaGetDeviceProperties(&deviceProperties, cudaDeviceId);
if (cudaSuccess != err) {
std::cerr << "*** Error: Could not get device properties for GPU " << cudaDeviceId << " ["
<< cudaGetErrorString(err) << "]" << std::endl;
exit(1);
}

int deviceMajorMinor = deviceProperties.major * 10 + deviceProperties.minor;
#endif

std::stringstream ss;
ss << "-";
for (int i = 0, j = 0; test_filters[i].filter; ++i) {
Expand All @@ -144,7 +182,7 @@ void FilterArchitecture() {
}

::testing::GTEST_FLAG(filter) = ss.str();
#endif
// Set negative test filters
}

/////////////////////////////////////////////////////////////////////////////////////////////////
Expand Down
174 changes: 174 additions & 0 deletions test/unit/common/util.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/

#if defined(CUTLASS_ENABLE_SYCL)
#include <syclcompat/syclcompat.hpp>

#include <vector>
#else
#if defined(__CUDACC__)
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#endif
#endif

#if defined(CUTLASS_ENABLE_SYCL)
// Move to SYCLCompat ?
namespace cutlass {

namespace kernel {
template<typename T>
void memset(T* ptr, T init_val, std::size_t num_elements) {
  // One work-item per element; items launched past the end simply do nothing,
  // so the grid may be rounded up to a multiple of the work-group size.
  if (auto const idx = syclcompat::global_id::x(); idx < num_elements) {
    ptr[idx] = init_val;
  }
}
}

// Forward declaration so host_vector can declare device <-> host conversions.
template <typename T>
class device_vector;

/// Minimal host-side analogue of thrust::host_vector, backed by std::vector.
/// Provides element access plus conversions to/from device_vector (defined
/// out-of-line once device_vector is complete).
template <typename T>
class host_vector {
public:
  /// Allocates num_elements value-initialized elements.
  host_vector(std::size_t num_elements) : vec(num_elements) {}

  /// Allocates num_elements elements, each copy-constructed from init_val.
  /// (Directly constructs the vector instead of the previous
  /// move-of-a-temporary into a default-constructed member.)
  host_vector(std::size_t num_elements, T init_val) : vec(num_elements, init_val) {}

  T* data() { return vec.data(); }
  const T* data() const { return vec.data(); }
  T& operator[](std::size_t index) { return vec[index]; }
  const T& operator[](std::size_t index) const { return vec[index]; }
  std::size_t size() const { return vec.size(); }

  // Device <-> host conversions; definitions follow device_vector.
  host_vector<T>& operator=(device_vector<T>);
  host_vector(device_vector<T>);

private:
  std::vector<T> vec;  // owning host storage
};

/// Minimal device-side analogue of thrust::device_vector for SYCL builds.
/// Owns device memory through a reference-counted shared_ptr, so copying a
/// device_vector aliases the SAME device allocation (shallow copy).
/// NOTE(review): this differs from thrust::device_vector's deep-copy
/// semantics — confirm no test relies on copies being independent.
template <typename T>
class device_vector {
public:
  /// Allocates num_elements elements of device memory; contents are left
  /// uninitialized.
  device_vector(std::size_t num_elements) {
    n_elements = num_elements;
    dev_ptr = make_shared(num_elements);
  }

  /// Allocates num_elements elements and fills them with init_value using a
  /// device kernel, then blocks until the fill completes.
  device_vector(std::size_t num_elements, T init_value) {
    n_elements = num_elements;
    dev_ptr = make_shared(num_elements);
    // NOTE(review): the launch uses a range of num_elements with a work-group
    // size of 32 — presumably syclcompat rounds the grid up when num_elements
    // is not a multiple of 32 (the kernel guards out-of-range items); verify,
    // otherwise non-multiple-of-32 sizes may be rejected or under-filled.
    syclcompat::launch<kernel::memset<T>>(sycl::range<1>(num_elements),
        sycl::range<1>(32), dev_ptr.get(), init_value, num_elements);
    syclcompat::wait_and_throw();
  }

  // Host <-> device conversions; defined out-of-line below.
  device_vector<T>& operator=(host_vector<T> host_vec);
  device_vector(host_vector<T>);

  /// Raw device pointer to the managed allocation (non-owning view).
  T* data() { return dev_ptr.get(); }

  std::size_t size() const {return n_elements; }

private:
  // Allocates device memory or throws on failure.
  // NOTE(review): syclcompat::malloc<T> appears to take an ELEMENT count (its
  // template parameter suggests so); if that is the case, passing
  // size * sizeof(T) over-allocates by a factor of sizeof(T) — confirm whether
  // the argument is bytes or elements.
  T* safe_malloc(std::size_t size) {
    T* ptr = syclcompat::malloc<T>(size * sizeof(T));
    if(!ptr) {
      throw std::runtime_error("Allocation Failed.");
    }
    return ptr;
  }
  // Wraps a fresh allocation in a shared_ptr whose deleter drains outstanding
  // device work before freeing, so in-flight kernels never touch freed memory.
  std::shared_ptr<T> make_shared(std::size_t size) {
    return std::shared_ptr<T>(safe_malloc(size), [=](T* ptr) {
      if (ptr != nullptr) {
        syclcompat::wait_and_throw();
        syclcompat::free(ptr);
      }
    });
  }
  std::shared_ptr<T> dev_ptr;   // shared ownership of the device allocation
  std::size_t n_elements;       // element count of the allocation
};

/// Copies device contents into this host vector.
/// @param device_vec source device buffer (shared handle, passed by value per
///        the in-class declaration)
/// @return *this, resized to device_vec.size() and holding its contents
template<typename T>
host_vector<T>& host_vector<T>::operator=(device_vector<T> device_vec) {
  // Ensure all outstanding device work has finished before reading.
  syclcompat::wait_and_throw();
  // Resize in place and copy directly — the previous implementation staged
  // through a temporary host_vector and then copy-assigned it to *this,
  // performing one full extra host-side copy.
  vec.resize(device_vec.size());
  syclcompat::memcpy(vec.data(), device_vec.data(),
                     device_vec.size() * sizeof(T));
  return *this;
}

/// Constructs a host vector holding a copy of device_vec's contents.
/// @param device_vec source device buffer (shared handle, passed by value per
///        the in-class declaration)
template<typename T>
host_vector<T>::host_vector(device_vector<T> device_vec)
    : host_vector(device_vec.size()) {  // delegate: allocate correctly sized storage
  // Ensure all outstanding device work has finished, then copy straight into
  // our own storage — the previous implementation built a temporary
  // host_vector and copy-assigned it to *this, costing one full extra copy.
  syclcompat::wait_and_throw();
  syclcompat::memcpy(data(), device_vec.data(), device_vec.size() * sizeof(T));
}

/// Copies host contents into this device vector.
/// @param host_vec source host data (passed by value per the in-class
///        declaration)
/// @return *this, backed by a freshly allocated device buffer of
///         host_vec.size() elements holding host_vec's contents
template<typename T>
device_vector<T>& device_vector<T>::operator=(host_vector<T> host_vec) {
  // Allocate a fresh buffer rather than constructing a temporary
  // device_vector and copy-assigning it (the previous implementation's extra
  // object and shared_ptr churn). Always reallocating preserves the original
  // semantics: other device_vectors sharing the old buffer are unaffected.
  n_elements = host_vec.size();
  dev_ptr = make_shared(n_elements);
  syclcompat::memcpy(dev_ptr.get(), host_vec.data(), sizeof(T) * n_elements);
  // Block until the copy has landed before returning.
  syclcompat::wait_and_throw();
  return *this;
}

/// Constructs a device vector holding a copy of host_vec's contents.
/// @param host_vec source host data (passed by value per the in-class
///        declaration)
template<typename T>
device_vector<T>::device_vector(host_vector<T> host_vec)
    : device_vector(host_vec.size()) {  // delegate: allocate the device buffer
  // Copy host contents into our own allocation — the previous implementation
  // built a second device_vector and copy-assigned it to *this, an
  // unnecessary temporary object.
  syclcompat::memcpy(data(), host_vec.data(), sizeof(T) * host_vec.size());
  syclcompat::wait_and_throw();
}

} // namespace cutlass
#endif

// Select the vector aliases used by the unit tests: the cutlass SYCL
// replacements when building with SYCL enabled, otherwise the thrust
// originals (which are only available when compiling with nvcc, i.e. when
// __CUDACC__ is defined).
#if defined(CUTLASS_ENABLE_SYCL)
template<typename T>
using host_vector = cutlass::host_vector<T>;
template<typename T>
using device_vector = cutlass::device_vector<T>;
#else
#if defined(__CUDACC__)
template<typename T>
using host_vector = thrust::host_vector<T>;
template<typename T>
using device_vector = thrust::device_vector<T>;
#endif
#endif
44 changes: 32 additions & 12 deletions test/unit/cute/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -60,41 +60,61 @@ if (NOT CUTLASS_ENABLE_SYCL)
)

else()

add_subdirectory(core)
add_subdirectory(layout)

if (SYCL_NVIDIA_TARGET)
# Enable Cute tests for the Nvidia backend as part of #113
add_subdirectory(msvc_compilation)

if(SYCL_NVIDIA_TARGET)

add_subdirectory(volta)
add_subdirectory(turing)
add_subdirectory(ampere)
# add_subdirectory(hopper)  # Hopper tests will be enabled once DPCPP support is complete

endif()

if(SYCL_NVIDIA_TARGET)
add_custom_target(
cutlass_test_unit_cute
DEPENDS
cutlass_test_unit_cute_layout
cutlass_test_unit_cute_core
)

cutlass_test_unit_cute_volta
cutlass_test_unit_cute_turing
cutlass_test_unit_cute_ampere
#cutlass_test_unit_cute_hopper
cutlass_test_unit_cute_msvc_compilation
)

add_custom_target(
test_unit_cute
DEPENDS
test_unit_cute_layout
test_unit_cute_core
)
endif()
if (SYCL_INTEL_TARGET)

test_unit_cute_volta
test_unit_cute_ampere
test_unit_cute_turing
#test_unit_cute_hopper
test_unit_cute_msvc_compilation
)
else()
add_custom_target(
cutlass_test_unit_cute
DEPENDS
cutlass_test_unit_cute_layout
cutlass_test_unit_cute_core
)
cutlass_test_unit_cute_msvc_compilation
)

add_custom_target(
test_unit_cute
DEPENDS
test_unit_cute_layout
test_unit_cute_core
)
test_unit_cute_msvc_compilation
#Intel Tests
)

endif()
endif()
Loading

0 comments on commit 37566bb

Please sign in to comment.