Migrate example to sycl
aacostadiaz committed Aug 2, 2024
1 parent 4c0c974 commit 521d420
Showing 4 changed files with 90 additions and 19 deletions.
17 changes: 6 additions & 11 deletions examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt
@@ -27,19 +27,14 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-if (CUTLASS_ENABLE_SYCL)
-cutlass_example_add_executable(
-  14_ampere_tf32_tensorop_gemm_cute
-  ampere_tf32_tensorop_gemm_cute.cu
-)
-else()
+if (NOT CUTLASS_ENABLE_SYCL)
 cutlass_example_add_executable(
   14_ampere_tf32_tensorop_gemm
   ampere_tf32_tensorop_gemm.cu
 )
-
-cutlass_example_add_executable(
-  14_ampere_tf32_tensorop_gemm_cute
-  ampere_tf32_tensorop_gemm_cute.cu
-)
 endif()
+
+cutlass_example_add_executable(
+  14_ampere_tf32_tensorop_gemm_cute
+  ampere_tf32_tensorop_gemm_cute.cu
+)
52 changes: 50 additions & 2 deletions examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cu

@@ -55,6 +55,10 @@
 */
 
+#if defined(CUTLASS_ENABLE_SYCL)
+#define SYCLCOMPAT_PROFILING_ENABLED
+#endif
+
 #include <iostream>
 
 #include "cutlass/cutlass.h"
@@ -70,8 +74,9 @@
#include "cutlass/util/packed_stride.hpp"
#include "cutlass/util/reference/device/gemm_complex.h"
#include "cutlass/util/reference/device/tensor_compare.h"
#if !defined(CUTLASS_ENABLE_SYCL)
#include "cutlass/util/reference/device/tensor_fill.h"

#endif
#include "helper.h"

/////////////////////////////////////////////////////////////////////////////////////////////////
@@ -184,9 +189,46 @@ bool initialize_block(
     scope_min = -8;
   }
 
+#if defined(CUTLASS_ENABLE_SYCL)
+  using FloatType = typename std::conditional<
+                      (sizeof(Element) > 4),
+                      double,
+                      float>::type;
+
+  using IntType = typename std::conditional<
+                      (sizeof(Element) > 4),
+                      int64_t,
+                      int>::type;
+
+  srand(seed);
+  Element range = static_cast<FloatType>(scope_max - scope_min);
+  Element max = static_cast<FloatType>(scope_max);
+  int int_scale = 0;
+
+  Element float_scale_up = FloatType(IntType(2) << int_scale); // scale up to clamp low order bits
+  Element float_scale_down = FloatType(1) / FloatType(IntType(2) << int_scale);
+
+  // Random values are cast to integer after scaling by a power of two to facilitate error
+  // testing
+  auto const size = block.size();
+  auto h_vector = std::vector<Element>(size);
+  for (int j = 0; j < size; ++j) {
+    FloatType rnd = rand() / double(RAND_MAX);
+    rnd = max - range * rnd;
+
+    if (int_scale >= 0) {
+      rnd = FloatType(IntType(std::llround(rnd * float_scale_up)));
+      h_vector[j] = Element(IntType(rnd * float_scale_down));
+    }
+    else {
+      h_vector[j] = Element(rnd);
+    }
+  }
+  syclcompat::memcpy<Element>(block.get(), h_vector.data(), size);
+#else
   cutlass::reference::device::BlockFillRandomUniform(
     block.get(), block.size(), seed, scope_max, scope_min, 0);
-
+#endif
   return true;
 }

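The SYCL path above fills the block on the host and copies it to the device with syclcompat::memcpy, since the CUDA reference BlockFillRandomUniform is unavailable there. The scaling is the part worth noting: each sample is scaled by a power of two, snapped to an integer with std::llround, then scaled back down, so every input value is exactly representable and downstream comparisons can use exact equality. A standalone sketch of the same trick (plain C++; the constants and names here are illustrative, not from the commit):

#include <cmath>
#include <cstdlib>
#include <iostream>

// With int_scale = 0 the scale factor is 2, so samples land on the
// half-integer grid in [-8, 8]: values a GPU kernel and a host reference
// can reproduce bit-for-bit.
int main() {
  srand(42);
  const float scope_max = 8.0f;
  const float scope_min = -8.0f;
  const float range = scope_max - scope_min;
  const float scale_up = 2.0f;                  // 2 << int_scale, with int_scale = 0
  const float scale_down = 1.0f / scale_up;
  for (int j = 0; j < 4; ++j) {
    float rnd = rand() / float(RAND_MAX);       // uniform in [0, 1]
    rnd = scope_max - range * rnd;              // uniform in [-8, 8]
    rnd = float(std::llround(rnd * scale_up));  // snap to the grid
    std::cout << rnd * scale_down << '\n';      // exactly representable
  }
  return 0;
}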
@@ -267,12 +309,16 @@ struct ExampleRunner {
       M * N // batch_stride_D
     );
 
+#if defined(CUTLASS_ENABLE_SYCL)
+    syclcompat::wait_and_throw();
+#else
     cudaError_t result = cudaDeviceSynchronize();
     if (result != cudaSuccess) {
       std::cerr << "Reference kernel failed. Last CUDA error: "
                 << cudaGetErrorString(result) << std::endl;
       return false;
     }
+#endif
 
     // Check if output from CUTLASS kernel and reference kernel are equal or not
     bool passed = cutlass::reference::device::BlockCompareEqual(block_ref_D.get(), block_D.get(), block_D.size());
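For the reference-kernel check, the commit swaps cudaDeviceSynchronize for syclcompat::wait_and_throw, which blocks on the default queue and surfaces asynchronous errors as exceptions rather than a status code. A hedged sketch of the same guard factored into a helper, assuming the includes of the example above; sync_device itself is our illustration, not CUTLASS API:

#include <iostream>

// Returns false on a CUDA error; the SYCL path throws instead, matching the
// behavior shown in the diff above.
inline bool sync_device() {
#if defined(CUTLASS_ENABLE_SYCL)
  syclcompat::wait_and_throw();  // waits on the default queue, rethrows async errors
  return true;
#else
  cudaError_t result = cudaDeviceSynchronize();
  if (result != cudaSuccess) {
    std::cerr << "Last CUDA error: " << cudaGetErrorString(result) << std::endl;
    return false;
  }
  return true;
#endif
}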
@@ -367,6 +413,7 @@ int main(int argc, char const **args) {
   // in CUDA 11.0.
   //
   // CUTLASS must be compiled with CUDA 11.0 Toolkit to run these examples.
+#if !defined(CUTLASS_ENABLE_SYCL)
   if (!(__CUDACC_VER_MAJOR__ >= 11)) {
     std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl;
     return 0;
@@ -385,6 +432,7 @@ int main(int argc, char const **args) {
               << std::endl;
     return 0;
   }
+#endif
 
   //
   // Parse options
36 changes: 32 additions & 4 deletions examples/common/helper.h
@@ -63,51 +63,79 @@
 }
 
 
-#if !defined(CUTLASS_ENABLE_SYCL)
 /**
  * GPU timer for recording the elapsed time across kernel(s) launched in GPU stream
  */
 struct GpuTimer
 {
-    cudaStream_t _stream_id;
+#if defined(CUTLASS_ENABLE_SYCL)
+    using cudaStream_t = int;
+    SyclEvent _start;
+    SyclEvent _stop;
+#else
     cudaEvent_t _start;
     cudaEvent_t _stop;
+#endif
+    cudaStream_t _stream_id;
 
     /// Constructor
     GpuTimer() : _stream_id(0)
     {
+#if defined(CUTLASS_ENABLE_SYCL)
+        _start = SyclEvent{};
+        _stop = SyclEvent{};
+#else
        CUDA_CHECK(cudaEventCreate(&_start));
        CUDA_CHECK(cudaEventCreate(&_stop));
+#endif
     }
 
     /// Destructor
     ~GpuTimer()
     {
+#if defined(CUTLASS_ENABLE_SYCL)
+        syclEventDestroy(_start);
+        syclEventDestroy(_stop);
+#else
        CUDA_CHECK(cudaEventDestroy(_start));
        CUDA_CHECK(cudaEventDestroy(_stop));
+#endif
     }
 
     /// Start the timer for a given stream (defaults to the default stream)
     void start(cudaStream_t stream_id = 0)
     {
        _stream_id = stream_id;
+#if defined(CUTLASS_ENABLE_SYCL)
+        syclEventRecord(_start);
+#else
        CUDA_CHECK(cudaEventRecord(_start, _stream_id));
+#endif
     }
 
     /// Stop the timer
     void stop()
     {
+#if defined(CUTLASS_ENABLE_SYCL)
+        syclEventRecord(_stop);
+#else
        CUDA_CHECK(cudaEventRecord(_stop, _stream_id));
+#endif
     }
 
     /// Return the elapsed time (in milliseconds)
     float elapsed_millis()
     {
+#if defined(CUTLASS_ENABLE_SYCL)
+        float elapsed = 0.0;
+        syclEventSynchronize(_start, _stop);
+        syclEventElapsedTime(&elapsed, _start, _stop);
+        return elapsed;
+#else
        float elapsed = 0.0;
        CUDA_CHECK(cudaEventSynchronize(_stop));
        CUDA_CHECK(cudaEventElapsedTime(&elapsed, _start, _stop));
        return elapsed;
+#endif
     }
 };
-
-#endif
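With both backends folded into GpuTimer behind one preprocessor guard, timing code in the examples is backend-agnostic. A minimal usage sketch; run_gemm_once is a hypothetical stand-in for whatever launches the kernel(s) being measured:

#include "helper.h"  // the GpuTimer shown above

void run_gemm_once();  // hypothetical kernel-launch wrapper, not CUTLASS API

float time_gemm_ms() {
  GpuTimer timer;
  timer.start();                  // records the start event (default stream)
  run_gemm_once();
  timer.stop();                   // records the stop event
  return timer.elapsed_millis();  // synchronizes, then reports milliseconds
}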
4 changes: 2 additions & 2 deletions include/cutlass/tfloat32.h
@@ -107,7 +107,7 @@ struct alignas(4) tfloat32_t {
   // explicit tfloat32_t(int x) {
   tfloat32_t(int x) {
     float flt = static_cast<float>(x);
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(CUTLASS_ENABLE_SYCL)
     storage = reinterpret_cast<uint32_t const &>(flt);
 #else
     std::memcpy(&storage, &flt, sizeof(storage));
@@ -122,7 +122,7 @@ struct alignas(4) tfloat32_t {
     // of the mantissa.
     unsigned bits = (storage & ~0x1fffu);
 
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(CUTLASS_ENABLE_SYCL)
     return reinterpret_cast<float const &>(bits);
 #else
     float flt;
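The guarded branches in tfloat32.h differ only in how the bits are reinterpreted as float; the narrowing itself is the mask (storage & ~0x1fffu), which clears the 13 low-order mantissa bits of an IEEE binary32 value and leaves the 10 mantissa bits that TF32 keeps. A standalone sketch using the portable std::memcpy path (the sample value is ours):

#include <cstdint>
#include <cstring>
#include <iostream>

// Clears the 13 low-order mantissa bits, mirroring (storage & ~0x1fffu) above.
float to_tf32(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  bits &= ~0x1fffu;
  float out;
  std::memcpy(&out, &bits, sizeof(out));
  return out;
}

int main() {
  std::cout.precision(10);
  std::cout << to_tf32(1.000244141f) << '\n';  // prints 1: the 2^-12 bit is dropped
}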
