Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add sycl equivalent to cuda events for profiling #69

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions benchmarks/common/benchmark_runner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@
*
**************************************************************************************************/

#if defined(CUTLASS_ENABLE_SYCL)
#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED
#endif

#include "cutlass/gemm/device/gemm.h"
#include "cutlass/epilogue/collective/default_epilogue.hpp"
#include "cutlass/gemm/device/gemm_universal.h"
Expand Down Expand Up @@ -208,6 +212,16 @@ struct BenchmarkRunner {
auto problem_shape_MNKL = cute::append<4>(problem_size, 1);
auto [M, N, K, L] = problem_shape_MNKL;

#if defined(CUTLASS_SYCLCOMPAT_PROFILING_ENABLED)
sycl::property_list prop = {
sycl::property::queue::in_order(),
sycl::property::queue::enable_profiling()
};

auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop);
syclcompat::set_default_queue(q);
#endif

stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L));
stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L));
stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
*
**************************************************************************************************/

#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED

#include <cstdlib>
#include <cstdio>

Expand Down Expand Up @@ -121,6 +123,13 @@ run(Gemm_Op gemm_op)

void test_gemm(int m, int n, int k)
{
sycl::property_list prop = {
sycl::property::queue::in_order(),
sycl::property::queue::enable_profiling()
};

auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop);
syclcompat::set_default_queue(q);

std::cout << "M = " << m << std::endl;
std::cout << "N = " << n << std::endl;
Expand Down
16 changes: 14 additions & 2 deletions examples/cute/tutorial/sgemm_1_sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
*
**************************************************************************************************/

#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED

#include "cutlass/util/GPU_Clock.hpp"
#include "cutlass/util/print_error.hpp"

Expand Down Expand Up @@ -295,12 +297,13 @@ void gemm_nt(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B
auto dimBlock = syclcompat::dim3(size(tC));
auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN)));

syclcompat::launch<
auto event = syclcompat::launch<
gemm_device<decltype(prob_shape), decltype(cta_tiler), TA, decltype(dA), decltype(sA),
decltype(tA), TB, decltype(dB), decltype(sB), decltype(tB), TC, decltype(dC),
decltype(sC), decltype(tC), Alpha, Beta>>(dimGrid, dimBlock, prob_shape,
cta_tiler, A, dA, sA, tA, B, dB, sB, tB,
C, dC, sC, tC, alpha, beta);
EventManager::getInstance().addEvent(event);
}

// Setup params for a TN GEMM
Expand Down Expand Up @@ -341,12 +344,13 @@ void gemm_tn(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B
auto dimBlock = syclcompat::dim3(size(tC));
auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN)));

syclcompat::launch<
auto event = syclcompat::launch<
gemm_device<decltype(prob_shape), decltype(cta_tiler), TA, decltype(dA), decltype(sA),
decltype(tA), TB, decltype(dB), decltype(sB), decltype(tB), TC, decltype(dC),
decltype(sC), decltype(tC), Alpha, Beta>>(dimGrid, dimBlock, prob_shape,
cta_tiler, A, dA, sA, tA, B, dB, sB, tB,
C, dC, sC, tC, alpha, beta);
EventManager::getInstance().addEvent(event);
}

template <class TA, class TB, class TC, class Alpha, class Beta>
Expand Down Expand Up @@ -376,6 +380,14 @@ int main(int argc, char** argv) {
char transB = 'T';
if (argc >= 6) sscanf(argv[5], "%c", &transB);

sycl::property_list prop = {
sycl::property::queue::in_order(),
sycl::property::queue::enable_profiling()
};

auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop);
aacostadiaz marked this conversation as resolved.
Show resolved Hide resolved
syclcompat::set_default_queue(q);

using TA = float;
using TB = float;
using TC = float;
Expand Down
16 changes: 14 additions & 2 deletions examples/cute/tutorial/sgemm_2_sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
*
**************************************************************************************************/

#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED

#include "cutlass/util/GPU_Clock.hpp"
#include "cutlass/util/print_error.hpp"

Expand Down Expand Up @@ -287,12 +289,13 @@ void gemm_nt(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B

auto dimBlock = syclcompat::dim3(size(mmaC));
auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN)));
syclcompat::launch<
auto event = syclcompat::launch<
gemm_device<decltype(prob_shape), decltype(cta_tiler), TA, decltype(dA), decltype(sA),
decltype(copyA), TB, decltype(dB), decltype(sB), decltype(copyB), TC,
decltype(dC), decltype(sC), decltype(mmaC), Alpha, Beta>>(
dimGrid, dimBlock, prob_shape, cta_tiler, A, dA, sA, copyA, B, dB, sB, copyB, C, dC,
sC, mmaC, alpha, beta);
EventManager::getInstance().addEvent(event);
}

// Setup params for a TN GEMM
Expand Down Expand Up @@ -361,12 +364,13 @@ void gemm_tn(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B

auto dimBlock = syclcompat::dim3(size(mmaC));
auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN)));
syclcompat::launch<
auto event = syclcompat::launch<
gemm_device<decltype(prob_shape), decltype(cta_tiler), TA, decltype(dA), decltype(sA),
decltype(copyA), TB, decltype(dB), decltype(sB), decltype(copyB), TC,
decltype(dC), decltype(sC), decltype(mmaC), Alpha, Beta>>(
dimGrid, dimBlock, prob_shape, cta_tiler, A, dA, sA, copyA, B, dB, sB, copyB, C, dC,
sC, mmaC, alpha, beta);
EventManager::getInstance().addEvent(event);
}

template <class TA, class TB, class TC, class Alpha, class Beta>
Expand Down Expand Up @@ -397,6 +401,14 @@ int main(int argc, char** argv) {
char transB = 'T';
if (argc >= 6) sscanf(argv[5], "%c", &transB);

sycl::property_list prop = {
sycl::property::queue::in_order(),
sycl::property::queue::enable_profiling()
};

auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop);
syclcompat::set_default_queue(q);

using TA = float;
using TB = float;
using TC = float;
Expand Down
16 changes: 14 additions & 2 deletions examples/cute/tutorial/sgemm_sm70_sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
*
**************************************************************************************************/

#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED

#include "cutlass/util/GPU_Clock.hpp"
#include "cutlass/util/print_error.hpp"

Expand Down Expand Up @@ -283,12 +285,13 @@ void gemm_nt(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B

auto dimBlock = syclcompat::dim3(size(mmaC));
auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN)));
syclcompat::launch<
auto event = syclcompat::launch<
gemm_device<decltype(prob_shape), decltype(cta_tiler), TA, decltype(dA), decltype(sA),
decltype(copyA), TB, decltype(dB), decltype(sB), decltype(copyB), TC,
decltype(dC), decltype(sC), decltype(mmaC), Alpha, Beta>>(
dimGrid, dimBlock, prob_shape, cta_tiler, A, dA, sA, copyA, B, dB, sB, copyB, C, dC,
sC, mmaC, alpha, beta);
EventManager::getInstance().addEvent(event);
}

// Setup params for a TN GEMM
Expand Down Expand Up @@ -349,12 +352,13 @@ void gemm_tn(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B

auto dimBlock = syclcompat::dim3(size(mmaC));
auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN)));
syclcompat::launch<
auto event = syclcompat::launch<
gemm_device<decltype(prob_shape), decltype(cta_tiler), TA, decltype(dA), decltype(sA),
decltype(copyA), TB, decltype(dB), decltype(sB), decltype(copyB), TC,
decltype(dC), decltype(sC), decltype(mmaC), Alpha, Beta>>(
dimGrid, dimBlock, prob_shape, cta_tiler, A, dA, sA, copyA, B, dB, sB, copyB, C, dC,
sC, mmaC, alpha, beta);
EventManager::getInstance().addEvent(event);
}

template <class TA, class TB, class TC, class Alpha, class Beta>
Expand Down Expand Up @@ -385,6 +389,14 @@ int main(int argc, char** argv) {
char transB = 'T';
if (argc >= 6) sscanf(argv[5], "%c", &transB);

sycl::property_list prop = {
sycl::property::queue::in_order(),
sycl::property::queue::enable_profiling()
};

auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop);
syclcompat::set_default_queue(q);

using TA = float;
using TB = float;
using TC = float;
Expand Down
16 changes: 14 additions & 2 deletions examples/cute/tutorial/sgemm_sm80_sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
*
**************************************************************************************************/

#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED

#include "cutlass/util/GPU_Clock.hpp"
#include "cutlass/util/print_error.hpp"

Expand Down Expand Up @@ -362,12 +364,13 @@ void gemm_nt(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B

auto dimBlock = syclcompat::dim3(size(mmaC));
auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN)));
syclcompat::launch<
auto event = syclcompat::launch<
gemm_device<decltype(prob_shape), decltype(cta_tiler), TA, decltype(dA), decltype(sA),
decltype(copyA), TB, decltype(dB), decltype(sB), decltype(copyB), TC,
decltype(dC), decltype(sC), decltype(mmaC), Alpha, Beta>>(
dimGrid, dimBlock, prob_shape, cta_tiler, A, dA, sA, copyA, B, dB, sB, copyB, C, dC, sC, mmaC,
alpha, beta);
EventManager::getInstance().addEvent(event);
}

// Setup params for a NT GEMM
Expand Down Expand Up @@ -433,12 +436,13 @@ void gemm_tn(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B

auto dimBlock = syclcompat::dim3(size(mmaC));
auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN)));
syclcompat::launch<
auto event = syclcompat::launch<
gemm_device<decltype(prob_shape), decltype(cta_tiler), TA, decltype(dA), decltype(sA),
decltype(copyA), TB, decltype(dB), decltype(sB), decltype(copyB), TC,
decltype(dC), decltype(sC), decltype(mmaC), Alpha, Beta>>(
dimGrid, dimBlock, prob_shape, cta_tiler, A, dA, sA, copyA, B, dB, sB, copyB, C, dC, sC, mmaC,
alpha, beta);
EventManager::getInstance().addEvent(event);
}

template <class TA, class TB, class TC, class Alpha, class Beta>
Expand Down Expand Up @@ -468,6 +472,14 @@ int main(int argc, char** argv) {
char transB = 'T';
if (argc >= 6) sscanf(argv[5], "%c", &transB);

sycl::property_list prop = {
sycl::property::queue::in_order(),
sycl::property::queue::enable_profiling()
};

auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop);
syclcompat::set_default_queue(q);

using TA = float;
using TB = float;
using TC = float;
Expand Down
10 changes: 10 additions & 0 deletions examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
*
**************************************************************************************************/

#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED

#include "cutlass/gemm/device/gemm.h"
#include "cutlass/epilogue/collective/default_epilogue.hpp"
#include "cutlass/epilogue/collective/intel_pvc_epilogue.hpp"
Expand Down Expand Up @@ -260,6 +262,14 @@ struct ExampleRunner {

initialize(problem_size);

sycl::property_list prop = {
sycl::property::queue::in_order(),
sycl::property::queue::enable_profiling()
};

auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop);
syclcompat::set_default_queue(q);

typename Gemm::GemmKernel::Arguments arguments{
cutlass::gemm::GemmUniversalMode::kGemm,
problem_size,
Expand Down
9 changes: 7 additions & 2 deletions include/cutlass/gemm/device/gemm_universal_adapter.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@
// 3.x
#include "cutlass/gemm/kernel/gemm_universal.hpp"

#if defined(CUTLASS_ENABLE_SYCL)
#include "cutlass/util/sycl_event_manager.hpp"
#endif

////////////////////////////////////////////////////////////////////////////////

namespace cutlass::gemm::device {
Expand Down Expand Up @@ -407,10 +411,11 @@ class GemmUniversalAdapter<
const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z);

#if defined (SYCL_INTEL_TARGET)
syclcompat::experimental::launch<device_kernel<GemmKernel>, DispatchPolicy::SubgroupSize>(sycl_grid, sycl_block, smem_size, params);
auto event = syclcompat::experimental::launch<device_kernel<GemmKernel>, DispatchPolicy::SubgroupSize>(sycl_grid, sycl_block, smem_size, params);
#else
syclcompat::launch<device_kernel<GemmKernel>>(sycl_grid, sycl_block, smem_size, params);
auto event = syclcompat::launch<device_kernel<GemmKernel>>(sycl_grid, sycl_block, smem_size, params);
#endif
EventManager::getInstance().addEvent(event);
#else
device_kernel<GemmKernel><<<grid, block, smem_size, stream>>>(params);
#endif
Expand Down
33 changes: 18 additions & 15 deletions tools/util/include/cutlass/util/GPU_Clock.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,42 +32,49 @@
#pragma once

#if defined(CUTLASS_ENABLE_SYCL)
#include <syclcompat.hpp>
#include <chrono>
#include "cutlass/util/sycl_event_manager.hpp"
#else
#include <cuda_runtime.h>
#endif

struct GPU_Clock
{
#if !defined(CUTLASS_ENABLE_SYCL)
GPU_Clock() {
#if defined(CUTLASS_ENABLE_SYCL)
start_ = SyclEvent{};
stop_ = SyclEvent{};
#else
cudaEventCreate(&start_);
cudaEventCreate(&stop_);
cudaEventRecord(start_);
#endif
}

~GPU_Clock() {
#if defined(CUTLASS_ENABLE_SYCL)
syclEventDestroy(start_);
syclEventDestroy(stop_);
#else
cudaEventDestroy(start_);
cudaEventDestroy(stop_);
}
#endif
}

void start() {
#if defined(CUTLASS_ENABLE_SYCL)
syclcompat::get_default_queue().wait();
start_ = std::chrono::high_resolution_clock::now();
syclEventRecord(start_);
#else
cudaEventRecord(start_);
#endif
}

float milliseconds() {
#if defined(CUTLASS_ENABLE_SYCL)
syclcompat::get_default_queue().wait();
auto stop = std::chrono::high_resolution_clock::now();
std::chrono::duration<float, std::milli> time = stop - start_;
return time.count();
syclEventRecord(stop_);
syclEventSynchronize(start_, stop_);
float time;
syclEventElapsedTime(&time, start_, stop_);
return time;
#else
cudaEventRecord(stop_);
cudaEventSynchronize(stop_);
Expand All @@ -83,11 +90,7 @@ struct GPU_Clock

private:
#if defined(CUTLASS_ENABLE_SYCL)
typedef std::chrono::nanoseconds duration;
typedef std::chrono::high_resolution_clock high_resolution_clock;
typedef std::chrono::time_point<high_resolution_clock, duration> time_point;

time_point start_ = std::chrono::high_resolution_clock::now();
SyclEvent start_, stop_;
#else
cudaEvent_t start_, stop_;
#endif
Expand Down
Loading