diff --git a/benchmarks/common/benchmark_runner.hpp b/benchmarks/common/benchmark_runner.hpp index 3d73ab63fb..decedbf75a 100644 --- a/benchmarks/common/benchmark_runner.hpp +++ b/benchmarks/common/benchmark_runner.hpp @@ -29,6 +29,10 @@ * **************************************************************************************************/ +#if defined(CUTLASS_ENABLE_SYCL) +#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED +#endif + #include "cutlass/gemm/device/gemm.h" #include "cutlass/epilogue/collective/default_epilogue.hpp" #include "cutlass/gemm/device/gemm_universal.h" @@ -208,7 +212,7 @@ struct BenchmarkRunner { auto problem_shape_MNKL = cute::append<4>(problem_size, 1); auto [M, N, K, L] = problem_shape_MNKL; -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(CUTLASS_SYCLCOMPAT_PROFILING_ENABLED) sycl::property_list prop = { sycl::property::queue::in_order(), sycl::property::queue::enable_profiling() diff --git a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cpp b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cpp index 4bea284d8f..8858a3c2e3 100644 --- a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cpp +++ b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cpp @@ -29,6 +29,8 @@ * **************************************************************************************************/ +#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED + #include #include diff --git a/examples/cute/tutorial/sgemm_1_sycl.cpp b/examples/cute/tutorial/sgemm_1_sycl.cpp index ab4aa38538..2a588af0ef 100644 --- a/examples/cute/tutorial/sgemm_1_sycl.cpp +++ b/examples/cute/tutorial/sgemm_1_sycl.cpp @@ -29,6 +29,8 @@ * **************************************************************************************************/ +#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED + #include "cutlass/util/GPU_Clock.hpp" #include "cutlass/util/print_error.hpp" @@ -295,12 +297,13 @@ void gemm_nt(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B auto dimBlock = syclcompat::dim3(size(tC)); auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN))); - syclcompat::launch< + auto event = syclcompat::launch< gemm_device>(dimGrid, dimBlock, prob_shape, cta_tiler, A, dA, sA, tA, B, dB, sB, tB, C, dC, sC, tC, alpha, beta); + EventManager::getInstance().addEvent(event); } // Setup params for a TN GEMM @@ -341,12 +344,13 @@ void gemm_tn(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B auto dimBlock = syclcompat::dim3(size(tC)); auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN))); - syclcompat::launch< + auto event = syclcompat::launch< gemm_device>(dimGrid, dimBlock, prob_shape, cta_tiler, A, dA, sA, tA, B, dB, sB, tB, C, dC, sC, tC, alpha, beta); + EventManager::getInstance().addEvent(event); } template diff --git a/examples/cute/tutorial/sgemm_2_sycl.cpp b/examples/cute/tutorial/sgemm_2_sycl.cpp index 3662011bbb..831bfe0881 100644 --- a/examples/cute/tutorial/sgemm_2_sycl.cpp +++ b/examples/cute/tutorial/sgemm_2_sycl.cpp @@ -29,6 +29,8 @@ * **************************************************************************************************/ +#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED + #include "cutlass/util/GPU_Clock.hpp" #include "cutlass/util/print_error.hpp" @@ -287,12 +289,13 @@ void gemm_nt(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B auto dimBlock = syclcompat::dim3(size(mmaC)); auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN))); - syclcompat::launch< + auto event = syclcompat::launch< gemm_device>( dimGrid, dimBlock, prob_shape, cta_tiler, A, dA, sA, copyA, B, dB, sB, copyB, C, dC, sC, mmaC, alpha, beta); + EventManager::getInstance().addEvent(event); } // Setup params for a TN GEMM @@ -361,12 +364,13 @@ void gemm_tn(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B auto dimBlock = syclcompat::dim3(size(mmaC)); auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN))); - syclcompat::launch< + auto event = syclcompat::launch< gemm_device>( dimGrid, dimBlock, prob_shape, cta_tiler, A, dA, sA, copyA, B, dB, sB, copyB, C, dC, sC, mmaC, alpha, beta); + EventManager::getInstance().addEvent(event); } template diff --git a/examples/cute/tutorial/sgemm_sm70_sycl.cpp b/examples/cute/tutorial/sgemm_sm70_sycl.cpp index 18e1cbf2e2..28e3b2948b 100644 --- a/examples/cute/tutorial/sgemm_sm70_sycl.cpp +++ b/examples/cute/tutorial/sgemm_sm70_sycl.cpp @@ -29,6 +29,8 @@ * **************************************************************************************************/ +#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED + #include "cutlass/util/GPU_Clock.hpp" #include "cutlass/util/print_error.hpp" @@ -283,12 +285,13 @@ void gemm_nt(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B auto dimBlock = syclcompat::dim3(size(mmaC)); auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN))); - syclcompat::launch< + auto event = syclcompat::launch< gemm_device>( dimGrid, dimBlock, prob_shape, cta_tiler, A, dA, sA, copyA, B, dB, sB, copyB, C, dC, sC, mmaC, alpha, beta); + EventManager::getInstance().addEvent(event); } // Setup params for a TN GEMM @@ -349,12 +352,13 @@ void gemm_tn(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B auto dimBlock = syclcompat::dim3(size(mmaC)); auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN))); - syclcompat::launch< + auto event = syclcompat::launch< gemm_device>( dimGrid, dimBlock, prob_shape, cta_tiler, A, dA, sA, copyA, B, dB, sB, copyB, C, dC, sC, mmaC, alpha, beta); + EventManager::getInstance().addEvent(event); } template diff --git a/examples/cute/tutorial/sgemm_sm80_sycl.cpp b/examples/cute/tutorial/sgemm_sm80_sycl.cpp index f01871d9a5..d59cf3ee57 100644 --- a/examples/cute/tutorial/sgemm_sm80_sycl.cpp +++ b/examples/cute/tutorial/sgemm_sm80_sycl.cpp @@ -29,6 +29,8 @@ * **************************************************************************************************/ +#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED + #include "cutlass/util/GPU_Clock.hpp" #include "cutlass/util/print_error.hpp" @@ -362,12 +364,13 @@ void gemm_nt(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B auto dimBlock = syclcompat::dim3(size(mmaC)); auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN))); - syclcompat::launch< + auto event = syclcompat::launch< gemm_device>( dimGrid, dimBlock, prob_shape, cta_tiler, A, dA, sA, copyA, B, dB, sB, copyB, C, dC, sC, mmaC, alpha, beta); + EventManager::getInstance().addEvent(event); } // Setup params for a NT GEMM @@ -433,12 +436,13 @@ void gemm_tn(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B auto dimBlock = syclcompat::dim3(size(mmaC)); auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN))); - syclcompat::launch< + auto event = syclcompat::launch< gemm_device>( dimGrid, dimBlock, prob_shape, cta_tiler, A, dA, sA, copyA, B, dB, sB, copyB, C, dC, sC, mmaC, alpha, beta); + EventManager::getInstance().addEvent(event); } template diff --git a/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp b/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp index 204214da11..f19c7b2165 100644 --- a/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp +++ b/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp @@ -29,6 +29,8 @@ * **************************************************************************************************/ +#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED + #include "cutlass/gemm/device/gemm.h" #include "cutlass/epilogue/collective/default_epilogue.hpp" #include "cutlass/epilogue/collective/intel_pvc_epilogue.hpp" @@ -260,6 +262,14 @@ struct ExampleRunner { initialize(problem_size); + sycl::property_list prop = { + sycl::property::queue::in_order(), + sycl::property::queue::enable_profiling() + }; + + auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop); + syclcompat::set_default_queue(q); + typename Gemm::GemmKernel::Arguments arguments{ cutlass::gemm::GemmUniversalMode::kGemm, problem_size, diff --git a/tools/util/include/cutlass/util/sycl_event_manager.hpp b/tools/util/include/cutlass/util/sycl_event_manager.hpp index 9d06ddf5aa..4b2032adfc 100644 --- a/tools/util/include/cutlass/util/sycl_event_manager.hpp +++ b/tools/util/include/cutlass/util/sycl_event_manager.hpp @@ -116,18 +116,20 @@ class EventManager { }; -void syclEventDestroy(SyclEvent const& event) { +inline void syclEventDestroy(SyclEvent const& event) { EventManager::getInstance().eventDestroy(); } -void syclEventRecord(SyclEvent &event) { +inline void syclEventRecord(SyclEvent &event) { EventManager::getInstance().startRecording(event); } -void syclEventSynchronize(SyclEvent const& begin, SyclEvent const& end) { +inline void syclEventSynchronize(SyclEvent const& begin, SyclEvent const& end) { EventManager::getInstance().wait(begin, end); } -void syclEventElapsedTime(float* time, SyclEvent const& begin, SyclEvent const& end) { +inline void syclEventElapsedTime(float* time, SyclEvent const& begin, SyclEvent const& end) { +#if defined(CUTLASS_SYCLCOMPAT_PROFILING_ENABLED) *time = EventManager::getInstance().getEventElapsedTimeMs(begin, end); +#endif }