From 47ee6182b824d45b8fb8628a940dfd4b0c37c0b9 Mon Sep 17 00:00:00 2001
From: Tim Besard
Date: Wed, 20 Sep 2023 15:30:46 +0200
Subject: [PATCH] Use nonblocking sync from CUDA.jl.

---
 Project.toml                | 2 +-
 benchmarks/blas.jl          | 6 ++----
 benchmarks/runbenchmarks.jl | 6 ++++++
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/Project.toml b/Project.toml
index 38540b9a..a8d0c7de 100644
--- a/Project.toml
+++ b/Project.toml
@@ -11,7 +11,7 @@ LLVMLoopInfo = "8b046642-f1f6-4319-8d3c-209ddc03c586"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 
 [compat]
-CUDA = "3.5, 4, 5"
+CUDA = "5"
 ForwardDiff = "0.10"
 LLVM = "3, 4, 5, 6"
 LLVMLoopInfo = "1"
diff --git a/benchmarks/blas.jl b/benchmarks/blas.jl
index 01fa53e1..8cb5cb17 100644
--- a/benchmarks/blas.jl
+++ b/benchmarks/blas.jl
@@ -45,13 +45,11 @@ function blas_benchmark(group, a_type, b_type, cd_type, N, M=N, K=N; alpha=true,
 
     # NOTE: we use `cuStreamSynchronize` instead of `synchronize` to avoid
     #       influence from the Julia scheduler
-    group[name] = @benchmarkable(
+    group[name] = @async_benchmarkable(
         begin
             GemmKernels.matmatmul!(c, $a_layout, $b_layout, a, b, $alpha, $beta; $(kwargs)...)
-            CUDA.cuStreamSynchronize(stream())
         end,
-        setup=(a=CuArray($a_h); b=CuArray($b_h); c=CuArray($c_h);
-               CUDA.cuStreamSynchronize(stream())),
+        setup=(a=CuArray($a_h); b=CuArray($b_h); c=CuArray($c_h); synchronize()),
         teardown=(CUDA.unsafe_free!(a); CUDA.unsafe_free!(b); CUDA.unsafe_free!(c))
     )
 end
diff --git a/benchmarks/runbenchmarks.jl b/benchmarks/runbenchmarks.jl
index d54f4d08..196f531c 100644
--- a/benchmarks/runbenchmarks.jl
+++ b/benchmarks/runbenchmarks.jl
@@ -8,6 +8,12 @@ using JSON
 using StableRNGs
 
+# convenience macro to create a benchmark that requires synchronizing the GPU
+macro async_benchmarkable(ex...)
+    quote
+        @benchmarkable CUDA.@sync blocking=true $(ex...)
+    end
+end
 # we use setup/teardown phases to allocate/free GPU memory,
 # so make sure to run a couple of evaluations to amortize
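
Reviewer note (not part of the patch): a minimal, self-contained sketch of the
pattern the patch adopts, for trying it out locally. The macro definition is
copied from the benchmarks/runbenchmarks.jl hunk above; the mul! workload, the
matrix sizes, and the final run call are illustrative assumptions, not code
from this repository.

    using BenchmarkTools, CUDA, LinearAlgebra

    # copied from the hunk above: wrap the benchmark body in CUDA.@sync so the
    # timed region covers the asynchronous GPU work; blocking=true avoids
    # influence from the Julia scheduler, matching the intent of the old
    # CUDA.cuStreamSynchronize(stream()) calls
    macro async_benchmarkable(ex...)
        quote
            @benchmarkable CUDA.@sync blocking=true $(ex...)
        end
    end

    # hypothetical stand-in workload: a CUBLAS matmul via mul! instead of
    # GemmKernels.matmatmul!; same call pattern as in benchmarks/blas.jl
    bench = @async_benchmarkable(
        mul!(c, a, b),
        setup=(a=CUDA.rand(Float32, 1024, 1024); b=CUDA.rand(Float32, 1024, 1024);
               c=CUDA.zeros(Float32, 1024, 1024); synchronize()),
        teardown=(CUDA.unsafe_free!(a); CUDA.unsafe_free!(b); CUDA.unsafe_free!(c))
    )
    run(bench)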