Use nonblocking sync from CUDA.jl.

JuliaGPU · Sep 20, 2023 · 47ee618 · 47ee618
1 parent 70db3b6
commit 47ee618
Show file tree

Hide file tree

Showing 3 changed files with 9 additions and 5 deletions.
diff --git a/Project.toml b/Project.toml
@@ -11,7 +11,7 @@ LLVMLoopInfo = "8b046642-f1f6-4319-8d3c-209ddc03c586"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 
 [compat]
-CUDA = "3.5, 4, 5"
+CUDA = "5"
 ForwardDiff = "0.10"
 LLVM = "3, 4, 5, 6"
 LLVMLoopInfo = "1"

diff --git a/benchmarks/blas.jl b/benchmarks/blas.jl
@@ -45,13 +45,11 @@ function blas_benchmark(group, a_type, b_type, cd_type, N, M=N, K=N; alpha=true,
 
     # NOTE: the timed region is synchronized by `CUDA.@sync blocking=true` (see
     #       `@async_benchmarkable`) to avoid influence from the Julia scheduler
-    group[name] = @benchmarkable(
+    group[name] = @async_benchmarkable(
         begin
             GemmKernels.matmatmul!(c, $a_layout, $b_layout, a, b, $alpha, $beta; $(kwargs)...)
-            CUDA.cuStreamSynchronize(stream())
         end,
-        setup=(a=CuArray($a_h); b=CuArray($b_h); c=CuArray($c_h);
-               CUDA.cuStreamSynchronize(stream())),
+        setup=(a=CuArray($a_h); b=CuArray($b_h); c=CuArray($c_h); synchronize()),
         teardown=(CUDA.unsafe_free!(a); CUDA.unsafe_free!(b); CUDA.unsafe_free!(c))
     )
 end

diff --git a/benchmarks/runbenchmarks.jl b/benchmarks/runbenchmarks.jl
@@ -8,6 +8,12 @@ using JSON
 
 using StableRNGs
 
+# convenience macro to create a benchmark that requires synchronizing the GPU
+macro async_benchmarkable(ex...)
+    quote
+        @benchmarkable CUDA.@sync blocking=true $(ex...)
+    end
+end
 
 # we use setup/teardown phases to allocate/free GPU memory,
 # so make sure to run a couple of evaluations to amortize