From 47ee6182b824d45b8fb8628a940dfd4b0c37c0b9 Mon Sep 17 00:00:00 2001
From: Tim Besard
Date: Wed, 20 Sep 2023 15:30:46 +0200
Subject: [PATCH] Use nonblocking sync from CUDA.jl.

---
 Project.toml                | 2 +-
 benchmarks/blas.jl          | 6 ++----
 benchmarks/runbenchmarks.jl | 6 ++++++
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/Project.toml b/Project.toml
index 38540b9a..a8d0c7de 100644
--- a/Project.toml
+++ b/Project.toml
@@ -11,7 +11,7 @@ LLVMLoopInfo = "8b046642-f1f6-4319-8d3c-209ddc03c586"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 
 [compat]
-CUDA = "3.5, 4, 5"
+CUDA = "5"
 ForwardDiff = "0.10"
 LLVM = "3, 4, 5, 6"
 LLVMLoopInfo = "1"
diff --git a/benchmarks/blas.jl b/benchmarks/blas.jl
index 01fa53e1..8cb5cb17 100644
--- a/benchmarks/blas.jl
+++ b/benchmarks/blas.jl
@@ -45,13 +45,11 @@ function blas_benchmark(group, a_type, b_type, cd_type, N, M=N, K=N; alpha=true,
 
     # NOTE: we use `cuStreamSynchronize` instead of `synchronize` to avoid
     #       influence from the Julia scheduler
-    group[name] = @benchmarkable(
+    group[name] = @async_benchmarkable(
         begin
             GemmKernels.matmatmul!(c, $a_layout, $b_layout, a, b, $alpha, $beta; $(kwargs)...)
-            CUDA.cuStreamSynchronize(stream())
         end,
-        setup=(a=CuArray($a_h); b=CuArray($b_h); c=CuArray($c_h);
-               CUDA.cuStreamSynchronize(stream())),
+        setup=(a=CuArray($a_h); b=CuArray($b_h); c=CuArray($c_h); synchronize()),
         teardown=(CUDA.unsafe_free!(a); CUDA.unsafe_free!(b); CUDA.unsafe_free!(c))
     )
 end
diff --git a/benchmarks/runbenchmarks.jl b/benchmarks/runbenchmarks.jl
index d54f4d08..196f531c 100644
--- a/benchmarks/runbenchmarks.jl
+++ b/benchmarks/runbenchmarks.jl
@@ -8,6 +8,12 @@ using JSON
 using StableRNGs
 
+# convenience macro to create a benchmark that requires synchronizing the GPU
+macro async_benchmarkable(ex...)
+    quote
+        @benchmarkable CUDA.@sync blocking=true $(ex...)
+    end
+end
 # we use setup/teardown phases to allocate/free GPU memory,
 # so make sure to run a couple of evaluations to amortize
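
Reviewer note (not part of the patch): a minimal, self-contained sketch of the
pattern the patch adopts, for trying it out locally. The macro definition is
copied from the benchmarks/runbenchmarks.jl hunk above; the mul! workload, the
matrix sizes, and the final run call are illustrative assumptions, not code
from this repository.

    using BenchmarkTools, CUDA, LinearAlgebra

    # copied from the hunk above: wrap the benchmark body in CUDA.@sync so the
    # timed region covers the asynchronous GPU work; blocking=true avoids
    # influence from the Julia scheduler, matching the intent of the old
    # CUDA.cuStreamSynchronize(stream()) calls
    macro async_benchmarkable(ex...)
        quote
            @benchmarkable CUDA.@sync blocking=true $(ex...)
        end
    end

    # hypothetical stand-in workload: a CUBLAS matmul via mul! instead of
    # GemmKernels.matmatmul!; same call pattern as in benchmarks/blas.jl
    bench = @async_benchmarkable(
        mul!(c, a, b),
        setup=(a=CUDA.rand(Float32, 1024, 1024); b=CUDA.rand(Float32, 1024, 1024);
               c=CUDA.zeros(Float32, 1024, 1024); synchronize()),
        teardown=(CUDA.unsafe_free!(a); CUDA.unsafe_free!(b); CUDA.unsafe_free!(c))
    )
    run(bench)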