From 49da401f8d087086451a472c43f7fe03b7deafff Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 25 Aug 2023 15:21:38 +0200 Subject: [PATCH 1/2] Re-introduce the 'blocking' kwargs to at-sync. This can be used to force a blocking, but low-latency synchronization, e.g., when benchmarking code that uses a single task. --- lib/cudadrv/synchronization.jl | 24 ++++++++++++------------ perf/byval.jl | 4 ++-- perf/cudadevrt.jl | 2 +- perf/runbenchmarks.jl | 2 +- perf/volumerhs.jl | 4 ++-- src/utilities.jl | 16 +++++++++++++--- test/core/utils.jl | 2 ++ 7 files changed, 33 insertions(+), 21 deletions(-) diff --git a/lib/cudadrv/synchronization.jl b/lib/cudadrv/synchronization.jl index d512650511..31c6fe38cb 100644 --- a/lib/cudadrv/synchronization.jl +++ b/lib/cudadrv/synchronization.jl @@ -164,8 +164,8 @@ function nonblocking_synchronize(val) return end -function device_synchronize() - if use_nonblocking_synchronization +function device_synchronize(; blocking::Bool=false) + if use_nonblocking_synchronization && !blocking if fast_synchronization(isdone, legacy_stream()) cuCtxSynchronize() else @@ -178,8 +178,8 @@ function device_synchronize() check_exceptions() end -function synchronize(stream::CuStream=stream()) - if use_nonblocking_synchronization +function synchronize(stream::CuStream=stream(); blocking::Bool=false) + if use_nonblocking_synchronization && !blocking if fast_synchronization(isdone, stream) cuStreamSynchronize(stream) else @@ -192,8 +192,8 @@ function synchronize(stream::CuStream=stream()) check_exceptions() end -function synchronize(event::CuEvent) - if use_nonblocking_synchronization +function synchronize(event::CuEvent; blocking::Bool=false) + if use_nonblocking_synchronization && !blocking if fast_synchronization(isdone, event) cuEventSynchronize(event) else @@ -249,8 +249,8 @@ function nonblocking_synchronize(stream::CuStream) return end -function device_synchronize() - if use_nonblocking_synchronization +function device_synchronize(; 
blocking::Bool=false) + if use_nonblocking_synchronization && !blocking stream = legacy_stream() if !fast_synchronization(isdone, stream) nonblocking_synchronize(stream) @@ -261,8 +261,8 @@ function device_synchronize() check_exceptions() end -function synchronize(stream::CuStream=stream()) - if use_nonblocking_synchronization +function synchronize(stream::CuStream=stream(); blocking::Bool=false) + if use_nonblocking_synchronization && !blocking if !fast_synchronization(isdone, stream) nonblocking_synchronize(stream) end @@ -272,8 +272,8 @@ function synchronize(stream::CuStream=stream()) check_exceptions() end -function synchronize(event::CuEvent) - if use_nonblocking_synchronization +function synchronize(event::CuEvent; blocking::Bool=false) + if use_nonblocking_synchronization && !blocking fast_synchronization(isdone, event) end cuEventSynchronize(event) diff --git a/perf/byval.jl b/perf/byval.jl index 2ad777319e..d32d62a9c6 100644 --- a/perf/byval.jl +++ b/perf/byval.jl @@ -59,11 +59,11 @@ function main() y1 = [similar(x1[1]) for i = 1:num_z_slices] # reference down to bones add on GPU - results["reference"] = @benchmark CUDA.@sync add!($y1[1], $x1[1], $x2[1]) + results["reference"] = @benchmark CUDA.@sync blocking=true add!($y1[1], $x1[1], $x2[1]) # adding arrays in an array for slices = 1:num_z_slices - results["slices=$slices"] = @benchmark CUDA.@sync add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices]) + results["slices=$slices"] = @benchmark CUDA.@sync blocking=true add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices]) end # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them diff --git a/perf/cudadevrt.jl b/perf/cudadevrt.jl index f166dd373a..08348fab5a 100644 --- a/perf/cudadevrt.jl +++ b/perf/cudadevrt.jl @@ -26,7 +26,7 @@ function main() x2 = cu(randn(Float32, (1, n)) .+ Float32(0.5)) y1 = similar(x1) - results = @benchmark CUDA.@sync add!($y1, $x1, $x2) + results = @benchmark CUDA.@sync blocking=true 
add!($y1, $x1, $x2) # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them CUDA.unsafe_free!(x1) diff --git a/perf/runbenchmarks.jl b/perf/runbenchmarks.jl index 9e8ef68608..1893c55f14 100644 --- a/perf/runbenchmarks.jl +++ b/perf/runbenchmarks.jl @@ -17,7 +17,7 @@ end # convenience macro to create a benchmark that requires synchronizing the GPU macro async_benchmarkable(ex...) quote - @benchmarkable CUDA.@sync $(ex...) + @benchmarkable CUDA.@sync blocking=true $(ex...) end end diff --git a/perf/volumerhs.jl b/perf/volumerhs.jl index 25841bb95f..5c7737f578 100644 --- a/perf/volumerhs.jl +++ b/perf/volumerhs.jl @@ -255,8 +255,8 @@ function main() $(Base.format_bytes(CUDA.memory(kernel).shared)) shared memory, $(Base.format_bytes(CUDA.memory(kernel).constant)) constant memory""" results = @benchmark begin - CUDA.@sync $kernel($rhs, $Q, $vgeo, $(DFloat(grav)), $D, $nelem; - threads=$threads, blocks=$nelem) + CUDA.@sync blocking=true $kernel($rhs, $Q, $vgeo, $(DFloat(grav)), $D, $nelem; + threads=$threads, blocks=$nelem) end # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them diff --git a/src/utilities.jl b/src/utilities.jl index 1cce3bb07f..808694f565 100644 --- a/src/utilities.jl +++ b/src/utilities.jl @@ -1,8 +1,15 @@ """ - @sync ex + @sync [blocking=false] ex Run expression `ex` and synchronize the GPU afterwards. +The `blocking` keyword argument determines how synchronization is performed. By default, +non-blocking synchronization will be used, which gives other Julia tasks a chance to run +while waiting for the GPU to finish. This may increase latency, so for short operations, +or when benchmarking code that does not use multiple tasks, it may be beneficial to use +blocking synchronization instead by setting `blocking=true`. Blocking synchronization +can also be enabled globally by changing the `nonblocking_synchronization` preference. + See also: [`synchronize`](@ref). """ macro sync(ex...) 
@@ -11,11 +18,14 @@ macro sync(ex...) kwargs = ex[1:end-1] # decode keyword arguments + blocking = false for kwarg in kwargs Meta.isexpr(kwarg, :(=)) || error("Invalid keyword argument $kwarg") key, val = kwarg.args if key == :blocking - Base.depwarn("the blocking keyword to @sync has been deprecated", :sync) + isa(val, Bool) || + error("Invalid value for keyword argument $kwarg; expected Bool, got $(val)") + blocking = val else error("Unknown keyword argument $kwarg") end @@ -23,7 +33,7 @@ macro sync(ex...) quote local ret = $(esc(code)) - synchronize() + synchronize(; blocking=$blocking) ret end end diff --git a/test/core/utils.jl b/test/core/utils.jl index 8793fd4d45..ae57edbb7b 100644 --- a/test/core/utils.jl +++ b/test/core/utils.jl @@ -29,6 +29,8 @@ end end @test t >= 0 @test ret == 42 + + CUDA.@sync blocking=true identity(nothing) end @testset "versioninfo" begin From cf60dae85262ef8a0114dc29ab40909e5f669557 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 30 Aug 2023 12:37:07 +0200 Subject: [PATCH 2/2] Add synchronization benchmarks. [skip tests] --- lib/cudadrv/synchronization.jl | 26 +++++++++++++------------- perf/cuda.jl | 14 ++++++++++++++ perf/runbenchmarks.jl | 1 + 3 files changed, 28 insertions(+), 13 deletions(-) create mode 100644 perf/cuda.jl diff --git a/lib/cudadrv/synchronization.jl b/lib/cudadrv/synchronization.jl index 31c6fe38cb..8e200437d7 100644 --- a/lib/cudadrv/synchronization.jl +++ b/lib/cudadrv/synchronization.jl @@ -74,7 +74,7 @@ Base.unlock(c::BidirectionalChannel) = unlock(c.cond_take) # the synchronization, when it returns true (indicating that the object is synchronized) # the actual synchronization API should be called again. 
-function fast_synchronization(f, obj) +function spinning_synchronization(f, obj) # fast path f(obj) && return true @@ -164,9 +164,9 @@ function nonblocking_synchronize(val) return end -function device_synchronize(; blocking::Bool=false) +function device_synchronize(; blocking::Bool=false, spin::Bool=true) if use_nonblocking_synchronization && !blocking - if fast_synchronization(isdone, legacy_stream()) + if spin && spinning_synchronization(isdone, legacy_stream()) cuCtxSynchronize() else nonblocking_synchronize(context()) @@ -178,9 +178,9 @@ function device_synchronize(; blocking::Bool=false) check_exceptions() end -function synchronize(stream::CuStream=stream(); blocking::Bool=false) +function synchronize(stream::CuStream=stream(); blocking::Bool=false, spin::Bool=true) if use_nonblocking_synchronization && !blocking - if fast_synchronization(isdone, stream) + if spin && spinning_synchronization(isdone, stream) cuStreamSynchronize(stream) else nonblocking_synchronize(stream) @@ -192,9 +192,9 @@ function synchronize(stream::CuStream=stream(); blocking::Bool=false) check_exceptions() end -function synchronize(event::CuEvent; blocking::Bool=false) +function synchronize(event::CuEvent; blocking::Bool=false, spin::Bool=true) if use_nonblocking_synchronization && !blocking - if fast_synchronization(isdone, event) + if spin && spinning_synchronization(isdone, event) cuEventSynchronize(event) else nonblocking_synchronize(event) @@ -249,10 +249,10 @@ function nonblocking_synchronize(stream::CuStream) return end -function device_synchronize(; blocking::Bool=false) +function device_synchronize(; blocking::Bool=false, spin::Bool=true) if use_nonblocking_synchronization && !blocking stream = legacy_stream() - if !fast_synchronization(isdone, stream) + if !spin || !spinning_synchronization(isdone, stream) nonblocking_synchronize(stream) end end @@ -261,9 +261,9 @@ function device_synchronize(; blocking::Bool=false) check_exceptions() end -function 
synchronize(stream::CuStream=stream(); blocking::Bool=false) +function synchronize(stream::CuStream=stream(); blocking::Bool=false, spin::Bool=true) if use_nonblocking_synchronization && !blocking - if !fast_synchronization(isdone, stream) + if !spin || !spinning_synchronization(isdone, stream) nonblocking_synchronize(stream) end end @@ -272,9 +272,9 @@ function synchronize(stream::CuStream=stream(); blocking::Bool=false) check_exceptions() end -function synchronize(event::CuEvent; blocking::Bool=false) +function synchronize(event::CuEvent; blocking::Bool=false, spin::Bool=true) if use_nonblocking_synchronization && !blocking - fast_synchronization(isdone, event) + spin && spinning_synchronization(isdone, event) end cuEventSynchronize(event) end diff --git a/perf/cuda.jl b/perf/cuda.jl new file mode 100644 index 0000000000..5dbcbf0abc --- /dev/null +++ b/perf/cuda.jl @@ -0,0 +1,14 @@ +group = addgroup!(SUITE, "cuda") + +let group = addgroup!(group, "synchronization") + let group = addgroup!(group, "stream") + group["blocking"] = @benchmarkable synchronize(blocking=true) + group["auto"] = @benchmarkable synchronize() + group["nonblocking"] = @benchmarkable synchronize(spin=false) + end + let group = addgroup!(group, "context") + group["blocking"] = @benchmarkable device_synchronize(blocking=true) + group["auto"] = @benchmarkable device_synchronize() + group["nonblocking"] = @benchmarkable device_synchronize(spin=false) + end +end diff --git a/perf/runbenchmarks.jl b/perf/runbenchmarks.jl index 1893c55f14..5969dcd9b7 100644 --- a/perf/runbenchmarks.jl +++ b/perf/runbenchmarks.jl @@ -30,6 +30,7 @@ SUITE = BenchmarkGroup() # NOTE: don't use spaces in benchmark names (tobami/codespeed#256) +include("cuda.jl") include("kernel.jl") include("array.jl")