Skip to content

Commit

Permalink
Re-introduce the 'blocking' kwargs to at-sync. (#2060)
Browse files Browse the repository at this point in the history
This can be used to force a blocking, but low-latency synchronization,
e.g., when benchmarking code that uses a single task.
  • Loading branch information
maleadt authored Aug 30, 2023
1 parent 0cb5659 commit d95ba8e
Show file tree
Hide file tree
Showing 8 changed files with 55 additions and 28 deletions.
38 changes: 19 additions & 19 deletions lib/cudadrv/synchronization.jl
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ Base.unlock(c::BidirectionalChannel) = unlock(c.cond_take)
# the synchronization, when it returns true (indicating that the object is synchronized)
# the actual synchronization API should be called again.

function fast_synchronization(f, obj)
function spinning_synchronization(f, obj)
# fast path
f(obj) && return true

Expand Down Expand Up @@ -164,9 +164,9 @@ function nonblocking_synchronize(val)
return
end

function device_synchronize()
if use_nonblocking_synchronization
if fast_synchronization(isdone, legacy_stream())
function device_synchronize(; blocking::Bool=false, spin::Bool=true)
if use_nonblocking_synchronization && !blocking
if spin && spinning_synchronization(isdone, legacy_stream())
cuCtxSynchronize()
else
nonblocking_synchronize(context())
Expand All @@ -178,9 +178,9 @@ function device_synchronize()
check_exceptions()
end

function synchronize(stream::CuStream=stream())
if use_nonblocking_synchronization
if fast_synchronization(isdone, stream)
function synchronize(stream::CuStream=stream(); blocking::Bool=false, spin::Bool=true)
if use_nonblocking_synchronization && !blocking
if spin && spinning_synchronization(isdone, stream)
cuStreamSynchronize(stream)
else
nonblocking_synchronize(stream)
Expand All @@ -192,9 +192,9 @@ function synchronize(stream::CuStream=stream())
check_exceptions()
end

function synchronize(event::CuEvent)
if use_nonblocking_synchronization
if fast_synchronization(isdone, event)
function synchronize(event::CuEvent; blocking::Bool=false, spin::Bool=true)
if use_nonblocking_synchronization && !blocking
if spin && spinning_synchronization(isdone, event)
cuEventSynchronize(event)
else
nonblocking_synchronize(event)
Expand Down Expand Up @@ -249,10 +249,10 @@ function nonblocking_synchronize(stream::CuStream)
return
end

function device_synchronize()
if use_nonblocking_synchronization
function device_synchronize(; blocking::Bool=false, spin::Bool=true)
if use_nonblocking_synchronization && !blocking
stream = legacy_stream()
if !fast_synchronization(isdone, stream)
if !spin || !spinning_synchronization(isdone, stream)
nonblocking_synchronize(stream)
end
end
Expand All @@ -261,9 +261,9 @@ function device_synchronize()
check_exceptions()
end

function synchronize(stream::CuStream=stream())
if use_nonblocking_synchronization
if !fast_synchronization(isdone, stream)
function synchronize(stream::CuStream=stream(); blocking::Bool=false, spin::Bool=true)
if use_nonblocking_synchronization && !blocking
if !spin || !spinning_synchronization(isdone, stream)
nonblocking_synchronize(stream)
end
end
Expand All @@ -272,9 +272,9 @@ function synchronize(stream::CuStream=stream())
check_exceptions()
end

function synchronize(event::CuEvent)
if use_nonblocking_synchronization
fast_synchronization(isdone, event)
function synchronize(event::CuEvent; blocking::Bool=false, spin::Bool=true)
if use_nonblocking_synchronization && !blocking
spin && spinning_synchronization(isdone, event)
end
cuEventSynchronize(event)
end
Expand Down
4 changes: 2 additions & 2 deletions perf/byval.jl
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,11 @@ function main()
y1 = [similar(x1[1]) for i = 1:num_z_slices]

# reference down to bones add on GPU
results["reference"] = @benchmark CUDA.@sync add!($y1[1], $x1[1], $x2[1])
results["reference"] = @benchmark CUDA.@sync blocking=true add!($y1[1], $x1[1], $x2[1])

# adding arrays in an array
for slices = 1:num_z_slices
results["slices=$slices"] = @benchmark CUDA.@sync add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices])
results["slices=$slices"] = @benchmark CUDA.@sync blocking=true add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices])
end

# BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
Expand Down
14 changes: 14 additions & 0 deletions perf/cuda.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Benchmarks for the synchronization entry points, registered under SUITE["cuda"].
group = addgroup!(SUITE, "cuda")

let sync_group = addgroup!(group, "synchronization")
    # synchronization of a stream via `synchronize`
    let stream_group = addgroup!(sync_group, "stream")
        stream_group["blocking"] = @benchmarkable synchronize(blocking=true)
        stream_group["auto"] = @benchmarkable synchronize()
        stream_group["nonblocking"] = @benchmarkable synchronize(spin=false)
    end

    # synchronization via `device_synchronize`
    let context_group = addgroup!(sync_group, "context")
        context_group["blocking"] = @benchmarkable device_synchronize(blocking=true)
        context_group["auto"] = @benchmarkable device_synchronize()
        context_group["nonblocking"] = @benchmarkable device_synchronize(spin=false)
    end
end
2 changes: 1 addition & 1 deletion perf/cudadevrt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ function main()
x2 = cu(randn(Float32, (1, n)) .+ Float32(0.5))
y1 = similar(x1)

results = @benchmark CUDA.@sync add!($y1, $x1, $x2)
results = @benchmark CUDA.@sync blocking=true add!($y1, $x1, $x2)

# BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
CUDA.unsafe_free!(x1)
Expand Down
3 changes: 2 additions & 1 deletion perf/runbenchmarks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ end
# convenience macro to create a benchmark that requires synchronizing the GPU
macro async_benchmarkable(ex...)
quote
@benchmarkable CUDA.@sync $(ex...)
@benchmarkable CUDA.@sync blocking=true $(ex...)
end
end

Expand All @@ -30,6 +30,7 @@ SUITE = BenchmarkGroup()

# NOTE: don't use spaces in benchmark names (tobami/codespeed#256)

include("cuda.jl")
include("kernel.jl")
include("array.jl")

Expand Down
4 changes: 2 additions & 2 deletions perf/volumerhs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -255,8 +255,8 @@ function main()
$(Base.format_bytes(CUDA.memory(kernel).shared)) shared memory,
$(Base.format_bytes(CUDA.memory(kernel).constant)) constant memory"""
results = @benchmark begin
CUDA.@sync $kernel($rhs, $Q, $vgeo, $(DFloat(grav)), $D, $nelem;
threads=$threads, blocks=$nelem)
CUDA.@sync blocking=true $kernel($rhs, $Q, $vgeo, $(DFloat(grav)), $D, $nelem;
threads=$threads, blocks=$nelem)
end

# BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
Expand Down
16 changes: 13 additions & 3 deletions src/utilities.jl
Original file line number Diff line number Diff line change
@@ -1,8 +1,15 @@
"""
@sync ex
@sync [blocking=false] ex
Run expression `ex` and synchronize the GPU afterwards.
The `blocking` keyword argument determines how synchronization is performed. By default,
non-blocking synchronization will be used, which gives other Julia tasks a chance to run
while waiting for the GPU to finish. This may increase latency, so for short operations,
or when benchmarking code that does not use multiple tasks, it may be beneficial to use
blocking synchronization instead by setting `blocking=true`. Blocking synchronization
can also be enabled globally by changing the `nonblocking_synchronization` preference.
See also: [`synchronize`](@ref).
"""
macro sync(ex...)
Expand All @@ -11,19 +18,22 @@ macro sync(ex...)
kwargs = ex[1:end-1]

# decode keyword arguments
blocking = false
for kwarg in kwargs
Meta.isexpr(kwarg, :(=)) || error("Invalid keyword argument $kwarg")
key, val = kwarg.args
if key == :blocking
Base.depwarn("the blocking keyword to @sync has been deprecated", :sync)
isa(val, Bool) ||
error("Invalid value for keyword argument $kwarg; expected Bool, got $(val)")
blocking = val
else
error("Unknown keyword argument $kwarg")
end
end

quote
local ret = $(esc(code))
synchronize()
synchronize(; blocking=$blocking)
ret
end
end
Expand Down
2 changes: 2 additions & 0 deletions test/core/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ end
end
@test t >= 0
@test ret == 42

CUDA.@sync blocking=true identity(nothing)
end

@testset "versioninfo" begin
Expand Down

0 comments on commit d95ba8e

Please sign in to comment.