Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Re-introduce the 'blocking' kwargs to at-sync. #2060

Merged
merged 2 commits into from
Aug 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 19 additions & 19 deletions lib/cudadrv/synchronization.jl
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ Base.unlock(c::BidirectionalChannel) = unlock(c.cond_take)
# the synchronization, when it returns true (indicating that the object is synchronized)
# the actual synchronization API should be called again.

function fast_synchronization(f, obj)
function spinning_synchronization(f, obj)
# fast path
f(obj) && return true

Expand Down Expand Up @@ -164,9 +164,9 @@ function nonblocking_synchronize(val)
return
end

function device_synchronize()
if use_nonblocking_synchronization
if fast_synchronization(isdone, legacy_stream())
function device_synchronize(; blocking::Bool=false, spin::Bool=true)
if use_nonblocking_synchronization && !blocking
if spin && spinning_synchronization(isdone, legacy_stream())
cuCtxSynchronize()
else
nonblocking_synchronize(context())
Expand All @@ -178,9 +178,9 @@ function device_synchronize()
check_exceptions()
end

function synchronize(stream::CuStream=stream())
if use_nonblocking_synchronization
if fast_synchronization(isdone, stream)
function synchronize(stream::CuStream=stream(); blocking::Bool=false, spin::Bool=true)
if use_nonblocking_synchronization && !blocking
if spin && spinning_synchronization(isdone, stream)
cuStreamSynchronize(stream)
else
nonblocking_synchronize(stream)
Expand All @@ -192,9 +192,9 @@ function synchronize(stream::CuStream=stream())
check_exceptions()
end

function synchronize(event::CuEvent)
if use_nonblocking_synchronization
if fast_synchronization(isdone, event)
function synchronize(event::CuEvent; blocking::Bool=false, spin::Bool=true)
if use_nonblocking_synchronization && !blocking
if spin && spinning_synchronization(isdone, event)
cuEventSynchronize(event)
else
nonblocking_synchronize(event)
Expand Down Expand Up @@ -249,10 +249,10 @@ function nonblocking_synchronize(stream::CuStream)
return
end

function device_synchronize()
if use_nonblocking_synchronization
function device_synchronize(; blocking::Bool=false, spin::Bool=true)
if use_nonblocking_synchronization && !blocking
stream = legacy_stream()
if !fast_synchronization(isdone, stream)
if !spin || !spinning_synchronization(isdone, stream)
nonblocking_synchronize(stream)
end
end
Expand All @@ -261,9 +261,9 @@ function device_synchronize()
check_exceptions()
end

function synchronize(stream::CuStream=stream())
if use_nonblocking_synchronization
if !fast_synchronization(isdone, stream)
function synchronize(stream::CuStream=stream(); blocking::Bool=false, spin::Bool=true)
if use_nonblocking_synchronization && !blocking
if !spin || !spinning_synchronization(isdone, stream)
nonblocking_synchronize(stream)
end
end
Expand All @@ -272,9 +272,9 @@ function synchronize(stream::CuStream=stream())
check_exceptions()
end

function synchronize(event::CuEvent)
if use_nonblocking_synchronization
fast_synchronization(isdone, event)
function synchronize(event::CuEvent; blocking::Bool=false, spin::Bool=true)
if use_nonblocking_synchronization && !blocking
spin && spinning_synchronization(isdone, event)
end
cuEventSynchronize(event)
end
Expand Down
4 changes: 2 additions & 2 deletions perf/byval.jl
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,11 @@ function main()
y1 = [similar(x1[1]) for i = 1:num_z_slices]

# reference down to bones add on GPU
results["reference"] = @benchmark CUDA.@sync add!($y1[1], $x1[1], $x2[1])
results["reference"] = @benchmark CUDA.@sync blocking=true add!($y1[1], $x1[1], $x2[1])

# adding arrays in an array
for slices = 1:num_z_slices
results["slices=$slices"] = @benchmark CUDA.@sync add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices])
results["slices=$slices"] = @benchmark CUDA.@sync blocking=true add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices])
end

# BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
Expand Down
14 changes: 14 additions & 0 deletions perf/cuda.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Benchmarks of the synchronization entry points themselves, registered under a "cuda"
# group of the benchmark SUITE.
group = addgroup!(SUITE, "cuda")

# Each nested `let` rebinds `group` to a freshly added child group, so the entries below
# are registered as cuda -> synchronization -> {stream, context} -> {blocking, auto,
# nonblocking}.
let group = addgroup!(group, "synchronization")
# Stream synchronization: `synchronize` is called without a positional argument, so it
# uses its default stream argument.
let group = addgroup!(group, "stream")
# blocking=true makes `synchronize` bypass its non-blocking code path.
group["blocking"] = @benchmarkable synchronize(blocking=true)
# Default keyword arguments (blocking=false, spin=true).
group["auto"] = @benchmarkable synchronize()
# spin=false skips the spinning check, so the non-blocking synchronization is used
# whenever that path is enabled.
group["nonblocking"] = @benchmarkable synchronize(spin=false)
end
# Context-wide synchronization via `device_synchronize`, with the same three keyword
# configurations as the stream benchmarks above.
let group = addgroup!(group, "context")
group["blocking"] = @benchmarkable device_synchronize(blocking=true)
group["auto"] = @benchmarkable device_synchronize()
group["nonblocking"] = @benchmarkable device_synchronize(spin=false)
end
end
2 changes: 1 addition & 1 deletion perf/cudadevrt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ function main()
x2 = cu(randn(Float32, (1, n)) .+ Float32(0.5))
y1 = similar(x1)

results = @benchmark CUDA.@sync add!($y1, $x1, $x2)
results = @benchmark CUDA.@sync blocking=true add!($y1, $x1, $x2)

# BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
CUDA.unsafe_free!(x1)
Expand Down
3 changes: 2 additions & 1 deletion perf/runbenchmarks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ end
# convenience macro to create a benchmark that requires synchronizing the GPU
macro async_benchmarkable(ex...)
quote
@benchmarkable CUDA.@sync $(ex...)
@benchmarkable CUDA.@sync blocking=true $(ex...)
end
end

Expand All @@ -30,6 +30,7 @@ SUITE = BenchmarkGroup()

# NOTE: don't use spaces in benchmark names (tobami/codespeed#256)

include("cuda.jl")
include("kernel.jl")
include("array.jl")

Expand Down
4 changes: 2 additions & 2 deletions perf/volumerhs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -255,8 +255,8 @@ function main()
$(Base.format_bytes(CUDA.memory(kernel).shared)) shared memory,
$(Base.format_bytes(CUDA.memory(kernel).constant)) constant memory"""
results = @benchmark begin
CUDA.@sync $kernel($rhs, $Q, $vgeo, $(DFloat(grav)), $D, $nelem;
threads=$threads, blocks=$nelem)
CUDA.@sync blocking=true $kernel($rhs, $Q, $vgeo, $(DFloat(grav)), $D, $nelem;
threads=$threads, blocks=$nelem)
end

# BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
Expand Down
16 changes: 13 additions & 3 deletions src/utilities.jl
Original file line number Diff line number Diff line change
@@ -1,8 +1,15 @@
"""
@sync ex
@sync [blocking=false] ex

Run expression `ex` and synchronize the GPU afterwards.

The `blocking` keyword argument determines how synchronization is performed. By default,
non-blocking synchronization will be used, which gives other Julia tasks a chance to run
while waiting for the GPU to finish. This may increase latency, so for short operations,
or when benchmarking code that does not use multiple tasks, it may be beneficial to use
blocking synchronization instead by setting `blocking=true`. Blocking synchronization
can also be enabled globally by changing the `nonblocking_synchronization` preference.

See also: [`synchronize`](@ref).
"""
macro sync(ex...)
Expand All @@ -11,19 +18,22 @@ macro sync(ex...)
kwargs = ex[1:end-1]

# decode keyword arguments
blocking = false
for kwarg in kwargs
Meta.isexpr(kwarg, :(=)) || error("Invalid keyword argument $kwarg")
key, val = kwarg.args
if key == :blocking
Base.depwarn("the blocking keyword to @sync has been deprecated", :sync)
isa(val, Bool) ||
error("Invalid value for keyword argument $kwarg; expected Bool, got $(val)")
blocking = val
else
error("Unknown keyword argument $kwarg")
end
end

quote
local ret = $(esc(code))
synchronize()
synchronize(; blocking=$blocking)
ret
end
end
Expand Down
2 changes: 2 additions & 0 deletions test/core/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ end
end
@test t >= 0
@test ret == 42

CUDA.@sync blocking=true identity(nothing)
end

@testset "versioninfo" begin
Expand Down