Re-introduce the 'blocking' kwarg to at-sync. (#2060)

This can be used to force a blocking, but low-latency, synchronization, e.g., when benchmarking code that uses a single task.
JuliaGPU · Aug 30, 2023 · d95ba8e · d95ba8e
1 parent 0cb5659
commit d95ba8e
Show file tree

Hide file tree

Showing 8 changed files with 55 additions and 28 deletions.
diff --git a/lib/cudadrv/synchronization.jl b/lib/cudadrv/synchronization.jl
@@ -74,7 +74,7 @@ Base.unlock(c::BidirectionalChannel) = unlock(c.cond_take)
 # the synchronization, when it returns true (indicating that the object is synchronized)
 # the actual synchronization API should be called again.
 
-function fast_synchronization(f, obj)
+function spinning_synchronization(f, obj)
     # fast path
     f(obj) && return true
 
@@ -164,9 +164,9 @@ function nonblocking_synchronize(val)
     return
 end
 
-function device_synchronize()
-    if use_nonblocking_synchronization
-        if fast_synchronization(isdone, legacy_stream())
+function device_synchronize(; blocking::Bool=false, spin::Bool=true)
+    if use_nonblocking_synchronization && !blocking
+        if spin && spinning_synchronization(isdone, legacy_stream())
             cuCtxSynchronize()
         else
             nonblocking_synchronize(context())
@@ -178,9 +178,9 @@ function device_synchronize()
     check_exceptions()
 end
 
-function synchronize(stream::CuStream=stream())
-    if use_nonblocking_synchronization
-        if fast_synchronization(isdone, stream)
+function synchronize(stream::CuStream=stream(); blocking::Bool=false, spin::Bool=true)
+    if use_nonblocking_synchronization && !blocking
+        if spin && spinning_synchronization(isdone, stream)
             cuStreamSynchronize(stream)
         else
             nonblocking_synchronize(stream)
@@ -192,9 +192,9 @@ function synchronize(stream::CuStream=stream())
     check_exceptions()
 end
 
-function synchronize(event::CuEvent)
-    if use_nonblocking_synchronization
-        if fast_synchronization(isdone, event)
+function synchronize(event::CuEvent; blocking::Bool=false, spin::Bool=true)
+    if use_nonblocking_synchronization && !blocking
+        if spin && spinning_synchronization(isdone, event)
             cuEventSynchronize(event)
         else
             nonblocking_synchronize(event)
@@ -249,10 +249,10 @@ function nonblocking_synchronize(stream::CuStream)
     return
 end
 
-function device_synchronize()
-    if use_nonblocking_synchronization
+function device_synchronize(; blocking::Bool=false, spin::Bool=true)
+    if use_nonblocking_synchronization && !blocking
         stream = legacy_stream()
-        if !fast_synchronization(isdone, stream)
+        if !spin || !spinning_synchronization(isdone, stream)
             nonblocking_synchronize(stream)
         end
     end
@@ -261,9 +261,9 @@ function device_synchronize()
     check_exceptions()
 end
 
-function synchronize(stream::CuStream=stream())
-    if use_nonblocking_synchronization
-        if !fast_synchronization(isdone, stream)
+function synchronize(stream::CuStream=stream(); blocking::Bool=false, spin::Bool=true)
+    if use_nonblocking_synchronization && !blocking
+        if !spin || !spinning_synchronization(isdone, stream)
             nonblocking_synchronize(stream)
         end
     end
@@ -272,9 +272,9 @@ function synchronize(stream::CuStream=stream())
     check_exceptions()
 end
 
-function synchronize(event::CuEvent)
-    if use_nonblocking_synchronization
-        fast_synchronization(isdone, event)
+function synchronize(event::CuEvent; blocking::Bool=false, spin::Bool=true)
+    if use_nonblocking_synchronization && !blocking
+        spin && spinning_synchronization(isdone, event)
     end
     cuEventSynchronize(event)
 end

diff --git a/perf/byval.jl b/perf/byval.jl
@@ -59,11 +59,11 @@ function main()
     y1 = [similar(x1[1]) for i = 1:num_z_slices]
 
     # reference down to bones add on GPU
-    results["reference"] = @benchmark CUDA.@sync add!($y1[1], $x1[1], $x2[1])
+    results["reference"] = @benchmark CUDA.@sync blocking=true add!($y1[1], $x1[1], $x2[1])
 
     # adding arrays in an array
     for slices = 1:num_z_slices
-        results["slices=$slices"] = @benchmark CUDA.@sync add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices])
+        results["slices=$slices"] = @benchmark CUDA.@sync blocking=true add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices])
     end
 
     # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them

diff --git a/perf/cuda.jl b/perf/cuda.jl
@@ -0,0 +1,14 @@
+group = addgroup!(SUITE, "cuda")
+
+let group = addgroup!(group, "synchronization")
+    let group = addgroup!(group, "stream")
+        group["blocking"] = @benchmarkable synchronize(blocking=true)
+        group["auto"] = @benchmarkable synchronize()
+        group["nonblocking"] = @benchmarkable synchronize(spin=false)
+    end
+    let group = addgroup!(group, "context")
+        group["blocking"] = @benchmarkable device_synchronize(blocking=true)
+        group["auto"] = @benchmarkable device_synchronize()
+        group["nonblocking"] = @benchmarkable device_synchronize(spin=false)
+    end
+end
diff --git a/perf/cudadevrt.jl b/perf/cudadevrt.jl
@@ -26,7 +26,7 @@ function main()
     x2 = cu(randn(Float32, (1, n)) .+ Float32(0.5))
     y1 = similar(x1)
 
-    results = @benchmark CUDA.@sync add!($y1, $x1, $x2)
+    results = @benchmark CUDA.@sync blocking=true add!($y1, $x1, $x2)
 
     # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
     CUDA.unsafe_free!(x1)

diff --git a/perf/runbenchmarks.jl b/perf/runbenchmarks.jl
@@ -17,7 +17,7 @@ end
 # convenience macro to create a benchmark that requires synchronizing the GPU
 macro async_benchmarkable(ex...)
     quote
-        @benchmarkable CUDA.@sync $(ex...)
+        @benchmarkable CUDA.@sync blocking=true $(ex...)
     end
 end
 
@@ -30,6 +30,7 @@ SUITE = BenchmarkGroup()
 
 # NOTE: don't use spaces in benchmark names (tobami/codespeed#256)
 
+include("cuda.jl")
 include("kernel.jl")
 include("array.jl")
 

diff --git a/perf/volumerhs.jl b/perf/volumerhs.jl
@@ -255,8 +255,8 @@ function main()
                 $(Base.format_bytes(CUDA.memory(kernel).shared)) shared memory,
                 $(Base.format_bytes(CUDA.memory(kernel).constant)) constant memory"""
     results = @benchmark begin
-        CUDA.@sync $kernel($rhs, $Q, $vgeo, $(DFloat(grav)), $D, $nelem;
-                           threads=$threads, blocks=$nelem)
+        CUDA.@sync blocking=true $kernel($rhs, $Q, $vgeo, $(DFloat(grav)), $D, $nelem;
+                                         threads=$threads, blocks=$nelem)
     end
 
     # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them

diff --git a/src/utilities.jl b/src/utilities.jl
@@ -1,8 +1,15 @@
 """
-    @sync ex
+    @sync [blocking=false] ex
 
 Run expression `ex` and synchronize the GPU afterwards.
 
+The `blocking` keyword argument determines how synchronization is performed. By default,
+non-blocking synchronization will be used, which gives other Julia tasks a chance to run
+while waiting for the GPU to finish. This may increase latency, so for short operations,
+or when benchmarking code that does not use multiple tasks, it may be beneficial to use
+blocking synchronization instead by setting `blocking=true`. Blocking synchronization
+can also be enabled globally by changing the `nonblocking_synchronization` preference.
+
 See also: [`synchronize`](@ref).
 """
 macro sync(ex...)
@@ -11,19 +18,22 @@ macro sync(ex...)
     kwargs = ex[1:end-1]
 
     # decode keyword arguments
+    blocking = false
     for kwarg in kwargs
         Meta.isexpr(kwarg, :(=)) || error("Invalid keyword argument $kwarg")
         key, val = kwarg.args
         if key == :blocking
-            Base.depwarn("the blocking keyword to @sync has been deprecated", :sync)
+            isa(val, Bool) ||
+                error("Invalid value for keyword argument $kwarg; expected Bool, got $(val)")
+            blocking = val
         else
             error("Unknown keyword argument $kwarg")
         end
     end
 
     quote
         local ret = $(esc(code))
-        synchronize()
+        synchronize(; blocking=$blocking)
         ret
     end
 end

diff --git a/test/core/utils.jl b/test/core/utils.jl
@@ -29,6 +29,8 @@ end
   end
   @test t >= 0
   @test ret == 42
+
+  CUDA.@sync blocking=true identity(nothing)
 end
 
 @testset "versioninfo" begin