From 49da401f8d087086451a472c43f7fe03b7deafff Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Fri, 25 Aug 2023 15:21:38 +0200
Subject: [PATCH 1/2] Re-introduce the 'blocking' kwargs to at-sync.

This can be used to force a blocking, but low-latency synchronization,
e.g., when benchmarking code that uses a single task.
---
 lib/cudadrv/synchronization.jl | 24 ++++++++++++------------
 perf/byval.jl                  |  4 ++--
 perf/cudadevrt.jl              |  2 +-
 perf/runbenchmarks.jl          |  2 +-
 perf/volumerhs.jl              |  4 ++--
 src/utilities.jl               | 16 +++++++++++++---
 test/core/utils.jl             |  2 ++
 7 files changed, 33 insertions(+), 21 deletions(-)

diff --git a/lib/cudadrv/synchronization.jl b/lib/cudadrv/synchronization.jl
index d512650511..31c6fe38cb 100644
--- a/lib/cudadrv/synchronization.jl
+++ b/lib/cudadrv/synchronization.jl
@@ -164,8 +164,8 @@ function nonblocking_synchronize(val)
     return
 end
 
-function device_synchronize()
-    if use_nonblocking_synchronization
+function device_synchronize(; blocking::Bool=false)
+    if use_nonblocking_synchronization && !blocking
         if fast_synchronization(isdone, legacy_stream())
             cuCtxSynchronize()
         else
@@ -178,8 +178,8 @@ function device_synchronize()
     check_exceptions()
 end
 
-function synchronize(stream::CuStream=stream())
-    if use_nonblocking_synchronization
+function synchronize(stream::CuStream=stream(); blocking::Bool=false)
+    if use_nonblocking_synchronization && !blocking
         if fast_synchronization(isdone, stream)
             cuStreamSynchronize(stream)
         else
@@ -192,8 +192,8 @@ function synchronize(stream::CuStream=stream())
     check_exceptions()
 end
 
-function synchronize(event::CuEvent)
-    if use_nonblocking_synchronization
+function synchronize(event::CuEvent; blocking::Bool=false)
+    if use_nonblocking_synchronization && !blocking
         if fast_synchronization(isdone, event)
             cuEventSynchronize(event)
         else
@@ -249,8 +249,8 @@ function nonblocking_synchronize(stream::CuStream)
     return
 end
 
-function device_synchronize()
-    if use_nonblocking_synchronization
+function device_synchronize(; blocking::Bool=false)
+    if use_nonblocking_synchronization && !blocking
         stream = legacy_stream()
         if !fast_synchronization(isdone, stream)
             nonblocking_synchronize(stream)
@@ -261,8 +261,8 @@ function device_synchronize()
     check_exceptions()
 end
 
-function synchronize(stream::CuStream=stream())
-    if use_nonblocking_synchronization
+function synchronize(stream::CuStream=stream(); blocking::Bool=false)
+    if use_nonblocking_synchronization && !blocking
         if !fast_synchronization(isdone, stream)
             nonblocking_synchronize(stream)
         end
@@ -272,8 +272,8 @@ function synchronize(stream::CuStream=stream())
     check_exceptions()
 end
 
-function synchronize(event::CuEvent)
-    if use_nonblocking_synchronization
+function synchronize(event::CuEvent; blocking::Bool=false)
+    if use_nonblocking_synchronization && !blocking
         fast_synchronization(isdone, event)
     end
     cuEventSynchronize(event)
diff --git a/perf/byval.jl b/perf/byval.jl
index 2ad777319e..d32d62a9c6 100644
--- a/perf/byval.jl
+++ b/perf/byval.jl
@@ -59,11 +59,11 @@ function main()
     y1 = [similar(x1[1]) for i = 1:num_z_slices]
 
     # reference down to bones add on GPU
-    results["reference"] = @benchmark CUDA.@sync add!($y1[1], $x1[1], $x2[1])
+    results["reference"] = @benchmark CUDA.@sync blocking=true add!($y1[1], $x1[1], $x2[1])
 
     # adding arrays in an array
     for slices = 1:num_z_slices
-        results["slices=$slices"] = @benchmark CUDA.@sync add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices])
+        results["slices=$slices"] = @benchmark CUDA.@sync blocking=true add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices])
     end
 
     # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
diff --git a/perf/cudadevrt.jl b/perf/cudadevrt.jl
index f166dd373a..08348fab5a 100644
--- a/perf/cudadevrt.jl
+++ b/perf/cudadevrt.jl
@@ -26,7 +26,7 @@ function main()
     x2 = cu(randn(Float32, (1, n)) .+ Float32(0.5))
     y1 = similar(x1)
 
-    results = @benchmark CUDA.@sync add!($y1, $x1, $x2)
+    results = @benchmark CUDA.@sync blocking=true add!($y1, $x1, $x2)
 
     # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
     CUDA.unsafe_free!(x1)
diff --git a/perf/runbenchmarks.jl b/perf/runbenchmarks.jl
index 9e8ef68608..1893c55f14 100644
--- a/perf/runbenchmarks.jl
+++ b/perf/runbenchmarks.jl
@@ -17,7 +17,7 @@ end
 # convenience macro to create a benchmark that requires synchronizing the GPU
 macro async_benchmarkable(ex...)
     quote
-        @benchmarkable CUDA.@sync $(ex...)
+        @benchmarkable CUDA.@sync blocking=true $(ex...)
     end
 end
 
diff --git a/perf/volumerhs.jl b/perf/volumerhs.jl
index 25841bb95f..5c7737f578 100644
--- a/perf/volumerhs.jl
+++ b/perf/volumerhs.jl
@@ -255,8 +255,8 @@ function main()
                 $(Base.format_bytes(CUDA.memory(kernel).shared)) shared memory,
                 $(Base.format_bytes(CUDA.memory(kernel).constant)) constant memory"""
     results = @benchmark begin
-        CUDA.@sync $kernel($rhs, $Q, $vgeo, $(DFloat(grav)), $D, $nelem;
-                           threads=$threads, blocks=$nelem)
+        CUDA.@sync blocking=true $kernel($rhs, $Q, $vgeo, $(DFloat(grav)), $D, $nelem;
+                                         threads=$threads, blocks=$nelem)
     end
 
     # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
diff --git a/src/utilities.jl b/src/utilities.jl
index 1cce3bb07f..808694f565 100644
--- a/src/utilities.jl
+++ b/src/utilities.jl
@@ -1,8 +1,15 @@
 """
-    @sync ex
+    @sync [blocking=false] ex
 
 Run expression `ex` and synchronize the GPU afterwards.
 
+The `blocking` keyword argument determines how synchronization is performed. By default,
+non-blocking synchronization will be used, which gives other Julia tasks a chance to run
+while waiting for the GPU to finish. This may increase latency, so for short operations,
+or when benchmarking code that does not use multiple tasks, it may be beneficial to use
+blocking synchronization instead by setting `blocking=true`. Blocking synchronization
+can also be enabled globally by changing the `nonblocking_synchronization` preference.
+
 See also: [`synchronize`](@ref).
 """
 macro sync(ex...)
@@ -11,11 +18,14 @@ macro sync(ex...)
     kwargs = ex[1:end-1]
 
     # decode keyword arguments
+    blocking = false
     for kwarg in kwargs
         Meta.isexpr(kwarg, :(=)) || error("Invalid keyword argument $kwarg")
         key, val = kwarg.args
         if key == :blocking
-            Base.depwarn("the blocking keyword to @sync has been deprecated", :sync)
+            isa(val, Bool) ||
+                error("Invalid value for keyword argument $kwarg; expected Bool, got $(val)")
+            blocking = val
         else
             error("Unknown keyword argument $kwarg")
         end
@@ -23,7 +33,7 @@ macro sync(ex...)
 
     quote
         local ret = $(esc(code))
-        synchronize()
+        synchronize(; blocking=$blocking)
         ret
     end
 end
diff --git a/test/core/utils.jl b/test/core/utils.jl
index 8793fd4d45..ae57edbb7b 100644
--- a/test/core/utils.jl
+++ b/test/core/utils.jl
@@ -29,6 +29,8 @@ end
   end
   @test t >= 0
   @test ret == 42
+
+  CUDA.@sync blocking=true identity(nothing)
 end
 
 @testset "versioninfo" begin

From cf60dae85262ef8a0114dc29ab40909e5f669557 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 30 Aug 2023 12:37:07 +0200
Subject: [PATCH 2/2] Add synchronization benchmarks.

[skip tests]
---
 lib/cudadrv/synchronization.jl | 26 +++++++++++++-------------
 perf/cuda.jl                   | 14 ++++++++++++++
 perf/runbenchmarks.jl          |  1 +
 3 files changed, 28 insertions(+), 13 deletions(-)
 create mode 100644 perf/cuda.jl

diff --git a/lib/cudadrv/synchronization.jl b/lib/cudadrv/synchronization.jl
index 31c6fe38cb..8e200437d7 100644
--- a/lib/cudadrv/synchronization.jl
+++ b/lib/cudadrv/synchronization.jl
@@ -74,7 +74,7 @@ Base.unlock(c::BidirectionalChannel) = unlock(c.cond_take)
 # the synchronization, when it returns true (indicating that the object is synchronized)
 # the actual synchronization API should be called again.
 
-function fast_synchronization(f, obj)
+function spinning_synchronization(f, obj)
     # fast path
     f(obj) && return true
 
@@ -164,9 +164,9 @@ function nonblocking_synchronize(val)
     return
 end
 
-function device_synchronize(; blocking::Bool=false)
+function device_synchronize(; blocking::Bool=false, spin::Bool=true)
     if use_nonblocking_synchronization && !blocking
-        if fast_synchronization(isdone, legacy_stream())
+        if spin && spinning_synchronization(isdone, legacy_stream())
             cuCtxSynchronize()
         else
             nonblocking_synchronize(context())
@@ -178,9 +178,9 @@ function device_synchronize(; blocking::Bool=false)
     check_exceptions()
 end
 
-function synchronize(stream::CuStream=stream(); blocking::Bool=false)
+function synchronize(stream::CuStream=stream(); blocking::Bool=false, spin::Bool=true)
     if use_nonblocking_synchronization && !blocking
-        if fast_synchronization(isdone, stream)
+        if spin && spinning_synchronization(isdone, stream)
             cuStreamSynchronize(stream)
         else
             nonblocking_synchronize(stream)
@@ -192,9 +192,9 @@ function synchronize(stream::CuStream=stream(); blocking::Bool=false)
     check_exceptions()
 end
 
-function synchronize(event::CuEvent; blocking::Bool=false)
+function synchronize(event::CuEvent; blocking::Bool=false, spin::Bool=true)
     if use_nonblocking_synchronization && !blocking
-        if fast_synchronization(isdone, event)
+        if spin && spinning_synchronization(isdone, event)
             cuEventSynchronize(event)
         else
             nonblocking_synchronize(event)
@@ -249,10 +249,10 @@ function nonblocking_synchronize(stream::CuStream)
     return
 end
 
-function device_synchronize(; blocking::Bool=false)
+function device_synchronize(; blocking::Bool=false, spin::Bool=true)
     if use_nonblocking_synchronization && !blocking
         stream = legacy_stream()
-        if !fast_synchronization(isdone, stream)
+        if !spin || !spinning_synchronization(isdone, stream)
             nonblocking_synchronize(stream)
         end
     end
@@ -261,9 +261,9 @@ function device_synchronize(; blocking::Bool=false)
     check_exceptions()
 end
 
-function synchronize(stream::CuStream=stream(); blocking::Bool=false)
+function synchronize(stream::CuStream=stream(); blocking::Bool=false, spin::Bool=true)
     if use_nonblocking_synchronization && !blocking
-        if !fast_synchronization(isdone, stream)
+        if !spin || !spinning_synchronization(isdone, stream)
             nonblocking_synchronize(stream)
         end
     end
@@ -272,9 +272,9 @@ function synchronize(stream::CuStream=stream(); blocking::Bool=false)
     check_exceptions()
 end
 
-function synchronize(event::CuEvent; blocking::Bool=false)
+function synchronize(event::CuEvent; blocking::Bool=false, spin::Bool=true)
     if use_nonblocking_synchronization && !blocking
-        fast_synchronization(isdone, event)
+        spin && spinning_synchronization(isdone, event)
     end
     cuEventSynchronize(event)
 end
diff --git a/perf/cuda.jl b/perf/cuda.jl
new file mode 100644
index 0000000000..5dbcbf0abc
--- /dev/null
+++ b/perf/cuda.jl
@@ -0,0 +1,14 @@
+group = addgroup!(SUITE, "cuda")  # top-level "cuda" benchmark group
+
+let group = addgroup!(group, "synchronization")  # blocking / default / no-spin variants
+    let group = addgroup!(group, "stream")  # synchronize() on its default stream argument
+        group["blocking"] = @benchmarkable synchronize(blocking=true)
+        group["auto"] = @benchmarkable synchronize()
+        group["nonblocking"] = @benchmarkable synchronize(spin=false)
+    end
+    let group = addgroup!(group, "context")  # device_synchronize()
+        group["blocking"] = @benchmarkable device_synchronize(blocking=true)
+        group["auto"] = @benchmarkable device_synchronize()
+        group["nonblocking"] = @benchmarkable device_synchronize(spin=false)
+    end
+end
diff --git a/perf/runbenchmarks.jl b/perf/runbenchmarks.jl
index 1893c55f14..5969dcd9b7 100644
--- a/perf/runbenchmarks.jl
+++ b/perf/runbenchmarks.jl
@@ -30,6 +30,7 @@ SUITE = BenchmarkGroup()
 
 # NOTE: don't use spaces in benchmark names (tobami/codespeed#256)
 
+include("cuda.jl")
 include("kernel.jl")
 include("array.jl")