diff --git a/lib/cudadrv/context.jl b/lib/cudadrv/context.jl index d8eec2420e..465d760d95 100644 --- a/lib/cudadrv/context.jl +++ b/lib/cudadrv/context.jl @@ -322,7 +322,9 @@ device_synchronize() = nonblocking_synchronize() @inline function nonblocking_synchronize() # perform as much of the sync as possible without blocking in CUDA. # XXX: remove this using a yield callback, or by synchronizing on a dedicated thread? - nonblocking_synchronize(legacy_stream()) + if CUDA._use_nonblocking_synchronize[] + nonblocking_synchronize(legacy_stream()) + end # even though the GPU should be idle now, CUDA hooks work to the actual API call. # see NVIDIA bug #3383169 for more details. diff --git a/lib/cudadrv/events.jl b/lib/cudadrv/events.jl index 8c4192d9c8..a9abeaa01a 100644 --- a/lib/cudadrv/events.jl +++ b/lib/cudadrv/events.jl @@ -52,7 +52,9 @@ Waits for an event to complete. function synchronize(e::CuEvent) # perform as much of the sync as possible without blocking in CUDA. # XXX: remove this using a yield callback, or by synchronizing on a dedicated thread? - nonblocking_synchronize(e) + if CUDA._use_nonblocking_synchronize[] + nonblocking_synchronize(e) + end # even though the GPU should be idle now, CUDA hooks work to the actual API call. # see NVIDIA bug #3383169 for more details. diff --git a/lib/cudadrv/stream.jl b/lib/cudadrv/stream.jl index 67f455a740..4e63cf463a 100644 --- a/lib/cudadrv/stream.jl +++ b/lib/cudadrv/stream.jl @@ -127,7 +127,9 @@ function synchronize(stream::CuStream=stream(); blocking=nothing) # perform as much of the sync as possible without blocking in CUDA. # XXX: remove this using a yield callback, or by synchronizing on a dedicated stream? - nonblocking_synchronize(stream) + if CUDA._use_nonblocking_synchronize[] + nonblocking_synchronize(stream) + end # even though the GPU should be idle now, CUDA hooks work to the actual API call. # see NVIDIA bug #3383169 for more details. diff --git a/src/initialization.jl b/src/initialization.jl index 9fd4455ff1..c0ddd1b0df 100644 --- a/src/initialization.jl +++ b/src/initialization.jl @@ -28,6 +28,8 @@ function functional(show_reason::Bool=false) return false end +const _use_nonblocking_synchronize = Ref{Bool}(true) + function __init__() precompiling = ccall(:jl_generating_output, Cint, ()) != 0 @@ -183,6 +185,10 @@ function __init__() end end + if haskey(ENV, "JULIA_CUDA_NONBLOCKING_SYNCHRONIZE") + _use_nonblocking_synchronize[] = parse(Bool, ENV["JULIA_CUDA_NONBLOCKING_SYNCHRONIZE"]) + end + _initialized[] = true end