Stream synchronization is slow when waiting on the event from CUDA #1910

Closed · utkinis opened this issue May 17, 2023 · 8 comments
Labels: bug

utkinis commented May 17, 2023

Describe the bug

Synchronizing streams in CUDA.jl can sometimes be slower by a factor of ~200 compared to CUDA C when waiting on the event signalled by CUDA, as implemented in `nonblocking_synchronize`:

event = Base.Event()
launch(; stream) do
    notify(event)
end
# if an error occurs, the callback may never fire, so use a timer to detect such cases
dev = device()
timer = Timer(0; interval=1)
Base.@sync begin
    Threads.@spawn try
        device!(dev)
        while true
            try
                Base.wait(timer)
            catch err
                err isa EOFError && break
                rethrow()
            end
            if unsafe_cuStreamQuery(stream) != ERROR_NOT_READY
                break
            end
        end
    finally
        notify(event)
    end
    Threads.@spawn begin
        Base.wait(event)
        close(timer)
    end
end

Screenshots from Nsight Systems

The CUDA C version for reference:

[screenshot]

The Julia version (note the gaps between consecutive kernel runs while waiting for the event):

[screenshot]

To reproduce

I use the following code to profile the CUDA.jl version:

using CUDA
using NVTX

function mycopy!(dst,src)
    I = (blockIdx().x-1)*blockDim().x + threadIdx().x
    if I <= length(dst)
        @inbounds dst[I] = src[I]
    end
    return
end

function main(N)
    dst = CuArray{Float64}(undef,N)
    src = CuArray{Float64}(undef,N)
    nthreads = 256
    nblocks  = cld(N,nthreads)

    # warmup
    @cuda threads=nthreads blocks=nblocks mycopy!(dst,src)
    CUDA.synchronize()

    GC.enable(false)

    # profile
    CUDA.Profile.start()
    NVTX.@range "total" begin
        for _ in 1:10
            NVTX.@range "copy"        @cuda threads=nthreads blocks=nblocks mycopy!(dst,src)
            NVTX.@range "synchronize" CUDA.synchronize()
        end
    end
    CUDA.Profile.stop()

    GC.enable(true)

    return
end

main(parse(Int,ARGS[1]))

And the following code for a reference C implementation:

#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
#include <nvtx3/nvToolsExt.h>

#include <stdlib.h>

__global__ void mycopy(double *dst, const double *src, const int n) {
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    if (idx < n) {
        dst[idx] = src[idx];
    }
    return;
}

void run(const int N) {
    double *dst = nullptr, *src = nullptr;
    int nthreads = 256;
    int nblocks  = (N+nthreads-1)/nthreads;

    cudaMalloc(&dst, N*sizeof(double));
    cudaMalloc(&src, N*sizeof(double));

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    cudaProfilerStart();
    nvtxRangePushA("total");
    for (int i = 0; i < 10; ++i) {
        nvtxRangePushA("copy")       ;mycopy<<<nblocks,nthreads,0,stream>>>(dst,src,N);nvtxRangePop();
        nvtxRangePushA("synchronize");cudaStreamSynchronize(stream)                   ;nvtxRangePop();
    }
    nvtxRangePop();
    cudaProfilerStop();

    cudaFree(dst);
    cudaFree(src);

    cudaStreamDestroy(stream);
}

int main(int argc, const char *argv[]) {
    run(atoi(argv[1]));
    return 0;
}
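
For reference, the C version can be compiled with nvcc and profiled along the same lines as the Julia command shown in a later comment; the file name and flags here are assumptions, not from the original report:

$ nvcc -O3 -o copy_bench_c copy_bench.cu -ldl
$ nsys profile --trace=cuda,nvtx --capture-range=cudaProfilerApi --capture-range-end=stop --output=copy_c ./copy_bench_c 70000000
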
Manifest.toml

# This file is machine-generated - editing it directly is not advised

julia_version = "1.9.0"
manifest_format = "2.0"
project_hash = "a5d5e756e5cdcbae1f9c5f15571cf03d6bb2fd55"

[[deps.AbstractFFTs]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "16b6dbc4cf7caee4e1e75c49485ec67b667098a0"
uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
version = "1.3.1"

    [deps.AbstractFFTs.extensions]
    AbstractFFTsChainRulesCoreExt = "ChainRulesCore"

    [deps.AbstractFFTs.weakdeps]
    ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"

[[deps.Adapt]]
deps = ["LinearAlgebra", "Requires"]
git-tree-sha1 = "76289dc51920fdc6e0013c872ba9551d54961c24"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
version = "3.6.2"
weakdeps = ["StaticArrays"]

    [deps.Adapt.extensions]
    AdaptStaticArraysExt = "StaticArrays"

[[deps.ArgTools]]
uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
version = "1.1.1"

[[deps.Artifacts]]
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"

[[deps.Atomix]]
deps = ["UnsafeAtomics"]
git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be"
uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
version = "0.1.0"

[[deps.BFloat16s]]
deps = ["LinearAlgebra", "Printf", "Random", "Test"]
git-tree-sha1 = "dbf84058d0a8cbbadee18d25cf606934b22d7c66"
uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
version = "0.4.2"

[[deps.Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"

[[deps.CEnum]]
git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90"
uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
version = "0.4.2"

[[deps.CUDA]]
deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CUDA_Driver_jll", "CUDA_Runtime_Discovery", "CUDA_Runtime_jll", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Preferences", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "UnsafeAtomicsLLVM"]
git-tree-sha1 = "280893f920654ebfaaaa1999fbd975689051f890"
uuid = "052768ef-5323-5732-b1bb-66c8b64840ba"
version = "4.2.0"

[[deps.CUDA_Driver_jll]]
deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"]
git-tree-sha1 = "498f45593f6ddc0adff64a9310bb6710e851781b"
uuid = "4ee394cb-3365-5eb0-8335-949819d2adfc"
version = "0.5.0+1"

[[deps.CUDA_Runtime_Discovery]]
deps = ["Libdl"]
git-tree-sha1 = "bcc4a23cbbd99c8535a5318455dcf0f2546ec536"
uuid = "1af6417a-86b4-443c-805f-a4643ffb695f"
version = "0.2.2"

[[deps.CUDA_Runtime_jll]]
deps = ["Artifacts", "CUDA_Driver_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
git-tree-sha1 = "5248d9c45712e51e27ba9b30eebec65658c6ce29"
uuid = "76a88914-d11a-5bdc-97e0-2f5a05c973a2"
version = "0.6.0+0"

[[deps.ColorTypes]]
deps = ["FixedPointNumbers", "Random"]
git-tree-sha1 = "eb7f0f8307f71fac7c606984ea5fb2817275d6e4"
uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
version = "0.11.4"

[[deps.Colors]]
deps = ["ColorTypes", "FixedPointNumbers", "Reexport"]
git-tree-sha1 = "fc08e5930ee9a4e03f84bfb5211cb54e7769758a"
uuid = "5ae59095-9a9b-59fe-a467-6f913c188581"
version = "0.12.10"

[[deps.CompilerSupportLibraries_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
version = "1.0.2+0"

[[deps.Dates]]
deps = ["Printf"]
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"

[[deps.DocStringExtensions]]
deps = ["LibGit2"]
git-tree-sha1 = "2fb1e02f2b635d0845df5d7c167fec4dd739b00d"
uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
version = "0.9.3"

[[deps.Downloads]]
deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"]
uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
version = "1.6.0"

[[deps.ExprTools]]
git-tree-sha1 = "c1d06d129da9f55715c6c212866f5b1bddc5fa00"
uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
version = "0.1.9"

[[deps.FileWatching]]
uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"

[[deps.FixedPointNumbers]]
deps = ["Statistics"]
git-tree-sha1 = "335bfdceacc84c5cdf16aadc768aa5ddfc5383cc"
uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
version = "0.8.4"

[[deps.GPUArrays]]
deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"]
git-tree-sha1 = "9ade6983c3dbbd492cf5729f865fe030d1541463"
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
version = "8.6.6"

[[deps.GPUArraysCore]]
deps = ["Adapt"]
git-tree-sha1 = "1cd7f0af1aa58abc02ea1d872953a97359cb87fa"
uuid = "46192b85-c4d5-4398-a991-12ede77f4527"
version = "0.1.4"

[[deps.GPUCompiler]]
deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"]
git-tree-sha1 = "5737dc242dadd392d934ee330c69ceff47f0259c"
uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
version = "0.19.4"

[[deps.InteractiveUtils]]
deps = ["Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"

[[deps.IrrationalConstants]]
git-tree-sha1 = "630b497eafcc20001bba38a4651b327dcfc491d2"
uuid = "92d709cd-6900-40b7-9082-c6be49f344b6"
version = "0.2.2"

[[deps.JLLWrappers]]
deps = ["Preferences"]
git-tree-sha1 = "abc9885a7ca2052a736a600f7fa66209f96506e1"
uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
version = "1.4.1"

[[deps.JuliaNVTXCallbacks_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "af433a10f3942e882d3c671aacb203e006a5808f"
uuid = "9c1d0b0a-7046-5b2e-a33f-ea22f176ac7e"
version = "0.2.1+0"

[[deps.KernelAbstractions]]
deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"]
git-tree-sha1 = "47be64f040a7ece575c2b5f53ca6da7b548d69f4"
uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
version = "0.9.4"

[[deps.LLVM]]
deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
git-tree-sha1 = "26a31cdd9f1f4ea74f649a7bf249703c687a953d"
uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
version = "5.1.0"

[[deps.LLVMExtra_jll]]
deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
git-tree-sha1 = "09b7505cc0b1cee87e5d4a26eea61d2e1b0dcd35"
uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
version = "0.0.21+0"

[[deps.LazyArtifacts]]
deps = ["Artifacts", "Pkg"]
uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3"

[[deps.LibCURL]]
deps = ["LibCURL_jll", "MozillaCACerts_jll"]
uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
version = "0.6.3"

[[deps.LibCURL_jll]]
deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
version = "7.84.0+0"

[[deps.LibGit2]]
deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"

[[deps.LibSSH2_jll]]
deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
version = "1.10.2+0"

[[deps.Libdl]]
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"

[[deps.LinearAlgebra]]
deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"]
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"

[[deps.LogExpFunctions]]
deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"]
git-tree-sha1 = "0a1b7c2863e44523180fdb3146534e265a91870b"
uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
version = "0.3.23"

    [deps.LogExpFunctions.extensions]
    LogExpFunctionsChainRulesCoreExt = "ChainRulesCore"
    LogExpFunctionsChangesOfVariablesExt = "ChangesOfVariables"
    LogExpFunctionsInverseFunctionsExt = "InverseFunctions"

    [deps.LogExpFunctions.weakdeps]
    ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
    ChangesOfVariables = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
    InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112"

[[deps.Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"

[[deps.MacroTools]]
deps = ["Markdown", "Random"]
git-tree-sha1 = "42324d08725e200c23d4dfb549e0d5d89dede2d2"
uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
version = "0.5.10"

[[deps.Markdown]]
deps = ["Base64"]
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"

[[deps.MbedTLS_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
version = "2.28.2+0"

[[deps.MozillaCACerts_jll]]
uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
version = "2022.10.11"

[[deps.NVTX]]
deps = ["Colors", "JuliaNVTXCallbacks_jll", "Libdl", "NVTX_jll"]
git-tree-sha1 = "c1bfdab07fa259baa4604a05862c43004dd3cd45"
uuid = "5da4648a-3479-48b8-97b9-01cb529c0a1f"
version = "0.3.1"

[[deps.NVTX_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "ce3269ed42816bf18d500c9f63418d4b0d9f5a3b"
uuid = "e98f9f5b-d649-5603-91fd-7774390e6439"
version = "3.1.0+2"

[[deps.NetworkOptions]]
uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
version = "1.2.0"

[[deps.OpenBLAS_jll]]
deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"]
uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
version = "0.3.21+4"

[[deps.OpenLibm_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "05823500-19ac-5b8b-9628-191a04bc5112"
version = "0.8.1+0"

[[deps.OpenSpecFun_jll]]
deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1"
uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
version = "0.5.5+0"

[[deps.Pkg]]
deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
version = "1.9.0"

[[deps.PrecompileTools]]
deps = ["Preferences"]
git-tree-sha1 = "259e206946c293698122f63e2b513a7c99a244e8"
uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
version = "1.1.1"

[[deps.Preferences]]
deps = ["TOML"]
git-tree-sha1 = "7eb1686b4f04b82f96ed7a4ea5890a4f0c7a09f1"
uuid = "21216c6a-2e73-6563-6e65-726566657250"
version = "1.4.0"

[[deps.Printf]]
deps = ["Unicode"]
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"

[[deps.REPL]]
deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"

[[deps.Random]]
deps = ["SHA", "Serialization"]
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"

[[deps.Random123]]
deps = ["Random", "RandomNumbers"]
git-tree-sha1 = "552f30e847641591ba3f39fd1bed559b9deb0ef3"
uuid = "74087812-796a-5b5d-8853-05524746bad3"
version = "1.6.1"

[[deps.RandomNumbers]]
deps = ["Random", "Requires"]
git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111"
uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143"
version = "1.5.3"

[[deps.Reexport]]
git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b"
uuid = "189a3867-3050-52da-a836-e630ba90ab69"
version = "1.2.2"

[[deps.Requires]]
deps = ["UUIDs"]
git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7"
uuid = "ae029012-a4dd-5104-9daa-d747884805df"
version = "1.3.0"

[[deps.SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
version = "0.7.0"

[[deps.Scratch]]
deps = ["Dates"]
git-tree-sha1 = "30449ee12237627992a99d5e30ae63e4d78cd24a"
uuid = "6c6a2e73-6563-6170-7368-637461726353"
version = "1.2.0"

[[deps.Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"

[[deps.Sockets]]
uuid = "6462fe0b-24de-5631-8697-dd941f90decc"

[[deps.SparseArrays]]
deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"]
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"

[[deps.SpecialFunctions]]
deps = ["IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"]
git-tree-sha1 = "ef28127915f4229c971eb43f3fc075dd3fe91880"
uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
version = "2.2.0"

    [deps.SpecialFunctions.extensions]
    SpecialFunctionsChainRulesCoreExt = "ChainRulesCore"

    [deps.SpecialFunctions.weakdeps]
    ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"

[[deps.StaticArrays]]
deps = ["LinearAlgebra", "Random", "StaticArraysCore", "Statistics"]
git-tree-sha1 = "8982b3607a212b070a5e46eea83eb62b4744ae12"
uuid = "90137ffa-7385-5640-81b9-e52037218182"
version = "1.5.25"

[[deps.StaticArraysCore]]
git-tree-sha1 = "6b7ba252635a5eff6a0b0664a41ee140a1c9e72a"
uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
version = "1.4.0"

[[deps.Statistics]]
deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
version = "1.9.0"

[[deps.SuiteSparse_jll]]
deps = ["Artifacts", "Libdl", "Pkg", "libblastrampoline_jll"]
uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c"
version = "5.10.1+6"

[[deps.TOML]]
deps = ["Dates"]
uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
version = "1.0.3"

[[deps.Tar]]
deps = ["ArgTools", "SHA"]
uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
version = "1.10.0"

[[deps.Test]]
deps = ["InteractiveUtils", "Logging", "Random", "Serialization"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[[deps.TimerOutputs]]
deps = ["ExprTools", "Printf"]
git-tree-sha1 = "f548a9e9c490030e545f72074a41edfd0e5bcdd7"
uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
version = "0.5.23"

[[deps.UUIDs]]
deps = ["Random", "SHA"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"

[[deps.Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"

[[deps.UnsafeAtomics]]
git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278"
uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f"
version = "0.2.1"

[[deps.UnsafeAtomicsLLVM]]
deps = ["LLVM", "UnsafeAtomics"]
git-tree-sha1 = "ea37e6066bf194ab78f4e747f5245261f17a7175"
uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
version = "0.1.2"

[[deps.Zlib_jll]]
deps = ["Libdl"]
uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
version = "1.2.13+0"

[[deps.libblastrampoline_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850b90-86db-534c-a0d3-1478176c7d93"
version = "5.7.0+0"

[[deps.nghttp2_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
version = "1.48.0+0"

[[deps.p7zip_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"
version = "17.4.0+0"

Expected behavior

Synchronizing streams in CUDA.jl should be comparable to CUDA C in performance.

Version info

Details on Julia:

Julia Version 1.9.0
Commit 8e630552924 (2023-05-07 11:25 UTC)
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 32 × AMD EPYC 7282 16-Core Processor
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-14.0.6 (ORCJIT, znver2)
  Threads: 4 on 32 virtual cores
Environment:
  JULIA_HDF5_PATH =
  JULIA_LOAD_PATH = :/scratch-1/julia_prefs/
  JULIA_MPI_BINARY = system
  JULIA_CUDA_USE_BINARYBUILDER = false
  LD_LIBRARY_PATH = /usr/local/cuda-11.4/lib64
  JULIA_NUM_THREADS = 4

Details on CUDA:

CUDA runtime 11.4, local installation
CUDA driver 12.1
NVIDIA driver 470.103.1, originally for CUDA 11.4

Libraries:
- CUBLAS: 11.6.5
- CURAND: 10.2.5
- CUFFT: 10.5.2
- CUSOLVER: 11.2.0
- CUSPARSE: 11.6.0
- CUPTI: 14.0.0
- NVML: 11.0.0+470.103.1

Toolchain:
- Julia: 1.9.0
- LLVM: 14.0.6
- PTX ISA support: 3.2, 4.0, 4.1, 4.2, 4.3, 5.0, 6.0, 6.1, 6.3, 6.4, 6.5, 7.0, 7.1, 7.2, 7.3, 7.4
- Device capability support: sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80, sm_86

Environment:
- JULIA_CUDA_USE_BINARYBUILDER: false

8 devices:
  0: NVIDIA A100-SXM4-40GB (sm_80, 39.583 GiB / 39.586 GiB available)
  1: NVIDIA A100-SXM4-40GB (sm_80, 39.583 GiB / 39.586 GiB available)
  2: NVIDIA A100-SXM4-40GB (sm_80, 39.583 GiB / 39.586 GiB available)
  3: NVIDIA A100-SXM4-40GB (sm_80, 39.583 GiB / 39.586 GiB available)
  4: NVIDIA A100-SXM4-40GB (sm_80, 39.583 GiB / 39.586 GiB available)
  5: NVIDIA A100-SXM4-40GB (sm_80, 39.583 GiB / 39.586 GiB available)
  6: NVIDIA A100-SXM4-40GB (sm_80, 38.630 GiB / 39.586 GiB available)
  7: NVIDIA A100-SXM4-40GB (sm_80, 38.609 GiB / 39.586 GiB available)

Additional context

My use case is a physics simulation running in a multi-GPU and multi-node environment. I use KernelAbstractions.jl for backend-agnostic kernels, and MPI.jl for communication. For scalability, the computations on GPU and MPI communications need to overlap. I use the following pattern for hiding the communication behind computations:

for it in 1:nt
    # update some set of fields
    update_A!(args_A...)
    synchronize()
    @sync begin
        @async begin
            # update other set of fields depending on A on the outer border of computational grid
            update_B_border!(args_B...)
            synchronize()
            exchange_mpi!(mpi_args...)
        end
        @async begin
            # update B in the inner points of the domain
            update_B_inner!(args_B...)
            synchronize()
        end
    end
end

I rely on tasks to run device-to-host copies and MPI communication, and I need synchronization after calling update_A! to avoid a data race with kernels in nested tasks that depend on the results of update_A!. I noticed that stream synchronization becomes a bottleneck for our typical kernel run times. On top of that, the waiting times are non-uniform, spiking randomly to up to several milliseconds, which is orders of magnitude slower than just calling cudaStreamSynchronize and longer than our typical kernel run times. With many MPI processes, the chance that at least one of them exhibits a spike is high, so almost every loop iteration all MPI processes end up waiting.
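
To quantify these spikes independently of the full application, a micro-benchmark along the following lines records the per-call synchronization latency (a minimal sketch; the array size, iteration count, and function name are illustrative choices, not from the report):

using CUDA

function sync_latencies(n, iters)
    dst = CUDA.zeros(Float64, n)
    src = CUDA.ones(Float64, n)
    lat = zeros(iters)
    for i in 1:iters
        copyto!(dst, src)                 # enqueue asynchronous work on the task-local stream
        t = time_ns()
        CUDA.synchronize()                # the call whose latency is being measured
        lat[i] = (time_ns() - t) / 1e3    # microseconds
    end
    return lat
end

lat = sync_latencies(70_000_000, 100)
println("median: ", sort(lat)[end ÷ 2], " µs, max: ", maximum(lat), " µs")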

utkinis added the bug label on May 17, 2023
maleadt (Member) commented May 17, 2023

Are you doing work on other tasks that would explain why the scheduler doesn't immediately continue with the synchronization task? Could you extend our nonblocking_synchronize with some NVTX ranges and events to figure out where the delay comes from? It's also possible that CUDA is just slow to launch the stream callback.

utkinis (Author) commented May 19, 2023

@maleadt I only have one task in my benchmark, and changing the number of threads doesn't make much difference (I run julia with -t 2 here).

Command to run the profiler:

$ nsys profile --force-overwrite=true --trace=cuda,nvtx --capture-range=cudaProfilerApi --capture-range-end=stop --output=copy_julia_long_t2 julia --project -O3 -t 2 copy_bench.jl 70000000

Here's the extended profile with NVTX info (the purple bars show the number of spins in the busy-wait, the green bar is the stream query after waiting for the timer):

image

Full profile: copy_julia_long_t2.zip

Apparently, the slow part is waiting for the event. At first I thought spawning the threads was the issue, but replacing Threads.@spawn with @async didn't really make a difference.

The code for `nonblocking_synchronize` with NVTX ranges/events
@inline function nonblocking_synchronize(stream::CuStream)
    # fast path
    isdone(stream) && return

    NVTX.@range "busy-wait" begin
        # minimize latency of short operations by busy-waiting,
        # initially without even yielding to other tasks
        spins = 0
        while spins < 256
            if spins < 32
                ccall(:jl_cpu_pause, Cvoid, ())
                # Temporary solution before we have gc transition support in codegen.
                ccall(:jl_gc_safepoint, Cvoid, ())
            else
                NVTX.@mark "yield" payload=spins
                yield()
            end
            isdone(stream) && return
            spins += 1
        end
    end

    NVTX.@range "wait for an event" begin
        # minimize CPU usage of long-running kernels by waiting for an event signalled by CUDA
        event = Base.Event()
        launch(; stream) do
            notify(event)
        end

        NVTX.@mark "launched CUDA function"

        # if an error occurs, the callback may never fire, so use a timer to detect such cases
        dev = device()

        NVTX.@mark "switched to current device"

        timer = Timer(0; interval=1)

        NVTX.@range "spawn threads and sync" begin
            Base.@sync begin
                Threads.@spawn begin
                    NVTX.@range "wait for timer and check" begin
                        try
                            device!(dev)
                            while true
                                try
                                    Base.wait(timer)
                                catch err
                                    err isa EOFError && break
                                    rethrow()
                                end
                                if unsafe_cuStreamQuery(stream) != ERROR_NOT_READY
                                    break
                                end
                                NVTX.@mark "checked stream"
                            end
                        finally
                            notify(event)
                        end
                    end
                end

                Threads.@spawn begin
                    NVTX.@range "wait for event and close timer" begin
                        Base.wait(event)
                        close(timer)
                    end
                end
            end
        end
    end

    return
end

jpdoane commented May 26, 2023

Possibly related to an issue we ran into a while ago, where the nonblocking_synchronize() timeout was stalling due to I/O on the main thread. Our hacky workaround at the time was to use a local CUDA.jl branch with nonblocking_synchronize() disabled.

I haven't revisited this in a while, so I'm not sure if it was ever resolved, but your issue may be related.

utkinis (Author) commented May 30, 2023

@jpdoane Thanks for the link! The discussion was an interesting read, and it does seem like the issue here is the same. I think the solution proposed by @maleadt in that discussion, i.e., having only a busy-loop with yield (see the sketch below), is a good way to go. As a compromise, we could add a keyword to synchronize for a low-latency version, so as not to cause slowdowns in someone else's code. I will check whether that fixes the issue and prepare a PR soon.
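
A minimal sketch of such a spin-only synchronization, assuming access to the internal isdone helper shown in the instrumented code above (spinning_synchronize is an illustrative name, not CUDA.jl API):

using CUDA

# Busy-wait until the stream is idle, yielding to the Julia scheduler
# between polls instead of waiting on a CUDA-signalled event.
function spinning_synchronize(stream::CuStream)
    while !CUDA.isdone(stream)
        yield()
    end
    return
end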

maleadt (Member) commented May 31, 2023

As a compromise, we could add a keyword to synchronize for a low-latency version, to not cause slow-downs in someone else's code. I will check if that fixes the issue and prepare a PR soon.

As synchronize may be called by non-toplevel code (e.g. array copy functions), maybe it's better to introduce a preference instead?

aaustin141 commented

I'm experiencing this issue, too:

screenshot

@vchuravy suggested that this can be made faster in Julia 1.10?

maleadt (Member) commented Jul 29, 2023

@vchuravy Can you elaborate on your suggestion? IIUC: wait on a Condition, and have a @cfunction that gets called by CUDA (on a thread that gets adopted by Julia) notify that condition, which would be faster than the current callback because it doesn't rely on libuv?
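
A rough sketch of that mechanism, assuming the low-level driver wrapper CUDA.cuLaunchHostFunc and Julia 1.9's foreign-thread adoption; the wrapper and names are illustrative, not the eventual implementation:

using CUDA

# Called by the CUDA driver on one of its internal threads, which Julia
# adopts on entry; it notifies the event directly, bypassing libuv.
function stream_done_callback(data::Ptr{Cvoid})
    event = unsafe_pointer_to_objref(data)::Base.Event
    notify(event)
    return
end

function wait_via_hostfunc(stream::CuStream)
    event = Base.Event()
    cb = @cfunction(stream_done_callback, Cvoid, (Ptr{Cvoid},))
    GC.@preserve event begin
        CUDA.cuLaunchHostFunc(stream, cb, pointer_from_objref(event))
        wait(event)
    end
    return
end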

lcw added a commit to lcw/CUDA.jl that referenced this issue Jul 31, 2023

This addresses JuliaGPU#1910 by adding the boolean environment variable `JULIA_CUDA_NONBLOCKING_SYNCHRONIZE` to control if nonblocking synchronizes are used or not.
maleadt (Member) commented Aug 14, 2023

Should be fixed by #2025. Synchronization is still slower than in CUDA C, but much faster than previously (from ~150 µs down to ~5 µs, while CUDA C takes ~0.5 µs). In case that's still too much, there's a preference to disable nonblocking synchronization.
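
A minimal sketch of setting that preference with Preferences.jl; the name nonblocking_synchronization is an assumption based on the linked PR, so verify it against the CUDA.jl version in use:

using Preferences, CUDA

# Writes the preference to LocalPreferences.toml; takes effect after
# restarting Julia.
set_preferences!(CUDA, "nonblocking_synchronization" => false)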
