From 3ce5b0f3734924e17fa2c4b21d071ea676555f99 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 9 Aug 2023 15:01:31 +0200 Subject: [PATCH 01/12] Add a native profiler. --- Manifest.toml | 110 +++++++ Project.toml | 7 + lib/cudadrv/CUDAdrv.jl | 1 - lib/cudadrv/profile.jl | 106 ------- lib/cupti/wrappers.jl | 229 ++++++++++++++ src/CUDA.jl | 1 + src/profile.jl | 658 +++++++++++++++++++++++++++++++++++++++++ test/base/aqua.jl | 2 +- test/core/cudadrv.jl | 11 - test/core/profile.jl | 81 +++++ 10 files changed, 1087 insertions(+), 119 deletions(-) delete mode 100644 lib/cudadrv/profile.jl create mode 100644 src/profile.jl create mode 100644 test/core/profile.jl diff --git a/Manifest.toml b/Manifest.toml index 6af9a4a3af..6c5e435e02 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -78,6 +78,33 @@ version = "4.9.0" deps = ["Artifacts", "Libdl"] uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" +[[Crayons]] +git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15" +uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" +version = "4.1.1" + +[[DataAPI]] +git-tree-sha1 = "8da84edb865b0b5b0100c0666a9bc9a0b71c553c" +uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" +version = "1.15.0" + +[[DataFrames]] +deps = ["Compat", "DataAPI", "DataStructures", "Future", "InlineStrings", "InvertedIndices", "IteratorInterfaceExtensions", "LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrecompileTools", "PrettyTables", "Printf", "REPL", "Random", "Reexport", "SentinelArrays", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"] +git-tree-sha1 = "04c738083f29f86e62c8afc341f0967d8717bdb8" +uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +version = "1.6.1" + +[[DataStructures]] +deps = ["Compat", "InteractiveUtils", "OrderedCollections"] +git-tree-sha1 = "3dbd312d370723b6bb43ba9d02fc36abade4518d" +uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +version = "0.18.15" + +[[DataValueInterfaces]] +git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6" +uuid = "e2d170a0-9d28-54be-80f0-106bbe20a464" +version = "1.0.0" + [[Dates]] deps = ["Printf"] uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" @@ -97,6 +124,10 @@ git-tree-sha1 = "27415f162e6028e81c72b82ef756bf321213b6ec" uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" version = "0.1.10" +[[Future]] +deps = ["Random"] +uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" + [[GPUArrays]] deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"] git-tree-sha1 = "2e57b4a4f9cc15e85a24d603256fe08e527f48d1" @@ -115,6 +146,12 @@ git-tree-sha1 = "72b2e3c2ba583d1a7aa35129e56cf92e07c083e3" uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" version = "0.21.4" +[[InlineStrings]] +deps = ["Parsers"] +git-tree-sha1 = "9cc2baf75c6d09f9da536ddf58eb2f29dedaf461" +uuid = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48" +version = "1.4.0" + [[InteractiveUtils]] deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" @@ -125,11 +162,21 @@ git-tree-sha1 = "68772f49f54b479fa88ace904f6127f0a3bb2e46" uuid = "3587e190-3f89-42d0-90ee-14403ec27112" version = "0.1.12" +[[InvertedIndices]] +git-tree-sha1 = "0dc7b50b8d436461be01300fd8cd45aa0274b038" +uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f" +version = "1.3.0" + [[IrrationalConstants]] git-tree-sha1 = "630b497eafcc20001bba38a4651b327dcfc491d2" uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" version = "0.2.2" +[[IteratorInterfaceExtensions]] +git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856" +uuid = "82899510-4779-5014-852e-03e436cf321d" +version = "1.0.0" + [[JLLWrappers]] deps = 
["Preferences"] git-tree-sha1 = "abc9885a7ca2052a736a600f7fa66209f96506e1" @@ -154,6 +201,11 @@ git-tree-sha1 = "c35203c1e1002747da220ffc3c0762ce7754b08c" uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" version = "0.0.23+0" +[[LaTeXStrings]] +git-tree-sha1 = "f2355693d6778a178ade15952b7ac47a4ff97996" +uuid = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f" +version = "1.3.0" + [[LazyArtifacts]] deps = ["Artifacts", "Pkg"] uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" @@ -204,6 +256,12 @@ uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" deps = ["Artifacts", "Libdl"] uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" +[[Missings]] +deps = ["DataAPI"] +git-tree-sha1 = "f66bdc5de519e8f8ae43bdc598782d35a25b1272" +uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" +version = "1.1.0" + [[MozillaCACerts_jll]] uuid = "14a3606d-f60d-562e-9121-12d972cd8159" @@ -220,10 +278,27 @@ git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1" uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" version = "0.5.5+0" +[[OrderedCollections]] +git-tree-sha1 = "2e73fe17cac3c62ad1aebe70d44c963c3cfdc3e3" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.6.2" + +[[Parsers]] +deps = ["Dates", "PrecompileTools", "UUIDs"] +git-tree-sha1 = "716e24b21538abc91f6205fd1d8363f39b442851" +uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" +version = "2.7.2" + [[Pkg]] deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +[[PooledArrays]] +deps = ["DataAPI", "Future"] +git-tree-sha1 = "a6062fe4063cdafe78f4a0a81cfffb89721b30e7" +uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" +version = "1.4.2" + [[PrecompileTools]] deps = ["Preferences"] git-tree-sha1 = "9673d39decc5feece56ef3940e5dafba15ba0f81" @@ -236,6 +311,12 @@ git-tree-sha1 = "7eb1686b4f04b82f96ed7a4ea5890a4f0c7a09f1" uuid = "21216c6a-2e73-6563-6e65-726566657250" version = "1.4.0" +[[PrettyTables]] +deps = ["Crayons", "LaTeXStrings", "Markdown", "Printf", "Reexport", "StringManipulation", "Tables"] +git-tree-sha1 = "ee094908d720185ddbdc58dbe0c1cbe35453ec7a" +uuid = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" +version = "2.2.7" + [[Printf]] deps = ["Unicode"] uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" @@ -280,12 +361,24 @@ git-tree-sha1 = "30449ee12237627992a99d5e30ae63e4d78cd24a" uuid = "6c6a2e73-6563-6170-7368-637461726353" version = "1.2.0" +[[SentinelArrays]] +deps = ["Dates", "Random"] +git-tree-sha1 = "04bdff0b09c65ff3e06a05e3eb7b120223da3d39" +uuid = "91c51154-3ec4-41a3-a24f-3f23e20d615c" +version = "1.4.0" + [[Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" [[Sockets]] uuid = "6462fe0b-24de-5631-8697-dd941f90decc" +[[SortingAlgorithms]] +deps = ["DataStructures"] +git-tree-sha1 = "c60ec5c62180f27efea3ba2908480f8055e17cee" +uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" +version = "1.1.1" + [[SparseArrays]] deps = ["LinearAlgebra", "Random"] uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" @@ -311,10 +404,27 @@ version = "1.4.2" deps = ["LinearAlgebra", "SparseArrays"] uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +[[StringManipulation]] +git-tree-sha1 = "46da2434b41f41ac3594ee9816ce5541c6096123" +uuid = "892a3eda-7b42-436c-8928-eab12a02cf0e" +version = "0.3.0" + [[TOML]] deps = ["Dates"] uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" +[[TableTraits]] +deps = ["IteratorInterfaceExtensions"] +git-tree-sha1 = "c06b2f539df1c6efa794486abfb6ed2022561a39" +uuid = "3783bdb8-4a98-5b6b-af9a-565f29a5fe9c" +version = "1.0.1" + 
+[[Tables]] +deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "LinearAlgebra", "OrderedCollections", "TableTraits", "Test"] +git-tree-sha1 = "1544b926975372da01227b382066ab70e574a3ec" +uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" +version = "1.10.1" + [[Tar]] deps = ["ArgTools", "SHA"] uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" diff --git a/Project.toml b/Project.toml index 6ca22c9a4e..1c9fea40d9 100644 --- a/Project.toml +++ b/Project.toml @@ -10,6 +10,8 @@ CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82" CUDA_Driver_jll = "4ee394cb-3365-5eb0-8335-949819d2adfc" CUDA_Runtime_Discovery = "1af6417a-86b4-443c-805f-a4643ffb695f" CUDA_Runtime_jll = "76a88914-d11a-5bdc-97e0-2f5a05c973a2" +Crayons = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04" GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55" @@ -20,6 +22,7 @@ Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" Preferences = "21216c6a-2e73-6563-6e65-726566657250" +PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Random123 = "74087812-796a-5b5d-8853-05524746bad3" @@ -28,6 +31,7 @@ Reexport = "189a3867-3050-52da-a836-e630ba90ab69" Requires = "ae029012-a4dd-5104-9daa-d747884805df" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249" [compat] @@ -38,12 +42,15 @@ CEnum = "0.2, 0.3, 0.4" CUDA_Driver_jll = "0.5" CUDA_Runtime_Discovery = "0.2" CUDA_Runtime_jll = "0.6" +Crayons = "4" +DataFrames = "1" ExprTools = "0.1" GPUArrays = "8.6" GPUCompiler = "0.21" KernelAbstractions = "0.9.2" LLVM = "6" Preferences = "1" +PrettyTables = "2" Random123 = "1.2" RandomNumbers = "1.5.3" Reexport = "0.2, 1.0" diff --git a/lib/cudadrv/CUDAdrv.jl b/lib/cudadrv/CUDAdrv.jl index 0e4f0c0a92..2932f30c9d 100644 --- a/lib/cudadrv/CUDAdrv.jl +++ b/lib/cudadrv/CUDAdrv.jl @@ -23,7 +23,6 @@ include("memory.jl") include("module.jl") include("events.jl") include("execution.jl") -include("profile.jl") include("occupancy.jl") include("graph.jl") diff --git a/lib/cudadrv/profile.jl b/lib/cudadrv/profile.jl deleted file mode 100644 index 99284593ce..0000000000 --- a/lib/cudadrv/profile.jl +++ /dev/null @@ -1,106 +0,0 @@ -# Profiler control - -""" - @profile ex - -Run expressions while activating the CUDA profiler. - -Note that this API is used to programmatically control the profiling granularity by allowing -profiling to be done only on selective pieces of code. It does not perform any profiling on -itself, you need external tools for that. -""" -macro profile(ex) - quote - Profile.start() - try - $(esc(ex)) - finally - Profile.stop() - end - end -end - - -module Profile - -using ..CUDA - -function find_nsys() - if haskey(ENV, "JULIA_CUDA_NSYS") - return ENV["JULIA_CUDA_NSYS"] - elseif haskey(ENV, "_") && contains(ENV["_"], r"nsys"i) - # NOTE: if running as e.g. 
Jupyter -> nsys -> Julia, _ is `jupyter`
-        return ENV["_"]
-    else
-        # look at a couple of environment variables that may point to NSight
-        nsight = nothing
-        for var in ("LD_PRELOAD", "CUDA_INJECTION64_PATH", "NVTX_INJECTION64_PATH")
-            haskey(ENV, var) || continue
-            for val in split(ENV[var], Sys.iswindows() ? ';' : ':')
-                isfile(val) || continue
-                candidate = if Sys.iswindows()
-                    joinpath(dirname(val), "nsys.exe")
-                else
-                    joinpath(dirname(val), "nsys")
-                end
-                isfile(candidate) && return candidate
-            end
-        end
-    end
-    error("Running under Nsight Systems, but could not find the `nsys` binary to start the profiler. Please specify using JULIA_CUDA_NSYS=path/to/nsys, and file an issue with the contents of ENV.")
-end
-
-const __nsight = Ref{Union{Nothing,String}}()
-function nsight()
-    if !isassigned(__nsight)
-        # find the active Nsight Systems profiler
-        if haskey(ENV, "NSYS_PROFILING_SESSION_ID") && ccall(:jl_generating_output, Cint, ()) == 0
-            __nsight[] = find_nsys()
-            @assert isfile(__nsight[])
-            @info "Running under Nsight Systems, CUDA.@profile will automatically start the profiler"
-        else
-            __nsight[] = nothing
-        end
-    end
-
-    __nsight[]
-end
-
-
-"""
-    start()
-
-Enables profile collection by the active profiling tool for the current context. If
-profiling is already enabled, then this call has no effect.
-"""
-function start()
-    if nsight() !== nothing
-        run(`$(nsight()) start --capture-range=cudaProfilerApi`)
-        # it takes a while for the profiler to actually start tracing our process
-        sleep(0.01)
-    else
-        @warn("""Calling CUDA.@profile only informs an external profiler to start.
-                 The user is responsible for launching Julia under a CUDA profiler.
-
-                 It is recommended to use Nsight Systems, which supports interactive profiling:
-                 \$ nsys launch julia""",
-              maxlog=1)
-    end
-    CUDA.cuProfilerStart()
-end
-
-"""
-    stop()
-
-Disables profile collection by the active profiling tool for the current context. If
-profiling is already disabled, then this call has no effect.
-"""
-function stop()
-    CUDA.cuProfilerStop()
-    if nsight() !== nothing
-        @info """Profiling has finished, open the report listed above with `nsys-ui`
-                 If no report was generated, try launching `nsys` with `--trace=cuda`"""
-    end
-end
-
-end
diff --git a/lib/cupti/wrappers.jl b/lib/cupti/wrappers.jl
index 6e81f01503..5f0e8dd73b 100644
--- a/lib/cupti/wrappers.jl
+++ b/lib/cupti/wrappers.jl
@@ -3,3 +3,232 @@ function version()
     cuptiGetVersion(version_ref)
     VersionNumber(version_ref[])
 end
+
+
+#
+# activity API
+#
+
+"""
+    cfg = ActivityConfig(activity_kinds)
+
+    enable!(cfg)
+    # do stuff
+    disable!(cfg)
+
+    process(cfg) do (ctx, stream, record)
+        # inspect record
+    end
+
+High-level interface to the CUPTI activity API.
+"""
+struct ActivityConfig
+    activity_kinds::Vector{CUPTI.CUpti_ActivityKind}
+
+    available_buffers::Vector{Vector{UInt8}}
+    active_buffers::Vector{Vector{UInt8}}
+
+    results::Vector{Any}
+end
+
+function ActivityConfig(activity_kinds)
+    # pre-allocate a couple of buffers to avoid allocating while profiling
+    available_buffers = [allocate_buffer() for _ in 1:5]
+
+    ActivityConfig(activity_kinds, available_buffers, Vector{UInt8}[], [])
+end
+
+function allocate_buffer()
+    # "for typical workloads, it's suggested to choose a size between 1 and 10 MB."
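+    # NOTE: we size buffers at the top of that suggested range, assuming that fewer,
+    #       larger flushes are preferable while the profiler is active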
+ Array{UInt8}(undef, 8 * 1024 * 1024) # 8 MB +end + +const activity_config = Ref{Union{Nothing,ActivityConfig}}(nothing) + +function request_buffer(dest_ptr, sz_ptr, max_num_records_ptr) + # this function is called by CUPTI, but directly from the application, so it should be + # fine to perform I/O or allocate memory here. + + dest = Base.unsafe_wrap(Array, dest_ptr, 1) + sz = Base.unsafe_wrap(Array, sz_ptr, 1) + max_num_records = Base.unsafe_wrap(Array, max_num_records_ptr, 1) + + cfg = activity_config[] + if cfg !== nothing + buf = if isempty(cfg.available_buffers) + allocate_buffer() + else + pop!(cfg.available_buffers) + end + ptr = pointer(buf) + push!(cfg.active_buffers, buf) + sizehint!(cfg.results, length(cfg.active_buffers)) + + dest[] = pointer(buf) + sz[] = sizeof(buf) + max_num_records[] = 0 + else + dest[] = C_NULL + sz[] = 0 + max_num_records[] = 0 + end + + return +end + +function complete_buffer(ctx_handle, stream_id, buf_ptr, sz, valid_sz) + # this function is called by a CUPTI worker thread while our application may be waiting + # for `cuptiActivityFlushAll` to complete. that means we cannot do I/O here, or we could + # yield while the application cannot make any progress. + # + # we also may not trigger GC, because the main application cannot reach a safepoint. + # to prevent this, we call `sizehint!` in `request_buffer`. + # XXX: `sizehint!` isn't a guarantee; use `resize!` and a cursor? + + cfg = activity_config[] + if cfg !== nothing + push!(cfg.results, (ctx_handle, stream_id, buf_ptr, sz, valid_sz)) + end + + return +end + +function enable!(cfg::ActivityConfig) + activity_config[] === nothing || + error("Only one profiling session can be active at a time.") + + # set-up callbacks + request_buffer_ptr = @cfunction(request_buffer, Cvoid, + (Ptr{Ptr{UInt8}}, Ptr{Csize_t}, Ptr{Csize_t})) + complete_buffer_ptr = @cfunction(complete_buffer, Cvoid, + (CUDA.CUcontext, UInt32, Ptr{UInt8}, Csize_t, Csize_t)) + CUPTI.cuptiActivityRegisterCallbacks(request_buffer_ptr, complete_buffer_ptr) + + activity_config[] = cfg + + # enable requested activity kinds + for activity_kind in cfg.activity_kinds + CUPTI.cuptiActivityEnable(activity_kind) + end +end + +function disable!(cfg::ActivityConfig) + if activity_config[] !== cfg + error("This profiling session is not active.") + end + + # disable activity kinds + for activity_kind in cfg.activity_kinds + CUPTI.cuptiActivityDisable(activity_kind) + end + + # flush all activity records, even incomplete ones + CUPTI.cuptiActivityFlushAll(CUPTI.CUPTI_ACTIVITY_FLAG_FLUSH_FORCED) + + activity_config[] = nothing + + return +end + +function process(f, cfg::ActivityConfig) + activity_types = Dict( + CUPTI.CUPTI_ACTIVITY_KIND_DRIVER => CUPTI.CUpti_ActivityAPI, + CUPTI.CUPTI_ACTIVITY_KIND_RUNTIME => CUPTI.CUpti_ActivityAPI, + CUPTI.CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API => CUPTI.CUpti_ActivityAPI, + ) + # NOTE: the CUPTI version is unreliable, e.g., both CUDA 11.5 and 11.6 have CUPTI 16, + # yet CUpti_ActivityMemset4 is only available in CUDA 11.6. 
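+    # so key the choice of record type on the CUDA runtime version instead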
+    cuda_version = CUDA.runtime_version()
+    ## kernel activities
+    activity_types[CUPTI.CUPTI_ACTIVITY_KIND_KERNEL] =
+        if cuda_version >= v"12.0"
+            CUPTI.CUpti_ActivityKernel9
+        elseif cuda_version >= v"11.8"
+            CUPTI.CUpti_ActivityKernel8
+        elseif cuda_version >= v"11.6"
+            CUPTI.CUpti_ActivityKernel7
+        elseif cuda_version >= v"11.2"
+            CUPTI.CUpti_ActivityKernel6
+        elseif cuda_version >= v"11.1"
+            CUPTI.CUpti_ActivityKernel5
+        else # v"11.0"
+            CUPTI.CUpti_ActivityKernel4
+        end
+    activity_types[CUPTI.CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL] =
+        activity_types[CUPTI.CUPTI_ACTIVITY_KIND_KERNEL]
+    ## memcpy activities
+    activity_types[CUPTI.CUPTI_ACTIVITY_KIND_MEMCPY] =
+        if cuda_version >= v"11.6"
+            CUPTI.CUpti_ActivityMemcpy5
+        elseif cuda_version >= v"11.1"
+            CUPTI.CUpti_ActivityMemcpy4
+        else # v"11.0"
+            CUPTI.CUpti_ActivityMemcpy3
+        end
+    activity_types[CUPTI.CUPTI_ACTIVITY_KIND_MEMSET] =
+        if cuda_version >= v"11.6"
+            CUPTI.CUpti_ActivityMemset4
+        elseif cuda_version >= v"11.1"
+            CUPTI.CUpti_ActivityMemset3
+        else # v"11.0"
+            CUPTI.CUpti_ActivityMemset2
+        end
+    activity_types[CUPTI.CUPTI_ACTIVITY_KIND_MEMORY2] =
+        if cuda_version >= v"11.6"
+            CUPTI.CUpti_ActivityMemory3
+        elseif cuda_version >= v"11.2"
+            CUPTI.CUpti_ActivityMemory2
+        else # v"9.0"
+            CUPTI.CUpti_ActivityMemory
+        end
+
+    # extract typed activity records
+    for (ctx_handle, stream_id, buf_ptr, sz, valid_sz) in cfg.results
+        ctx = CUDA._CuContext(ctx_handle)
+        # XXX: can we reconstruct the stream from the stream ID?
+
+        record_ptr = Ref{Ptr{CUPTI.CUpti_Activity}}(C_NULL)
+        while true
+            try
+                CUPTI.cuptiActivityGetNextRecord(buf_ptr, valid_sz, record_ptr)
+                record = unsafe_load(record_ptr[])
+
+                if haskey(activity_types, record.kind)
+                    typ = activity_types[record.kind]
+                    typed_ptr = convert(Ptr{typ}, record_ptr[])
+                    typed_record = unsafe_load(typed_ptr)
+                    f(ctx, stream_id, typed_record)
+                else
+                    @warn "Unsupported activity kind: $(record.kind)"
+                end
+            catch err
+                if isa(err, CUPTIError) && err.code == CUPTI.CUPTI_ERROR_MAX_LIMIT_REACHED
+                    break
+                end
+                rethrow()
+            end
+        end
+    end
+end
+
+function Base.string(memory_kind::CUpti_ActivityMemoryKind)
+    if memory_kind == CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN
+        "unknown"
+    elseif memory_kind == CUPTI_ACTIVITY_MEMORY_KIND_PAGEABLE
+        "pageable"
+    elseif memory_kind == CUPTI_ACTIVITY_MEMORY_KIND_PINNED
+        "pinned"
+    elseif memory_kind == CUPTI_ACTIVITY_MEMORY_KIND_DEVICE
+        "device"
+    elseif memory_kind == CUPTI_ACTIVITY_MEMORY_KIND_ARRAY
+        "array"
+    elseif memory_kind == CUPTI_ACTIVITY_MEMORY_KIND_MANAGED
+        "managed"
+    elseif memory_kind == CUPTI_ACTIVITY_MEMORY_KIND_DEVICE_STATIC
+        "device static"
+    elseif memory_kind == CUPTI_ACTIVITY_MEMORY_KIND_MANAGED_STATIC
+        "managed static"
+    else
+        "unknown"
+    end
+end
diff --git a/src/CUDA.jl b/src/CUDA.jl
index a070e18615..773d6c5275 100644
--- a/src/CUDA.jl
+++ b/src/CUDA.jl
@@ -85,6 +85,7 @@ include("accumulate.jl")
 include("reverse.jl")
 include("iterator.jl")
 include("sorting.jl")
+include("profile.jl")
 
 # array libraries
 include("../lib/complex.jl")
diff --git a/src/profile.jl b/src/profile.jl
new file mode 100644
index 0000000000..22f7be03b1
--- /dev/null
+++ b/src/profile.jl
@@ -0,0 +1,658 @@
+# Profiler control
+
+"""
+    @profile [io=stdout] [host=true] [device=true] [trace=false] [raw=false] code...
+    @profile external=true code...
+
+Profile the GPU execution of `code`.
+
+There are two modes of operation, depending on whether `external` is `true` or `false`.
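+
+A minimal sketch of both modes (`my_kernel` is a hypothetical kernel):
+
+    CUDA.@profile @cuda my_kernel()                 # prints a summary of host/device activity
+    CUDA.@profile trace=true @cuda my_kernel()      # prints a chronological trace instead
+    CUDA.@profile external=true @cuda my_kernel()   # only activates an external profiler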
+ +## Native profiling (`external=false`, the default) + +In this mode, CUDA.jl will profile the execution of `code` and display the result. By +default, both host-side and device-side activity is captured; this can be controlled with +the `host` and `device` keyword arguments. If `trace` is `true`, a chronological trace of +the captured activity will be generated, where the ID column can be used to match host-side +and device-side activity; by default, only a summary will be shown. If `raw` is `true`, all +data will always be included, even if it may not be relevant. The output will be written to +`io`, which defaults to `stdout`. + +Slow operations will be highlighted in the output: Entries colored in yellow are among the +slowest 25%, while entries colored in red are among the slowest 5% of all operations. + +!!! compat "Julia 1.9" + This functionality is only available on Julia 1.9 and later. + +## External profiling (`external=true`) + +For more advanced profiling, it is possible to use an external profiling tool, such as +NSight Systems or NSight Compute. When doing so, it is often advisable to only enable the +profiler for the specific code region of interest. This can be done by wrapping the code +with `CUDA.@profile external=true`, which used to be the only way to use this macro. +""" +macro profile(ex...) + # destructure the `@profile` expression + code = ex[end] + kwargs = ex[1:end-1] + + # extract keyword arguments that are handled by this macro + external = false + remaining_kwargs = [] + for kwarg in kwargs + if Meta.isexpr(kwarg, :(=)) + key, value = kwarg.args + if key == :external + isa(value, Bool) || throw(ArgumentError("Invalid value for keyword argument `external`: got `$value`, expected literal boolean value")) + external = value + else + push!(remaining_kwargs, kwarg) + end + else + throw(ArgumentError("Invalid keyword argument to CUDA.@profile: $kwarg")) + end + end + + if external + Profile.emit_external_profile(code, remaining_kwargs) + else + Profile.emit_native_profile(code, remaining_kwargs) + end +end + + +module Profile + +using ..CUDA + +using ..CUPTI + +using PrettyTables +using DataFrames +using Statistics +using Crayons +using Printf + + +# +# external profiler +# + +function emit_external_profile(code, kwargs) + isempty(kwargs) || throw(ArgumentError("External profiler does not support keyword arguments")) + + quote + # wait for the device to become idle (and trigger a GC to avoid interference) + CUDA.cuCtxSynchronize() + GC.gc(false) + GC.gc(true) + + $Profile.start() + try + $(esc(code)) + finally + $Profile.stop() + end + end +end + +function find_nsys() + if haskey(ENV, "JULIA_CUDA_NSYS") + return ENV["JULIA_CUDA_NSYS"] + elseif haskey(ENV, "_") && contains(ENV["_"], r"nsys"i) + # NOTE: if running as e.g. Jupyter -> nsys -> Julia, _ is `jupyter` + return ENV["_"] + else + # look at a couple of environment variables that may point to NSight + nsight = nothing + for var in ("LD_PRELOAD", "CUDA_INJECTION64_PATH", "NVTX_INJECTION64_PATH") + haskey(ENV, var) || continue + for val in split(ENV[var], Sys.iswindows() ? ';' : ':') + isfile(val) || continue + candidate = if Sys.iswindows() + joinpath(dirname(val), "nsys.exe") + else + joinpath(dirname(val), "nsys") + end + isfile(candidate) && return candidate + end + end + end + error("Running under Nsight Systems, but could not find the `nsys` binary to start the profiler. 
Please specify using JULIA_CUDA_NSYS=path/to/nsys, and file an issue with the contents of ENV.") +end + +const __nsight = Ref{Union{Nothing,String}}() +function nsight() + if !isassigned(__nsight) + # find the active Nsight Systems profiler + if haskey(ENV, "NSYS_PROFILING_SESSION_ID") && ccall(:jl_generating_output, Cint, ()) == 0 + __nsight[] = find_nsys() + @assert isfile(__nsight[]) + @info "Running under Nsight Systems, CUDA.@profile will automatically start the profiler" + else + __nsight[] = nothing + end + end + + __nsight[] +end + + +""" + start() + +Enables profile collection by the active profiling tool for the current context. If +profiling is already enabled, then this call has no effect. +""" +function start() + if nsight() !== nothing + run(`$(nsight()) start --capture-range=cudaProfilerApi`) + # it takes a while for the profiler to actually start tracing our process + sleep(0.01) + end + CUDA.cuProfilerStart() +end + +""" + stop() + +Disables profile collection by the active profiling tool for the current context. If +profiling is already disabled, then this call has no effect. +""" +function stop() + CUDA.cuProfilerStop() + if nsight() !== nothing + @info """Profiling has finished, open the report listed above with `nsys-ui` + If no report was generated, try launching `nsys` with `--trace=cuda`""" + end +end + + +# +# native profiler +# + +function emit_native_profile(code, kwargs) + activity_kinds = [ + # API calls + CUPTI.CUPTI_ACTIVITY_KIND_DRIVER, + CUPTI.CUPTI_ACTIVITY_KIND_RUNTIME, + # kernel execution + CUPTI.CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL, + CUPTI.CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API, + # memory operations + CUPTI.CUPTI_ACTIVITY_KIND_MEMCPY, + CUPTI.CUPTI_ACTIVITY_KIND_MEMSET, + # additional information for API host calls + CUPTI.CUPTI_ACTIVITY_KIND_MEMORY2, + ] + + quote + cfg = CUPTI.ActivityConfig($activity_kinds) + + # wait for the device to become idle (and trigger a GC to avoid interference) + CUDA.cuCtxSynchronize() + GC.gc(false) + GC.gc(true) + + CUPTI.enable!(cfg) + + # sink the initial profiler overhead into a synchronization call + CUDA.cuCtxSynchronize() + try + rv = $(esc(code)) + + # synchronize to ensure we capture all activity + CUDA.cuCtxSynchronize() + + rv + finally + CUPTI.disable!(cfg) + # convert CUPTI activity records to host and device traces + traces = $Profile.generate_traces(cfg) + $Profile.render_traces(traces...; $(map(esc, kwargs)...)) + end + end +end + +# convert CUPTI activity records to host and device traces +function generate_traces(cfg) + host_trace = DataFrame( + id = Int[], + start = Float64[], + stop = Float64[], + name = String[], + + tid = Int[], + ) + device_trace = DataFrame( + id = Int[], + start = Float64[], + stop = Float64[], + name = String[], + + device = Int[], + context = Int[], + stream = Int[], + + # kernel launches + grid = Union{Missing,CUDA.CuDim3}[], + block = Union{Missing,CUDA.CuDim3}[], + registers = Union{Missing,Int64}[], + static_shmem = Union{Missing,Int64}[], + dynamic_shmem = Union{Missing,Int64}[], + + # memory operations + size = Union{Missing,Int64}[], + ) + details = DataFrame( + id = Int[], + details = String[], + ) + + # memory_kind fields are sometimes typed CUpti_ActivityMemoryKind, sometimes UInt + as_memory_kind(x) = isa(x, CUPTI.CUpti_ActivityMemoryKind) ? 
x : CUPTI.CUpti_ActivityMemoryKind(x) + + cuda_version = CUDA.runtime_version() + CUPTI.process(cfg) do ctx, stream_id, record + # driver API calls + if record.kind in [CUPTI.CUPTI_ACTIVITY_KIND_DRIVER, + CUPTI.CUPTI_ACTIVITY_KIND_RUNTIME, + CUPTI.CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API] + id = record.correlationId + t0, t1 = record.start/1e9, record._end/1e9 + + name = if record.kind == CUPTI.CUPTI_ACTIVITY_KIND_DRIVER + ref = Ref{Cstring}(C_NULL) + CUPTI.cuptiGetCallbackName(CUPTI.CUPTI_CB_DOMAIN_DRIVER_API, + record.cbid, ref) + unsafe_string(ref[]) + elseif record.kind == CUPTI.CUPTI_ACTIVITY_KIND_RUNTIME + ref = Ref{Cstring}(C_NULL) + CUPTI.cuptiGetCallbackName(CUPTI.CUPTI_CB_DOMAIN_RUNTIME_API, + record.cbid, ref) + unsafe_string(ref[]) + else + "" + end + + push!(host_trace, (; id, start=t0, stop=t1, name, + tid=record.threadId)) + + # memory operations + elseif record.kind == CUPTI.CUPTI_ACTIVITY_KIND_MEMCPY + id = record.correlationId + t0, t1 = record.start/1e9, record._end/1e9 + + src_kind = as_memory_kind(record.srcKind) + dst_kind = as_memory_kind(record.dstKind) + name = "[copy $(string(src_kind)) to $(string(dst_kind)) memory]" + + push!(device_trace, (; id, start=t0, stop=t1, name, + device=record.deviceId, + context=record.contextId, + stream=record.streamId, + size=record.bytes); cols=:union) + elseif record.kind == CUPTI.CUPTI_ACTIVITY_KIND_MEMSET + id = record.correlationId + t0, t1 = record.start/1e9, record._end/1e9 + + memory_kind = as_memory_kind(record.memoryKind) + name = "[set $(string(memory_kind)) memory]" + + push!(device_trace, (; id, start=t0, stop=t1, name, + device=record.deviceId, + context=record.contextId, + stream=record.streamId, + size=record.bytes); cols=:union) + + # memory allocations + elseif record.kind == CUPTI.CUPTI_ACTIVITY_KIND_MEMORY2 && cuda_version >= v"11.2" + # XXX: we'd prefer to postpone processing (i.e. calling format_bytes), + # but cannot realistically add a column for every API call + + id = record.correlationId + + memory_kind = as_memory_kind(record.memoryKind) + str = "$(Base.format_bytes(record.bytes)), $(string(memory_kind)) memory" + + push!(details, (id, str)) + + # kernel execution + # TODO: CUPTI_ACTIVITY_KIND_CDP_KERNEL (CUpti_ActivityCdpKernel) + elseif record.kind in [CUPTI.CUPTI_ACTIVITY_KIND_KERNEL, + CUPTI.CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL] + id = record.correlationId + t0, t1 = record.start/1e9, record._end/1e9 + + name = unsafe_string(record.name) + grid = CUDA.CuDim3(record.gridX, record.gridY, record.gridZ) + block = CUDA.CuDim3(record.blockX, record.blockY, record.blockZ) + registers = record.registersPerThread + static_shmem = record.staticSharedMemory + dynamic_shmem = record.dynamicSharedMemory + + push!(device_trace, (; id, start=t0, stop=t1, name, + device=record.deviceId, + context=record.contextId, + stream=record.streamId, + grid, block, registers, + static_shmem, dynamic_shmem); cols=:union) + else + @warn "Unsupported CUPTI activity record kind: $(record.kind)" + end + end + + return host_trace, device_trace, details +end + +# render traces to a table +function render_traces(host_trace, device_trace, details; + io=stdout isa Base.TTY ? 
IOContext(stdout, :limit => true) : stdout, + host=true, device=true, trace=false, raw=false) + # find the relevant part of the trace (marked by calls to 'cuCtxSynchronize') + trace_first_sync = findfirst(host_trace.name .== "cuCtxSynchronize") + trace_first_sync === nothing && error("Could not find the start of the profiling trace.") + trace_last_sync = findlast(host_trace.name .== "cuCtxSynchronize") + trace_first_sync == trace_last_sync && error("Could not find the end of the profiling trace.") + ## truncate the trace + if !raw + # exclusive bounds + trace_begin = host_trace.stop[trace_first_sync] + trace_end = host_trace.start[trace_last_sync] + + trace_first_call = copy(host_trace[trace_first_sync+1, :]) + trace_last_call = copy(host_trace[trace_last_sync-1, :]) + for df in (host_trace, device_trace) + filter!(row -> trace_first_call.id <= row.id <= trace_last_call.id, df) + end + body_hlines = Int[] + else + # in raw mode, we display the entire trace, but highlight the relevant part + body_hlines = [trace_first_sync, trace_last_sync-1] + + # inclusive bounds + trace_begin = host_trace.start[begin] + trace_end = host_trace.stop[end] + end + trace_time = trace_end - trace_begin + + # compute event and trace duration + for df in (host_trace, device_trace) + df.time = df.stop .- df.start + end + events = nrow(host_trace) + nrow(device_trace) + println(io, "Profiler ran for $(format_time(trace_time)), capturing $(events) events.") + + # make some numbers more friendly to read + ## make timestamps relative to the start + for df in (host_trace, device_trace) + df.start .-= trace_begin + df.stop .-= trace_begin + end + if !raw + # renumber event IDs from 1 + first_id = minimum([host_trace.id; device_trace.id; details.id]) + for df in (host_trace, device_trace, details) + df.id .-= first_id - 1 + end + + # renumber thread IDs from 1 + first_tid = minimum(host_trace.tid) + host_trace.tid .-= first_tid - 1 + + end + + # helper function to visualize slow trace entries + function time_highlighters(df) + ## filter out entries that execute _very_ quickly (like calls to cuCtxGetCurrent) + relevant_times = df[df.time .>= 1e-8, :time] + + p75 = quantile(relevant_times, 0.75) + p95 = quantile(relevant_times, 0.95) + + highlight_p95 = Highlighter((data, i, j) -> (names(data)[j] == "time") && + (data[i,j] >= p95), + crayon"red") + highlight_p75 = Highlighter((data, i, j) -> (names(data)[j] == "time") && + (data[i,j] >= p75), + crayon"yellow") + highlight_bold = Highlighter((data, i, j) -> (names(data)[j] == "name") && + (data[!, :time][i] >= p75), + crayon"bold") + + (highlight_p95, highlight_p75, highlight_bold) + end + + function summarize_trace(df) + df = groupby(df, :name) + + # gather summary statistics + df = combine(df, + :time => sum => :time, + :time => length => :calls, + :time => mean => :time_avg, + :time => minimum => :time_min, + :time => maximum => :time_max + ) + df.time_ratio = df.time ./ trace_time + sort!(df, :time_ratio, rev=true) + + return df + end + + trace_column_names = Dict( + "id" => "ID", + "start" => "Start", + "time" => "Time", + "grid" => "Blocks", + "tid" => "Thread", + "block" => "Threads", + "registers" => "Regs", + "static_shmem" => "SSMem", + "dynamic_shmem" => "DSMem", + "size" => "Size", + "throughput" => "Throughput", + "device" => "Device", + "stream" => "Stream", + "name" => "Name", + "details" => "Details" + ) + + summary_column_names = Dict( + "time" => "Time", + "time_ratio" => "Time (%)", + "calls" => "Calls", + "time_avg" => "Avg time", + "time_min" => "Min 
time", + "time_max" => "Max time", + "name" => "Name" + ) + + summary_formatter = function(v, i, j) + if names(df)[j] == "time_ratio" + format_percentage(v) + elseif names(df)[j] in ["time", "time_avg", "time_min", "time_max"] + format_time(v) + else + v + end + end + + crop = if io isa IOBuffer + # when emitting to a string, render all content (e.g., for the tests) + :none + else + :horizontal + end + + if host + # to determine the time the host was active, we should look at threads separately + host_time = maximum(combine(groupby(host_trace, :tid), :time => sum => :time).time) + host_ratio = host_time / trace_time + println(io, "\nHost-side activity: calling CUDA APIs took $(format_time(host_time)) ($(format_percentage(host_ratio)) of the trace)") + + # get rid of API call version suffixes + host_trace.name = replace.(host_trace.name, r"_v\d+$" => "") + + df = if raw + host_trace + else + # filter spammy API calls + filter(host_trace) do row + !in(row.name, [# context and stream queries we use for nonblocking sync + "cuCtxGetCurrent", "cuStreamQuery", + # occupancy API, done before every kernel launch + "cuOccupancyMaxPotentialBlockSize", + # driver pointer set-up + "cuGetProcAddress", + # called a lot during compilation + "cuDeviceGetAttribute", + # pointer attribute query, done before every memory operation + "cuPointerGetAttribute"]) + end + end + + # add in details + df = leftjoin(df, details, on=:id, order=:left) + + if isempty(df) + println(io, "No host-side activity was recorded.") + elseif trace + # determine columns to show, based on whether they contain useful information + columns = [:id, :start, :time] + for col in [:tid] + if raw || length(unique(df[!, col])) > 1 + push!(columns, col) + end + end + push!(columns, :name) + if any(!ismissing, df.details) + push!(columns, :details) + end + + df = df[:, columns] + header = [trace_column_names[name] for name in names(df)] + + alignment = [i == lastindex(header) ? :l : :r for i in 1:length(header)] + formatters = function(v, i, j) + if v === missing + return "-" + elseif names(df)[j] in ["start", "time"] + format_time(v) + else + v + end + end + highlighters = time_highlighters(df) + pretty_table(io, df; header, alignment, formatters, highlighters, crop, body_hlines) + else + df = summarize_trace(df) + + columns = [:time_ratio, :time, :calls, :time_avg, :time_min, :time_max, :name] + df = df[:, columns] + + header = [summary_column_names[name] for name in names(df)] + alignment = [i == lastindex(header) ? 
:l : :r for i in 1:length(header)] + highlighters = time_highlighters(df) + pretty_table(io, df; header, alignment, formatters=summary_formatter, highlighters, crop) + end + end + + if device + device_time = sum(device_trace.time) + device_ratio = device_time / trace_time + println(io, "\nDevice-side activity: GPU was busy for $(format_time(device_time)) ($(format_percentage(device_ratio)) of the trace)") + + # add memory throughput information + device_trace.throughput = device_trace.size ./ device_trace.time + + if isempty(device_trace) + println(io, "No device-side activity was recorded.") + elseif trace + # determine columns to show, based on whether they contain useful information + columns = [:id, :start, :time] + ## device/stream identification + for col in [:device, :stream] + if raw || length(unique(device_trace[!, col])) > 1 + push!(columns, col) + end + end + ## kernel details (can be missing) + for col in [:block, :grid, :registers] + if raw || any(!ismissing, device_trace[!, col]) + push!(columns, col) + end + end + for col in [:static_shmem, :dynamic_shmem] + if raw || any(val->!ismissing(val) && val > 0, device_trace[!, col]) + push!(columns, col) + end + end + ## memory details (can be missing) + if raw || any(!ismissing, device_trace.size) + push!(columns, :size) + push!(columns, :throughput) + end + push!(columns, :name) + + df = device_trace[:, columns] + header = [trace_column_names[name] for name in names(df)] + + alignment = [i == lastindex(header) ? :l : :r for i in 1:length(header)] + formatters = function(v, i, j) + if v === missing + return "-" + elseif names(df)[j] in ["start", "time"] + format_time(v) + elseif names(df)[j] in ["static_shmem", "dynamic_shmem", "size"] + Base.format_bytes(v) + elseif names(df)[j] in ["throughput"] + Base.format_bytes(v) * "/s" + elseif names(df)[j] in ["device"] + CUDA.name(CuDevice(v)) + elseif v isa CUDA.CuDim3 + if v.z != 1 + "$(Int(v.x))×$(Int(v.y))×$(Int(v.z))" + elseif v.y != 1 + "$(Int(v.x))×$(Int(v.y))" + else + "$(Int(v.x))" + end + else + v + end + end + highlighters = time_highlighters(df) + pretty_table(io, df; header, alignment, formatters, highlighters, crop, body_hlines) + else + df = summarize_trace(device_trace) + + columns = [:time_ratio, :time, :calls, :time_avg, :time_min, :time_max, :name] + df = df[:, columns] + + header = [summary_column_names[name] for name in names(df)] + alignment = [i == lastindex(header) ? 
:l : :r for i in 1:length(header)] + highlighters = time_highlighters(df) + pretty_table(io, df; header, alignment, formatters=summary_formatter, highlighters, crop) + end + end +end + +format_percentage(x::Number) = @sprintf("%.2f%%", x * 100) + +function format_time(t::Number) + io = IOBuffer() + if abs(t) < 1e-6 # less than 1 microsecond + print(io, round(t * 1e9, digits=2), " ns") + elseif abs(t) < 1e-3 # less than 1 millisecond + print(io, round(t * 1e6, digits=2), " µs") + elseif abs(t) < 1 # less than 1 second + print(io, round(t * 1e3, digits=2), " ms") + else + print(io, round(t, digits=2), " s") + end + return String(take!(io)) +end + +end diff --git a/test/base/aqua.jl b/test/base/aqua.jl index 8ec04bf42b..1a0e6391fe 100644 --- a/test/base/aqua.jl +++ b/test/base/aqua.jl @@ -9,7 +9,7 @@ let ambs = Aqua.detect_ambiguities(CUDA; recursive=true) pkg_match(pkgname, pkgdir::Nothing) = false pkg_match(pkgname, pkgdir::AbstractString) = occursin(pkgname, pkgdir) filter!(x -> pkg_match("CUDA", pkgdir(last(x).module)), ambs) - @test length(ambs) ≤ 35 + @test length(ambs) ≤ 50 end Aqua.test_undefined_exports(CUDA) diff --git a/test/core/cudadrv.jl b/test/core/cudadrv.jl index 9fe9da27e9..5727fa98d5 100644 --- a/test/core/cudadrv.jl +++ b/test/core/cudadrv.jl @@ -824,17 +824,6 @@ end ############################################################################################ -@testset "profile" begin - -@test_logs (:warn, r"only informs an external profiler to start") CUDA.Profile.start() -CUDA.Profile.stop() - -@test_logs (:warn, r"only informs an external profiler to start") CUDA.@profile begin end - -end - -############################################################################################ - @testset "stream" begin s = CuStream() diff --git a/test/core/profile.jl b/test/core/profile.jl new file mode 100644 index 0000000000..0e7219e6a5 --- /dev/null +++ b/test/core/profile.jl @@ -0,0 +1,81 @@ +@testset "profiler" begin + +@testset "external" begin + +CUDA.Profile.start() +CUDA.Profile.stop() + +@test CUDA.@profile external=true begin + true +end + +end + +############################################################################################ + +VERSION >= v"1.9" && @testset "native" begin + +# smoke test +let + str = sprint() do io + rv = CUDA.@profile io=io begin + true + end + @test rv + end + + @test occursin("No host-side activity was recorded", str) + @test occursin("No device-side activity was recorded", str) +end + +# kernel launch +let + @cuda identity(nothing) + str = sprint() do io + CUDA.@profile io=io begin + @cuda identity(nothing) + end + end + + @test occursin("cuLaunchKernel", str) + @test occursin("_Z8identityv", str) + + @test !occursin("cuCtxSynchronize", str) + @test !occursin("ID", str) +end + +# kernel launch (trace) +let + str = sprint() do io + CUDA.@profile io=io trace=true begin + @cuda identity(nothing) + end + end + + @test occursin("cuLaunchKernel", str) + @test occursin("_Z8identityv", str) + + @test occursin("ID", str) + + @test !occursin("cuCtxSynchronize", str) +end + +# kernel launch (raw trace) +let + str = sprint() do io + CUDA.@profile io=io trace=true raw=true begin + @cuda identity(nothing) + end + end + + @test occursin("cuLaunchKernel", str) + @test occursin("_Z8identityv", str) + + @test occursin("ID", str) + + @test occursin("cuCtxSynchronize", str) +end + +end + +end From 5ed63a025a24e0b7eb24d9ec90bd9e8ae1aaeef3 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 9 Aug 2023 22:43:49 +0200 Subject: [PATCH 02/12] Bump CUDA testing 
to Julia 1.9. --- .buildkite/pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 8641bcae5e..7c6edf4c6c 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -51,7 +51,7 @@ steps: - label: "CUDA {{matrix.cuda}}" plugins: - JuliaCI/julia#v1: - version: 1.8 + version: 1.9 - JuliaCI/julia-test#v1: test_args: "core base libraries" - JuliaCI/julia-coverage#v1: From abeb87296937af821a3ed48f6019b753e9708af8 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 10 Aug 2023 13:55:18 +0200 Subject: [PATCH 03/12] Fix trace time calculation. --- src/profile.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/profile.jl b/src/profile.jl index 22f7be03b1..5142b3169b 100644 --- a/src/profile.jl +++ b/src/profile.jl @@ -357,9 +357,8 @@ function render_traces(host_trace, device_trace, details; trace_first_sync == trace_last_sync && error("Could not find the end of the profiling trace.") ## truncate the trace if !raw - # exclusive bounds trace_begin = host_trace.stop[trace_first_sync] - trace_end = host_trace.start[trace_last_sync] + trace_end = host_trace.stop[trace_last_sync] trace_first_call = copy(host_trace[trace_first_sync+1, :]) trace_last_call = copy(host_trace[trace_last_sync-1, :]) From 3ad9ea2e636f6f1c90a6acf1f9448b634c62f98d Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 10 Aug 2023 13:55:28 +0200 Subject: [PATCH 04/12] Update documentation. --- docs/Manifest.toml | 49 ++++++--- docs/src/development/profiling.md | 177 +++++++++++++++++------------- docs/src/usage/array.md | 2 +- 3 files changed, 134 insertions(+), 94 deletions(-) diff --git a/docs/Manifest.toml b/docs/Manifest.toml index 9e2c5c98d6..ca0c3aac2c 100644 --- a/docs/Manifest.toml +++ b/docs/Manifest.toml @@ -14,21 +14,21 @@ uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" [[DocStringExtensions]] deps = ["LibGit2"] -git-tree-sha1 = "5158c2b41018c5f7eb1470d558127ac274eca0c9" +git-tree-sha1 = "2fb1e02f2b635d0845df5d7c167fec4dd739b00d" uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" -version = "0.9.1" +version = "0.9.3" [[Documenter]] deps = ["ANSIColoredPrinters", "Base64", "Dates", "DocStringExtensions", "IOCapture", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "REPL", "Test", "Unicode"] -git-tree-sha1 = "6030186b00a38e9d0434518627426570aac2ef95" +git-tree-sha1 = "39fd748a73dce4c05a9655475e437170d8fb1b67" uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4" -version = "0.27.23" +version = "0.27.25" [[IOCapture]] deps = ["Logging", "Random"] -git-tree-sha1 = "f7be53659ab06ddc986428d3a9dcc95f6fa6705a" +git-tree-sha1 = "d75853a0bdbfb1ac815478bacd89cd27b550ace6" uuid = "b5f81e59-6552-4d32-b1f0-c071b021bf89" -version = "0.2.2" +version = "0.2.3" [[InteractiveUtils]] deps = ["Markdown"] @@ -36,9 +36,9 @@ uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" [[JSON]] deps = ["Dates", "Mmap", "Parsers", "Unicode"] -git-tree-sha1 = "3c837543ddb02250ef42f4738347454f95079d4e" +git-tree-sha1 = "31e996f0a15c7b280ba9f76636b3ff9e2ae58c9a" uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" -version = "0.21.3" +version = "0.21.4" [[LibGit2]] deps = ["Base64", "NetworkOptions", "Printf", "SHA"] @@ -46,9 +46,9 @@ uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" [[Literate]] deps = ["Base64", "IOCapture", "JSON", "REPL"] -git-tree-sha1 = "1c4418beaa6664041e0f9b48f0710f57bff2fcbe" +git-tree-sha1 = "a1a0d4ff9f785a2184baca7a5c89e664f144143d" uuid = "98b081ad-f1c9-55d3-8b20-4c87d4299306" -version = "2.14.0" +version = "2.14.1" [[Logging]] uuid = 
"56ddb016-857b-54e1-b83d-db4d58db5568" @@ -65,10 +65,22 @@ uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" version = "1.2.0" [[Parsers]] -deps = ["Dates"] -git-tree-sha1 = "6c01a9b494f6d2a9fc180a08b182fcb06f0958a0" +deps = ["Dates", "PrecompileTools", "UUIDs"] +git-tree-sha1 = "716e24b21538abc91f6205fd1d8363f39b442851" uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" -version = "2.4.2" +version = "2.7.2" + +[[PrecompileTools]] +deps = ["Preferences"] +git-tree-sha1 = "9673d39decc5feece56ef3940e5dafba15ba0f81" +uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a" +version = "1.1.2" + +[[Preferences]] +deps = ["TOML"] +git-tree-sha1 = "7eb1686b4f04b82f96ed7a4ea5890a4f0c7a09f1" +uuid = "21216c6a-2e73-6563-6e65-726566657250" +version = "1.4.0" [[Printf]] deps = ["Unicode"] @@ -79,7 +91,7 @@ deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" [[Random]] -deps = ["SHA", "Serialization"] +deps = ["SHA"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" [[SHA]] @@ -92,9 +104,18 @@ uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" [[Sockets]] uuid = "6462fe0b-24de-5631-8697-dd941f90decc" +[[TOML]] +deps = ["Dates"] +uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" +version = "1.0.3" + [[Test]] deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +[[UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + [[Unicode]] uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" diff --git a/docs/src/development/profiling.md b/docs/src/development/profiling.md index 8f25b1092c..5d43632939 100644 --- a/docs/src/development/profiling.md +++ b/docs/src/development/profiling.md @@ -1,17 +1,18 @@ -# Profiling +# Benchmarking & profiling -Profiling GPU code is harder than profiling Julia code executing on the CPU. For one, -kernels typically execute asynchronously, and thus require appropriate synchronization when -measuring their execution time. Furthermore, because the code executes on a different -processor, it is much harder to know what is currently executing. CUDA, and the Julia CUDA -packages, provide several tools and APIs to remedy this. +Benchmarking and profiling a GPU program is harder than doing the same for a program +executing on the CPU. For one, GPU operations typically execute asynchronously, and thus +require appropriate synchronization when measuring their execution time. Furthermore, +because the program executes on a different processor, it is much harder to know what is +currently executing. CUDA, and the Julia CUDA packages, provide several tools and APIs to +remedy this. ## Time measurements -To accurately measure execution time in the presence of asynchronously-executing kernels, -CUDA.jl provides an `@elapsed` macro that, much like `Base.@elapsed`, measures the total -execution time of a block of code on the GPU: +To accurately measure execution time in the presence of asynchronously-executing GPU +operations, CUDA.jl provides an `@elapsed` macro that, much like `Base.@elapsed`, measures +the total execution time of a block of code on the GPU: ```julia julia> a = CUDA.rand(1024,1024,1024); @@ -40,8 +41,8 @@ julia> CUDA.@time sin.(a); 0.046063 seconds (96 CPU allocations: 3.750 KiB) (1 GPU allocation: 4.000 GiB, 14.33% gc time of which 99.89% spent allocating) ``` -The `@time` macro is more user-friendly and is a generally more useful tool when measuring -the end-to-end performance characteristics of a GPU application. 
+The `CUDA.@time` macro is more user-friendly and is a generally more useful tool when +measuring the end-to-end performance characteristics of a GPU application. For robust measurements however, it is advised to use the [BenchmarkTools.jl](https://github.com/JuliaCI/BenchmarkTools.jl) package which goes to @@ -76,80 +77,92 @@ For profiling large applications, simple timings are insufficient. Instead, we w overview of how and when the GPU was active, to avoid times where the device was idle and/or find which kernels needs optimization. -As we cannot use the Julia profiler for this task, we will be using external profiling -software as part of the CUDA toolkit. To inform those external tools which code needs to be -profiled (e.g., to exclude warm-up iterations or other noninteresting elements) you can use -the `CUDA.@profile` macro to surround interesting code with. Again, this macro mimics an -equivalent from the standard library, but this time requires external software to actually -perform the profiling: +### Integrated profiler + +Once again, we cannot use CPU utilities to profile GPU programs, as they will only paint a +partial picture. Instead, CUDA.jl provides a `CUDA.@profile` macro that separately reports +the time spent on the CPU, and the time spent on the GPU: ```julia julia> a = CUDA.rand(1024,1024,1024); -julia> sin.(a); # warmup - julia> CUDA.@profile sin.(a); -┌ Warning: Calling CUDA.@profile only informs an external profiler to start. -│ The user is responsible for launching Julia under a CUDA profiler like `nvprof`. -└ @ CUDA.Profile ~/Julia/pkg/CUDA/src/profile.jl:42 +Profiler ran for 11.93 ms, capturing 8 events. + +Host-side activity: calling CUDA APIs took 437.26 µs (3.67% of the trace) +┌──────────┬───────────┬───────┬───────────┬───────────┬───────────┬─────────────────┐ +│ Time (%) │ Time │ Calls │ Avg time │ Min time │ Max time │ Name │ +├──────────┼───────────┼───────┼───────────┼───────────┼───────────┼─────────────────┤ +│ 3.56% │ 424.15 µs │ 1 │ 424.15 µs │ 424.15 µs │ 424.15 µs │ cuLaunchKernel │ +│ 0.10% │ 11.92 µs │ 1 │ 11.92 µs │ 11.92 µs │ 11.92 µs │ cuMemAllocAsync │ +└──────────┴───────────┴───────┴───────────┴───────────┴───────────┴─────────────────┘ + +Device-side activity: GPU was busy for 11.48 ms (96.20% of the trace) +┌──────────┬──────────┬───────┬──────────┬──────────┬──────────┬─────────────────────── +│ Time (%) │ Time │ Calls │ Avg time │ Min time │ Max time │ Name ⋯ +├──────────┼──────────┼───────┼──────────┼──────────┼──────────┼─────────────────────── +│ 96.20% │ 11.48 ms │ 1 │ 11.48 ms │ 11.48 ms │ 11.48 ms │ _Z16broadcast_kernel ⋯ +└──────────┴──────────┴───────┴──────────┴──────────┴──────────┴─────────────────────── ``` -### `nvprof` and `nvvp` +By default, `CUDA.@profile` will provide a summary of host and device activities. If you +prefer a chronological view of the events, you can set the `trace` keyword argument: -!!! warning - - These tools are deprecated, and will be removed from future versions of CUDA. - Prefer to use the Nsight tools described below. +```julia +julia> CUDA.@profile trace=true sin.(a); +Profiler ran for 11.71 ms, capturing 8 events. 
+
+Host-side activity: calling CUDA APIs took 217.68 µs (1.86% of the trace)
+┌────┬──────────┬───────────┬─────────────────┬──────────────────────────┐
+│ ID │    Start │      Time │            Name │                  Details │
+├────┼──────────┼───────────┼─────────────────┼──────────────────────────┤
+│  2 │  7.39 µs │  14.07 µs │ cuMemAllocAsync │ 4.000 GiB, device memory │
+│  6 │ 29.56 µs │ 202.42 µs │  cuLaunchKernel │                        - │
+└────┴──────────┴───────────┴─────────────────┴──────────────────────────┘
+
+Device-side activity: GPU was busy for 11.48 ms (98.01% of the trace)
+┌────┬──────────┬──────────┬─────────┬────────┬──────┬─────────────────────────────────
+│ ID │    Start │     Time │ Threads │ Blocks │ Regs │ Name                           ⋯
+├────┼──────────┼──────────┼─────────┼────────┼──────┼─────────────────────────────────
+│  6 │ 229.6 µs │ 11.48 ms │     768 │    284 │   34 │ _Z16broadcast_kernel15CuKernel ⋯
+└────┴──────────┴──────────┴─────────┴────────┴──────┴─────────────────────────────────
+```
-### `nvprof` and `nvvp`
+Here, every call is prefixed with an ID, which can be used to correlate host and device
+events. For example, here we can see that the host-side `cuLaunchKernel` call with ID 6
+corresponds to the device-side `broadcast` kernel.
-!!! warning
-
-    These tools are deprecated, and will be removed from future versions of CUDA.
-    Prefer to use the Nsight tools described below.
+### External profilers
-For simple profiling, prefix your Julia command-line invocation with the `nvprof` utility.
-For a better timeline, be sure to use `CUDA.@profile` to delimit interesting code and
-start `nvprof` with the option `--profile-from-start off`:
+If you want more details, or a graphical representation, we recommend using external
+profilers. To inform those external tools which code needs to be profiled (e.g., to exclude
+warm-up iterations or other uninteresting elements), you can use `CUDA.@profile` with the
+`external=true` keyword argument to surround the code of interest:
-```
-$ nvprof --profile-from-start off julia
+```julia
+julia> a = CUDA.rand(1024,1024,1024);
-julia> using CUDA
+julia> sin.(a); # warmup
-julia> a = CUDA.rand(1024,1024,1024);
+julia> CUDA.@profile external=true sin.(a);
-julia> sin.(a);
+julia>
-julia> CUDA.@profile sin.(a);
+```
-julia> exit()
-==156406== Profiling application: julia
-==156406== Profiling result:
-            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
- GPU activities:  100.00%  44.777ms         1  44.777ms  44.777ms  44.777ms  ptxcall_broadcast_1
-      API calls:   56.46%  6.6544ms         1  6.6544ms  6.6544ms  6.6544ms  cuMemAlloc
-                   43.52%  5.1286ms         1  5.1286ms  5.1286ms  5.1286ms  cuLaunchKernel
-                    0.01%  1.3200us         1  1.3200us  1.3200us  1.3200us  cuDeviceGetCount
-                    0.01%     725ns         3     241ns     196ns     301ns  cuCtxGetCurrent
+This only activates an external profiler, and does not perform any profiling itself. NVIDIA
+provides two tools for profiling CUDA applications: NSight Systems for timeline profiling,
+and NSight Compute for more detailed kernel analysis. Both tools are well-integrated with
+the Julia GPU packages, and make it possible to iteratively profile without having to
+restart Julia.
-```
-!!! info
+#### NVIDIA Nsight Systems
-    If `nvprof` crashes, reporting that the application returned non-zero code 12,
-    try starting `nvprof` with `--openacc-profiling off`.
+Generally speaking, the first external profiler you should use is NSight Systems, as it will
-For a visual overview of these results, you can use the NVIDIA Visual Profiler (`nvvp`):
+give you a high-level overview of your application's performance characteristics. After
-!["NVIDIA Visual Profiler"](nvvp.png)
+downloading and installing the tool (a version might have been installed alongside with the
-Note however that both `nvprof` and `nvvp` are deprecated, and will be removed from future
-versions of the CUDA toolkit.
+CUDA toolkit, but it is recommended to download and install the latest version from the
+NVIDIA website), you need to launch Julia from the command-line, wrapped by the `nsys`
+utility from NSight Systems:
-### NVIDIA Nsight Systems
-
-Following the deprecation of above tools, NVIDIA published the Nsight Systems and Nsight
-Compute tools for respectively timeline profiling and more detailed kernel analysis. The
-former is well-integrated with the Julia GPU packages, and makes it possible to iteratively
-profile without having to restart Julia as was the case with `nvvp` and `nvprof`.
-
-After downloading and installing NSight Systems (a version might have been installed
-alongside with the CUDA toolkit, but it is recommended to download and install the latest
-version from the NVIDIA website), you need to launch Julia from the command-line, wrapped by
-the `nsys` utility from NSight Systems:

 ```
 $ nsys launch julia
@@ -167,7 +180,7 @@ julia> a = CUDA.rand(1024,1024,1024);

 julia> sin.(a);

-julia> CUDA.@profile sin.(a);
+julia> CUDA.@profile external=true sin.(a);
 start executed
 Processing events...
 Capturing symbol files...
@@ -192,14 +205,14 @@ You can open the resulting `.qdrep` file with `nsight-sys`:

 !!! info

     If NSight Systems does not capture any kernel launch, even though you have used
-    `CUDA.@profile`, try starting `nsys` with `--trace cuda`.
+    `CUDA.@profile external=true`, try starting `nsys` with `--trace cuda`.

-### NVIDIA Nsight Compute
+#### NVIDIA Nsight Compute

-If you want details on the execution properties of a kernel, or inspect API interactions,
-Nsight Compute is the tool for you. It is again possible to use this profiler with an
-interactive session of Julia, and debug or profile only those sections of your application
-that are marked with `CUDA.@profile`.
+If you want details on the execution properties of a single kernel, or inspect API
+interactions in detail, Nsight Compute is the tool for you. It is again possible to use this
+profiler with an interactive session of Julia, and debug or profile only those sections of
+your application that are marked with `CUDA.@profile external=true`.

 Start with launching Julia under the Nsight Compute CLI tool:

@@ -234,7 +247,7 @@ julia> a = CUDA.rand(1024,1024,1024);

 julia> sin.(a);

-julia> CUDA.@profile sin.(a);
+julia> CUDA.@profile external=true sin.(a);
 ```

 Once that's finished, the Nsight Compute GUI window will have plenty details on our kernel:
@@ -255,21 +268,27 @@ or expensive CPU operations, you can use the NVTX library via the NVTX.jl packag
 ```julia
 using CUDA, NVTX

+NVTX.@mark "reached Y"
+
 NVTX.@range "doing X" begin
     ...
 end

-NVTX.@mark "reached Y"
+NVTX.@annotate function foo()
+    ...
+end
 ```

+For more details, refer to the documentation of the NVTX.jl package.
+
 ## Compiler options

-Some tools, like `nvvp` and NSight Systems Compute, also make it possible to do source-level
-profiling. CUDA.jl will by default emit the necessary source line information, which you
-can disable by launching Julia with `-g0`. Conversely, launching with `-g2` will emit
-additional debug information, which can be useful in combination with tools like `cuda-gdb`,
-but might hurt performance or code size.
+Some tools, like NSight Compute, also make it possible to do source-level profiling.
+CUDA.jl will by default emit the necessary source line information, which you can disable by
+launching Julia with `-g0`.

 !!! warning

diff --git a/docs/src/usage/array.md b/docs/src/usage/array.md
index e1b74bf4f0..1206717fdf 100644
--- a/docs/src/usage/array.md
+++ b/docs/src/usage/array.md
@@ -381,7 +381,7 @@ julia> a \ b

 julia> Array(a) \ Array(b)
 2×2 Matrix{Float32}:
-  1.29018    0.942773
+  1.29018    0.942772
  -0.765663  -0.782648
 ```

From 22c4813806a3763bd7901d3493590cb84036e0eb Mon Sep 17 00:00:00 2001
From: Tim Besard
Date: Thu, 10 Aug 2023 15:36:12 +0200
Subject: [PATCH 05/12] Remove unneeded module prefixes, and tweak error
 messages.

---
 lib/cupti/wrappers.jl | 67 ++++++++++++++++++++++---------------------
 src/profile.jl        |  2 +-
 2 files changed, 35 insertions(+), 34 deletions(-)

diff --git a/lib/cupti/wrappers.jl b/lib/cupti/wrappers.jl
index 5f0e8dd73b..bbda1d3527 100644
--- a/lib/cupti/wrappers.jl
+++ b/lib/cupti/wrappers.jl
@@ -23,7 +23,7 @@ end
 High-level interface to the CUPTI activity API.
 """
 struct ActivityConfig
-    activity_kinds::Vector{CUPTI.CUpti_ActivityKind}
+    activity_kinds::Vector{CUpti_ActivityKind}

     available_buffers::Vector{Vector{UInt8}}
     active_buffers::Vector{Vector{UInt8}}
@@ -102,13 +102,13 @@ function enable!(cfg::ActivityConfig)
                                     (Ptr{Ptr{UInt8}}, Ptr{Csize_t}, Ptr{Csize_t}))
     complete_buffer_ptr = @cfunction(complete_buffer, Cvoid,
                                      (CUDA.CUcontext, UInt32, Ptr{UInt8}, Csize_t, Csize_t))
-    CUPTI.cuptiActivityRegisterCallbacks(request_buffer_ptr, complete_buffer_ptr)
+    cuptiActivityRegisterCallbacks(request_buffer_ptr, complete_buffer_ptr)

     activity_config[] = cfg

     # enable requested activity kinds
     for activity_kind in cfg.activity_kinds
-        CUPTI.cuptiActivityEnable(activity_kind)
+        cuptiActivityEnable(activity_kind)
     end
 end

@@ -119,11 +119,11 @@ function disable!(cfg::ActivityConfig)

     # disable activity kinds
     for activity_kind in cfg.activity_kinds
-        CUPTI.cuptiActivityDisable(activity_kind)
+        cuptiActivityDisable(activity_kind)
     end

     # flush all activity records, even incomplete ones
-    CUPTI.cuptiActivityFlushAll(CUPTI.CUPTI_ACTIVITY_FLAG_FLUSH_FORCED)
+    cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED)

     activity_config[] = nothing

@@ -132,54 +132,54 @@ end

 function process(f, cfg::ActivityConfig)
     activity_types = Dict(
-        CUPTI.CUPTI_ACTIVITY_KIND_DRIVER => CUPTI.CUpti_ActivityAPI,
-        CUPTI.CUPTI_ACTIVITY_KIND_RUNTIME => CUPTI.CUpti_ActivityAPI,
-        CUPTI.CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API => CUPTI.CUpti_ActivityAPI,
+        CUPTI_ACTIVITY_KIND_DRIVER => CUpti_ActivityAPI,
+        CUPTI_ACTIVITY_KIND_RUNTIME => CUpti_ActivityAPI,
+        CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API => CUpti_ActivityAPI,
     )

     # NOTE: the CUPTI version is unreliable, e.g., both CUDA 11.5 and 11.6 have CUPTI 16,
     #       yet CUpti_ActivityMemset4 is only available in CUDA 11.6.
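     #       Instead, we select the record types based on the CUDA runtime version below.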
cuda_version = CUDA.runtime_version() ## kernel activities - activity_types[CUPTI.CUPTI_ACTIVITY_KIND_KERNEL] = + activity_types[CUPTI_ACTIVITY_KIND_KERNEL] = if cuda_version >= v"12.0" - CUPTI.CUpti_ActivityKernel5 + CUpti_ActivityKernel5 elseif cuda_version >= v"11.8" - CUPTI.CUpti_ActivityKernel8 + CUpti_ActivityKernel8 elseif cuda_version >= v"11.6" - CUPTI.CUpti_ActivityKernel7 + CUpti_ActivityKernel7 elseif cuda_version >= v"11.2" - CUPTI.CUpti_ActivityKernel6 + CUpti_ActivityKernel6 elseif cuda_version >= v"11.1" - CUPTI.CUpti_ActivityKernel5 + CUpti_ActivityKernel5 else # v"11.0" - CUPTI.CUpti_ActivityKernel4 + CUpti_ActivityKernel4 end - activity_types[CUPTI.CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL] = - activity_types[CUPTI.CUPTI_ACTIVITY_KIND_KERNEL] + activity_types[CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL] = + activity_types[CUPTI_ACTIVITY_KIND_KERNEL] ## memcpy activities - activity_types[CUPTI.CUPTI_ACTIVITY_KIND_MEMCPY] = + activity_types[CUPTI_ACTIVITY_KIND_MEMCPY] = if cuda_version >= v"11.6" - CUPTI.CUpti_ActivityMemcpy5 + CUpti_ActivityMemcpy5 elseif cuda_version >= v"11.1" - CUPTI.CUpti_ActivityMemcpy4 + CUpti_ActivityMemcpy4 else # v"11.0" - CUPTI.CUpti_ActivityMemset3 + CUpti_ActivityMemset3 end - activity_types[CUPTI.CUPTI_ACTIVITY_KIND_MEMSET] = + activity_types[CUPTI_ACTIVITY_KIND_MEMSET] = if cuda_version >= v"11.6" - CUPTI.CUpti_ActivityMemset4 + CUpti_ActivityMemset4 elseif cuda_version >= v"11.1" - CUPTI.CUpti_ActivityMemset3 + CUpti_ActivityMemset3 else # v"11.0" - CUPTI.CUpti_ActivityMemset2 + CUpti_ActivityMemset2 end - activity_types[CUPTI.CUPTI_ACTIVITY_KIND_MEMORY2] = + activity_types[CUPTI_ACTIVITY_KIND_MEMORY2] = if cuda_version >= v"11.6" - CUPTI.CUpti_ActivityMemory3 + CUpti_ActivityMemory3 elseif cuda_version >= v"11.2" - CUPTI.CUpti_ActivityMemory2 + CUpti_ActivityMemory2 else # v"9.0" - CUPTI.CUpti_ActivityMemory + CUpti_ActivityMemory end # extract typed activity records @@ -187,10 +187,10 @@ function process(f, cfg::ActivityConfig) ctx = CUDA._CuContext(ctx_handle) # XXX: can we reconstruct the stream from the stream ID? - record_ptr = Ref{Ptr{CUPTI.CUpti_Activity}}(C_NULL) + record_ptr = Ref{Ptr{CUpti_Activity}}(C_NULL) while true try - CUPTI.cuptiActivityGetNextRecord(buf_ptr, valid_sz, record_ptr) + cuptiActivityGetNextRecord(buf_ptr, valid_sz, record_ptr) record = unsafe_load(record_ptr[]) if haskey(activity_types, record.kind) @@ -199,10 +199,11 @@ function process(f, cfg::ActivityConfig) typed_record = unsafe_load(typed_ptr) f(ctx, stream_id, typed_record) else - @warn "Unsupported activity kind: $(record.kind)" + @warn """Unsupported activity kind: $(record.kind). + Please file an issue, or extend the implementation of `CUPTI.process` to handle this activity kind.""" end catch err - if isa(err, CUPTIError) && err.code == CUPTI.CUPTI_ERROR_MAX_LIMIT_REACHED + if isa(err, CUPTIError) && err.code == CUPTI_ERROR_MAX_LIMIT_REACHED break end rethrow() diff --git a/src/profile.jl b/src/profile.jl index 5142b3169b..2e9276c40d 100644 --- a/src/profile.jl +++ b/src/profile.jl @@ -339,7 +339,7 @@ function generate_traces(cfg) grid, block, registers, static_shmem, dynamic_shmem); cols=:union) else - @warn "Unsupported CUPTI activity record kind: $(record.kind)" + error("Unexpected CUPTI activity kind: $(record.kind). Please file an issue.") end end From e9efbd2e9422183e2dde6ce7fbd8dc76b9941366 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 10 Aug 2023 15:38:14 +0200 Subject: [PATCH 06/12] Don't highlight empty traces. 
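
If no activities were captured, there are no timings to compute quantiles over,
and `quantile` would throw (a sketch of the failure mode):

    julia> using Statistics
    julia> quantile(Float64[], 0.75)
    ERROR: ArgumentError: empty data vector

So bail out before rendering anything.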
--- src/profile.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/profile.jl b/src/profile.jl index 2e9276c40d..a25db061ef 100644 --- a/src/profile.jl +++ b/src/profile.jl @@ -407,6 +407,7 @@ function render_traces(host_trace, device_trace, details; ## filter out entries that execute _very_ quickly (like calls to cuCtxGetCurrent) relevant_times = df[df.time .>= 1e-8, :time] + isempty(relevant_times) && return () p75 = quantile(relevant_times, 0.75) p95 = quantile(relevant_times, 0.95) From d509855a414bedb120ca8f9c76152b3adabb44e8 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Sat, 12 Aug 2023 08:55:06 -0400 Subject: [PATCH 07/12] Fixes for CUDA 11.0. --- lib/cupti/wrappers.jl | 2 +- src/profile.jl | 36 +++++++++++++++++++++++------------- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/lib/cupti/wrappers.jl b/lib/cupti/wrappers.jl index bbda1d3527..b8ca450baa 100644 --- a/lib/cupti/wrappers.jl +++ b/lib/cupti/wrappers.jl @@ -163,7 +163,7 @@ function process(f, cfg::ActivityConfig) elseif cuda_version >= v"11.1" CUpti_ActivityMemcpy4 else # v"11.0" - CUpti_ActivityMemset3 + CUpti_ActivityMemcpy3 end activity_types[CUPTI_ACTIVITY_KIND_MEMSET] = if cuda_version >= v"11.6" diff --git a/src/profile.jl b/src/profile.jl index a25db061ef..e2576cdcd3 100644 --- a/src/profile.jl +++ b/src/profile.jl @@ -183,9 +183,11 @@ function emit_native_profile(code, kwargs) # memory operations CUPTI.CUPTI_ACTIVITY_KIND_MEMCPY, CUPTI.CUPTI_ACTIVITY_KIND_MEMSET, - # additional information for API host calls - CUPTI.CUPTI_ACTIVITY_KIND_MEMORY2, ] + if CUDA.runtime_version() >= v"11.2" + # additional information for API host calls + push!(activity_kinds, CUPTI.CUPTI_ACTIVITY_KIND_MEMORY2) + end quote cfg = CUPTI.ActivityConfig($activity_kinds) @@ -262,17 +264,25 @@ function generate_traces(cfg) id = record.correlationId t0, t1 = record.start/1e9, record._end/1e9 - name = if record.kind == CUPTI.CUPTI_ACTIVITY_KIND_DRIVER - ref = Ref{Cstring}(C_NULL) - CUPTI.cuptiGetCallbackName(CUPTI.CUPTI_CB_DOMAIN_DRIVER_API, - record.cbid, ref) - unsafe_string(ref[]) - elseif record.kind == CUPTI.CUPTI_ACTIVITY_KIND_RUNTIME - ref = Ref{Cstring}(C_NULL) - CUPTI.cuptiGetCallbackName(CUPTI.CUPTI_CB_DOMAIN_RUNTIME_API, - record.cbid, ref) - unsafe_string(ref[]) - else + name = try + if record.kind == CUPTI.CUPTI_ACTIVITY_KIND_DRIVER + ref = Ref{Cstring}(C_NULL) + CUPTI.cuptiGetCallbackName(CUPTI.CUPTI_CB_DOMAIN_DRIVER_API, + record.cbid, ref) + unsafe_string(ref[]) + elseif record.kind == CUPTI.CUPTI_ACTIVITY_KIND_RUNTIME + ref = Ref{Cstring}(C_NULL) + CUPTI.cuptiGetCallbackName(CUPTI.CUPTI_CB_DOMAIN_RUNTIME_API, + record.cbid, ref) + unsafe_string(ref[]) + else + "" + end + catch err + # XXX: as observed on CUDA 11.0 + (isa(err, CUPTIError) && + err.code == CUPTI.CUPTI_ERROR_INVALID_PARAMETER) || + rethrow() "" end From 00ae803d0304975e5609be8816e14f172931ab78 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Sun, 13 Aug 2023 06:48:19 -0400 Subject: [PATCH 08/12] For simplicity, require CUDA 11.2. --- src/profile.jl | 46 +++++++++++++++++++++----------------------- test/core/profile.jl | 4 +++- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/profile.jl b/src/profile.jl index e2576cdcd3..df13a3dd1c 100644 --- a/src/profile.jl +++ b/src/profile.jl @@ -8,7 +8,7 @@ Profile the GPU execution of `code`. There are two modes of operation, depending on whether `external` is `true` or `false`. 
-## Native profiling (`external=false`, the default)
+## Integrated profiler (`external=false`, the default)

 In this mode, CUDA.jl will profile the execution of `code` and display the result. By
 default, both host-side and device-side activity is captured; this can be controlled with
@@ -24,7 +24,11 @@ slowest 25%, while entries colored in red are among the slowest 5% of all operat
 !!! compat "Julia 1.9"
     This functionality is only available on Julia 1.9 and later.

-## External profiling (`external=true`)
+!!! compat "CUDA 11.2"
+    Older versions of CUDA, before 11.2, contain bugs that may prevent the
+    `CUDA.@profile` macro from working. It is recommended to use a newer runtime.
+
+## External profilers (`external=true`)

 For more advanced profiling, it is possible to use an external profiling tool, such as
 NSight Systems or NSight Compute. When doing so, it is often advisable to only enable the
@@ -56,7 +60,7 @@ macro profile(ex...)
     if external
         Profile.emit_external_profile(code, remaining_kwargs)
     else
-        Profile.emit_native_profile(code, remaining_kwargs)
+        Profile.emit_integrated_profile(code, remaining_kwargs)
     end
 end

@@ -169,10 +173,10 @@ end


 #
-# native profiler
+# integrated profiler
 #

-function emit_native_profile(code, kwargs)
+function emit_integrated_profile(code, kwargs)
     activity_kinds = [
         # API calls
         CUPTI.CUPTI_ACTIVITY_KIND_DRIVER,
@@ -187,6 +191,8 @@ function emit_integrated_profile(code, kwargs)
     if CUDA.runtime_version() >= v"11.2"
         # additional information for API host calls
         push!(activity_kinds, CUPTI.CUPTI_ACTIVITY_KIND_MEMORY2)
+    else
+        @warn "The integrated profiler is not supported on CUDA <11.2" maxlog=1
     end

     quote
@@ -264,25 +270,17 @@ function generate_traces(cfg)
         id = record.correlationId
         t0, t1 = record.start/1e9, record._end/1e9

-        name = try
-            if record.kind == CUPTI.CUPTI_ACTIVITY_KIND_DRIVER
-                ref = Ref{Cstring}(C_NULL)
-                CUPTI.cuptiGetCallbackName(CUPTI.CUPTI_CB_DOMAIN_DRIVER_API,
-                                           record.cbid, ref)
-                unsafe_string(ref[])
-            elseif record.kind == CUPTI.CUPTI_ACTIVITY_KIND_RUNTIME
-                ref = Ref{Cstring}(C_NULL)
-                CUPTI.cuptiGetCallbackName(CUPTI.CUPTI_CB_DOMAIN_RUNTIME_API,
-                                           record.cbid, ref)
-                unsafe_string(ref[])
-            else
-                ""
-            end
-        catch err
-            # XXX: as observed on CUDA 11.0
-            (isa(err, CUPTIError) &&
-             err.code == CUPTI.CUPTI_ERROR_INVALID_PARAMETER) ||
-                rethrow()
+        name = if record.kind == CUPTI.CUPTI_ACTIVITY_KIND_DRIVER
+            ref = Ref{Cstring}(C_NULL)
+            CUPTI.cuptiGetCallbackName(CUPTI.CUPTI_CB_DOMAIN_DRIVER_API,
+                                       record.cbid, ref)
+            unsafe_string(ref[])
+        elseif record.kind == CUPTI.CUPTI_ACTIVITY_KIND_RUNTIME
+            ref = Ref{Cstring}(C_NULL)
+            CUPTI.cuptiGetCallbackName(CUPTI.CUPTI_CB_DOMAIN_RUNTIME_API,
+                                       record.cbid, ref)
+            unsafe_string(ref[])
+        else
             ""
         end

diff --git a/test/core/profile.jl b/test/core/profile.jl
index 0e7219e6a5..933ba09137 100644
--- a/test/core/profile.jl
+++ b/test/core/profile.jl
@@ -13,7 +13,8 @@ end

 ############################################################################################

-VERSION >= v"1.9" && @testset "native" begin
+if VERSION >= v"1.9" && CUDA.runtime_version() >= v"11.2"
+@testset "integrated" begin

 # smoke test
 let
@@ -76,6 +77,7 @@ let
     @test occursin("cuCtxSynchronize", str)
 end

+end
 end

 end

From 4c88bc42e15175fd542cd5d37640979ccc9ee053 Mon Sep 17 00:00:00 2001
From: Tim Besard
Date: Sun, 13 Aug 2023 13:53:57 -0400
Subject: [PATCH 09/12] Fix doctests.
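
An earlier commit in this series flipped the expected output of the `\`
doctests in docs/src/usage/array.md by one ulp; both the CuArray and the
Array method currently print

    1.29018    0.942773

so restore that value.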
[skip tests] --- docs/src/usage/array.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/usage/array.md b/docs/src/usage/array.md index 1206717fdf..1cc13f2f6f 100644 --- a/docs/src/usage/array.md +++ b/docs/src/usage/array.md @@ -376,12 +376,12 @@ julia> b = CUDA.rand(2,2) julia> a \ b 2×2 CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}: - 1.29018 0.942772 + 1.29018 0.942773 -0.765663 -0.782648 julia> Array(a) \ Array(b) 2×2 Matrix{Float32}: - 1.29018 0.942772 + 1.29018 0.942773 -0.765663 -0.782648 ``` From d4333f1dba4fa62ae1cdcb0c39c1f3a0ec8fd77d Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Mon, 14 Aug 2023 09:16:35 +0200 Subject: [PATCH 10/12] Upgrade more of CI to 1.9. [skip tests] --- .buildkite/pipeline.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 7c6edf4c6c..b3d9b8cde8 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -105,7 +105,7 @@ steps: package: "cuTENSOR" plugins: - JuliaCI/julia#v1: - version: "1.8" + version: "1.9" - JuliaCI/julia-coverage#v1: dirs: - src @@ -143,7 +143,7 @@ steps: - label: "NNlibCUDA.jl on CUDA 12" plugins: - JuliaCI/julia#v1: - version: 1.8 + version: 1.9 - JuliaCI/julia-coverage#v1: dirs: - src @@ -178,7 +178,7 @@ steps: - label: "GPU-less environment" plugins: - JuliaCI/julia#v1: - version: 1.8 + version: 1.9 - JuliaCI/julia-coverage#v1: dirs: - src @@ -205,7 +205,7 @@ steps: - label: "Documentation" plugins: - JuliaCI/julia#v1: - version: 1.8 + version: 1.9 command: | julia --project -e ' println("--- :julia: Instantiating project") @@ -227,7 +227,7 @@ steps: - label: "Compute sanitizer" plugins: - JuliaCI/julia#v1: - version: 1.8 + version: 1.9 - JuliaCI/julia-test#v1: test_args: "--sanitize core" - JuliaCI/julia-coverage#v1: @@ -256,7 +256,7 @@ steps: - label: "Benchmarks (dry run)" plugins: - JuliaCI/julia#v1: - version: 1.8 + version: 1.9 command: | julia --project -e ' println("--- :julia: Instantiating project") From 9f0e046161be23996acb9a8ecf4282fb49299737 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Mon, 14 Aug 2023 09:35:06 +0200 Subject: [PATCH 11/12] Add stronger error messages. --- src/profile.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/profile.jl b/src/profile.jl index df13a3dd1c..8becb6c996 100644 --- a/src/profile.jl +++ b/src/profile.jl @@ -192,7 +192,10 @@ function emit_integrated_profile(code, kwargs) # additional information for API host calls push!(activity_kinds, CUPTI.CUPTI_ACTIVITY_KIND_MEMORY2) else - @warn "The integrated profiler is not supported on CUDA <11.2" maxlog=1 + @warn "The integrated profiler is not supported on CUDA <11.2, and may fail." maxlog=1 + end + if VERSION < v"1.9" + @error "The integrated profiler is not supported on Julia <1.9, and will crash." maxlog=1 end quote From d3fcc7c1df65be34e01b2ab416516cd4b38c47f6 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Mon, 14 Aug 2023 12:08:27 +0200 Subject: [PATCH 12/12] Don't run profile tests under compute sanitizer. 
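
CUPTI is incompatible with compute-sanitizer (see the comment in
test/setup.jl), so gate the profiler tests behind the `can_use_cupti()`
helper:

    if VERSION >= v"1.9" && CUDA.runtime_version() >= v"11.2" && can_use_cupti()
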
[skip julia] [skip cuda] [skip subpackages] [skip benchmarks] --- test/core/profile.jl | 2 +- test/setup.jl | 9 +-------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/test/core/profile.jl b/test/core/profile.jl index 933ba09137..9ac5d41208 100644 --- a/test/core/profile.jl +++ b/test/core/profile.jl @@ -13,7 +13,7 @@ end ############################################################################################ -if VERSION >= v"1.9" && CUDA.runtime_version() >= v"11.2" +if VERSION >= v"1.9" && CUDA.runtime_version() >= v"11.2" && can_use_cupti() @testset "integrated" begin # smoke test diff --git a/test/setup.jl b/test/setup.jl index c3349805cd..f8a01981a1 100644 --- a/test/setup.jl +++ b/test/setup.jl @@ -11,15 +11,8 @@ testf(f, xs...; kwargs...) = TestSuite.compare(f, CuArray, xs...; kwargs...) using Random -# detect compute-sanitizer, to disable incompatible tests (e.g. using CUPTI), -# and to skip tests that are known to generate innocuous API errors +# detect compute-sanitizer, to disable incompatible tests (e.g. using CUPTI) const sanitize = any(contains("NV_SANITIZER"), keys(ENV)) -macro not_if_sanitize(ex) - sanitize || return esc(ex) - quote - @test_skip $ex - end -end # in addition, CUPTI is not available on older GPUs with recent CUDA toolkits function can_use_cupti()