On CUDA 12.2, have the memory pool enforce hard memory limits. #2040

Merged · 2 commits · Aug 17, 2023
Changes from all commits
3 changes: 2 additions & 1 deletion docs/src/usage/memory.md
```diff
@@ -131,7 +131,8 @@ device memory. You can change this using two environment variables:
   synchronization points, so memory use may temporarily exceed this limit. In addition,
   this limit is incompatible with `JULIA_CUDA_MEMORY_POOL=none`.
 * `JULIA_CUDA_HARD_MEMORY_LIMIT`: This is a hard limit, checked before every allocation.
-  This incurs a certain cost, so it is recommended to first try to use the soft limit.
+  On older versions of CUDA, before v12.2, this is a relatively expensive limit, so it is
+  recommended to first try to use the soft limit.
 
 The value of these variables can be formatted as a number of bytes, optionally followed by
 a unit, or as a percentage of the total device memory. Examples: `100M`, `50%`, `1.5GiB`,
```
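For illustration, a minimal sketch of how these limits are configured (values are examples only; the variable has to be set before CUDA.jl initializes, since the pool is configured lazily on first allocation):

```julia
# any of the documented formats work: a byte count, a unit suffix, or a percentage
ENV["JULIA_CUDA_HARD_MEMORY_LIMIT"] = "1.5GiB"   # e.g. "100M", "50%", "1610612736"

using CUDA

# the limit takes effect when the device's pool is first configured,
# i.e. on the first allocation
a = CUDA.rand(1024)
```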
6 changes: 3 additions & 3 deletions lib/cudadrv/pool.jl
```diff
@@ -30,9 +30,9 @@ mutable struct CuMemoryPool
         cuMemPoolCreate(handle_ref, props)
 
         ctx = current_context()
-        obj = new(handle_ref[], ctx)
-        finalizer(unsafe_destroy!, obj)
-        return obj
+        new(handle_ref[], ctx)
+        # NOTE: we cannot attach a finalizer to this object, as the pool can be active
+        #       without any references to it (similar to how contexts work).
     end
 
     global function default_memory_pool(dev::CuDevice)
```
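Because the finalizer is gone, code that creates a replacement pool now owns its lifetime. A minimal sketch of the implied pattern, using only calls visible in this diff (`CuMemoryPool`, `memory_pool!`, `default_memory_pool`, `unsafe_destroy!`):

```julia
using CUDA

dev = device()

# create a pool capped at 1 GiB; on CUDA 12.2+ the driver enforces maxSize itself
pool = CuMemoryPool(dev; maxSize=UInt64(2^30))
memory_pool!(dev, pool)

# ... allocations on `dev` are now served from `pool` ...

# no finalizer will clean this up, so restore the default pool and
# destroy the replacement explicitly when done
memory_pool!(dev, default_memory_pool(dev))
CUDA.unsafe_destroy!(pool)
```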
38 changes: 27 additions & 11 deletions src/pool.jl
```diff
@@ -57,12 +57,13 @@ const alloc_stats = AllocStats()
 ## CUDA allocator
 
 function actual_alloc(bytes::Integer; async::Bool=false,
-                      stream::Union{CuStream,Nothing}=nothing)
+                      stream::Union{CuStream,Nothing}=nothing,
+                      pool::Union{CuMemoryPool,Nothing}=nothing)
     memory_limit_exceeded(bytes) && return nothing
 
     # try the actual allocation
     buf = try
-        Mem.alloc(Mem.Device, bytes; async, stream)
+        Mem.alloc(Mem.Device, bytes; async, stream, pool)
     catch err
         isa(err, OutOfGPUMemoryError) || rethrow()
         return nothing
```
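For context, the new keyword is simply forwarded to the driver allocation. A hypothetical direct call of the internal `Mem` API, shown only to clarify the plumbing (the `stream` kwarg on `free` is an assumption about this internal interface):

```julia
using CUDA

# asynchronously allocate 1 KiB from an explicit pool on the current stream
buf = CUDA.Mem.alloc(CUDA.Mem.Device, 1024; async=true,
                     stream=stream(), pool=memory_pool(device()))
CUDA.Mem.free(buf; stream=stream())
```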
```diff
@@ -136,8 +137,10 @@ function memory_limit_exceeded(bytes::Integer)
     limit.hard > 0 || return false
 
     dev = device()
-    used_bytes = if stream_ordered(dev) && driver_version() >= v"11.3"
-        # XXX: this should be done by the allocator itself (NVIDIA bug #3503815).
+    used_bytes = if stream_ordered(dev) && driver_version() >= v"12.2"
+        # we configured the memory pool to do this for us
+        return false
+    elseif stream_ordered(dev) && driver_version() >= v"11.3"
         pool = memory_pool(dev)
         Int(attribute(UInt64, pool, MEMPOOL_ATTR_RESERVED_MEM_CURRENT))
     else
```
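The attribute read in the intermediate branch (driver ≥ 11.3 but < 12.2) can be reproduced at the REPL; a sketch, assuming a stream-ordered memory pool is active:

```julia
using CUDA

pool = memory_pool(device())
reserved = Int(attribute(UInt64, pool, CUDA.MEMPOOL_ATTR_RESERVED_MEM_CURRENT))
@info "pool currently reserves $reserved bytes of device memory"
```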
```diff
@@ -171,9 +174,20 @@ end
 function pool_mark(dev::CuDevice)
     status = pool_status(dev)
     if status[] === nothing
-        # allow the pool to use up all memory of this device
         limits = memory_limits()
-        attribute!(memory_pool(dev), MEMPOOL_ATTR_RELEASE_THRESHOLD,
+
+        # configure our memory pool
+        # XXX: is it OK to replace the pool like this? we need to, for setting limits.
+        #      how do we get other applications/libraries to use it?
+        pool = if limits.hard > 0 && CUDA.driver_version() >= v"12.2"
+            CuMemoryPool(dev; maxSize=limits.hard)
+        else
+            CuMemoryPool(dev)
+        end
+        memory_pool!(dev, pool)
+
+        # allow the pool to use up all memory of this device
+        attribute!(pool, MEMPOOL_ATTR_RELEASE_THRESHOLD,
                    limits.soft == 0 ? typemax(UInt64) : limits.soft)
 
         # launch a task to periodically trim the pool
@@ -184,9 +198,11 @@ function pool_mark(dev::CuDevice)
             errormonitor(Threads.@spawn pool_cleanup())
         end
-    end
+    else
+        pool = memory_pool(dev)
+    end
     status[] = true
-    return
+    return pool
 end
 
 # reclaim unused pool memory after a certain time
```
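Note the changed contract: `pool_mark` now returns the active `CuMemoryPool` (creating and installing it on first use) rather than `nothing`. A hypothetical REPL check of the resulting configuration, using this internal API and assuming no soft limit is set (so the release threshold defaults to `typemax(UInt64)`):

```julia
using CUDA

pool = CUDA.pool_mark(device())  # first call configures and installs the pool
threshold = attribute(UInt64, pool, CUDA.MEMPOOL_ATTR_RELEASE_THRESHOLD)
@assert threshold == typemax(UInt64)  # no soft memory limit configured
```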
```diff
@@ -415,18 +431,18 @@ an [`OutOfGPUMemoryError`](@ref) if the allocation request cannot be satisfied.
 end
 @inline function _alloc(::Type{Mem.DeviceBuffer}, sz; stream::Union{Nothing,CuStream})
     state = active_state()
+    stream = something(stream, state.stream)
 
     gctime = 0.0
     time = Base.@elapsed begin
         buf = if stream_ordered(state.device)
-            pool_mark(state.device) # mark the pool as active
-            stream = something(stream, state.stream)
+            pool = pool_mark(state.device) # mark the pool as active
             retry_reclaim(isnothing) do
-                actual_alloc(sz; async=true, stream)
+                actual_alloc(sz; async=true, stream, pool)
             end
         else
             retry_reclaim(isnothing) do
-                actual_alloc(sz; async=false, stream)
+                actual_alloc(sz; async=false)
             end
         end
         buf === nothing && throw(OutOfGPUMemoryError(sz))
```
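End to end, the user-visible behavior is unchanged across driver versions; only the enforcement mechanism differs (pool `maxSize` on 12.2+, a pre-allocation check before that). A sketch of the error path, with an illustrative limit value:

```julia
ENV["JULIA_CUDA_HARD_MEMORY_LIMIT"] = "1GiB"  # before CUDA.jl initializes

using CUDA

a = CuArray{UInt8}(undef, 256 << 20)    # 256 MiB: fits under the limit
try
    b = CuArray{UInt8}(undef, 2 << 30)  # 2 GiB: exceeds the hard limit
catch err
    @assert err isa CUDA.OutOfGPUMemoryError
end
```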
2 changes: 2 additions & 0 deletions test/core/cudadrv.jl
```diff
@@ -818,6 +818,8 @@ memory_pool!(dev, pool)
 attribute!(pool2, CUDA.MEMPOOL_ATTR_RELEASE_THRESHOLD, UInt64(2^30))
 @test attribute(UInt64, pool2, CUDA.MEMPOOL_ATTR_RELEASE_THRESHOLD) == 2^30
 
+CUDA.unsafe_destroy!(pool2)
+
 end
 
 end
```
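The explicit `unsafe_destroy!` follows from the lib/cudadrv/pool.jl change above: without a finalizer, replacement pools created in tests (or user code) must be torn down manually.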