On CUDA 12.2, have the memory pool enforce hard memory limits. #2040

Merged · 2 commits · Aug 17, 2023
Changes from all commits
3 changes: 2 additions & 1 deletion docs/src/usage/memory.md
```diff
@@ -131,7 +131,8 @@ device memory. You can change this using two environment variables:
   synchronization points, so memory use may temporarily exceed this limit. In addition,
   this limit is incompatible with `JULIA_CUDA_MEMORY_POOL=none`.
 * `JULIA_CUDA_HARD_MEMORY_LIMIT`: This is a hard limit, checked before every allocation.
-  This incurs a certain cost, so it is recommended to first try to use the soft limit.
+  On older versions of CUDA, before v12.2, this is a relatively expensive limit, so it is
+  recommended to first try to use the soft limit.
 
 The value of these variables can be formatted as a number of bytes, optionally followed by
 a unit, or as a percentage of the total device memory. Examples: `100M`, `50%`, `1.5GiB`,
```
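For illustration, a minimal sketch of how these limits are configured (values are examples only; the variable has to be set before CUDA.jl initializes, since the pool is configured lazily on first allocation):

```julia
# any of the documented formats work: a byte count, a unit suffix, or a percentage
ENV["JULIA_CUDA_HARD_MEMORY_LIMIT"] = "1.5GiB"   # e.g. "100M", "50%", "1610612736"

using CUDA

# the limit takes effect when the device's pool is first configured,
# i.e. on the first allocation
a = CUDA.rand(1024)
```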
6 changes: 3 additions & 3 deletions lib/cudadrv/pool.jl
```diff
@@ -30,9 +30,9 @@ mutable struct CuMemoryPool
         cuMemPoolCreate(handle_ref, props)
 
         ctx = current_context()
-        obj = new(handle_ref[], ctx)
-        finalizer(unsafe_destroy!, obj)
-        return obj
+        new(handle_ref[], ctx)
+        # NOTE: we cannot attach a finalizer to this object, as the pool can be active
+        #       without any references to it (similar to how contexts work).
     end
 
     global function default_memory_pool(dev::CuDevice)
```
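Because the finalizer is gone, code that creates a replacement pool now owns its lifetime. A minimal sketch of the implied pattern, using only calls visible in this diff (`CuMemoryPool`, `memory_pool!`, `default_memory_pool`, `unsafe_destroy!`):

```julia
using CUDA

dev = device()

# create a pool capped at 1 GiB; on CUDA 12.2+ the driver enforces maxSize itself
pool = CuMemoryPool(dev; maxSize=UInt64(2^30))
memory_pool!(dev, pool)

# ... allocations on `dev` are now served from `pool` ...

# no finalizer will clean this up, so restore the default pool and
# destroy the replacement explicitly when done
memory_pool!(dev, default_memory_pool(dev))
CUDA.unsafe_destroy!(pool)
```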
38 changes: 27 additions & 11 deletions src/pool.jl
```diff
@@ -57,12 +57,13 @@ const alloc_stats = AllocStats()
 ## CUDA allocator
 
 function actual_alloc(bytes::Integer; async::Bool=false,
-                      stream::Union{CuStream,Nothing}=nothing)
+                      stream::Union{CuStream,Nothing}=nothing,
+                      pool::Union{CuMemoryPool,Nothing}=nothing)
     memory_limit_exceeded(bytes) && return nothing
 
     # try the actual allocation
     buf = try
-        Mem.alloc(Mem.Device, bytes; async, stream)
+        Mem.alloc(Mem.Device, bytes; async, stream, pool)
     catch err
         isa(err, OutOfGPUMemoryError) || rethrow()
         return nothing
```
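For context, the new keyword is simply forwarded to the driver allocation. A hypothetical direct call of the internal `Mem` API, shown only to clarify the plumbing (the `stream` kwarg on `free` is an assumption about this internal interface):

```julia
using CUDA

# asynchronously allocate 1 KiB from an explicit pool on the current stream
buf = CUDA.Mem.alloc(CUDA.Mem.Device, 1024; async=true,
                     stream=stream(), pool=memory_pool(device()))
CUDA.Mem.free(buf; stream=stream())
```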
```diff
@@ -136,8 +137,10 @@ function memory_limit_exceeded(bytes::Integer)
     limit.hard > 0 || return false
 
     dev = device()
-    used_bytes = if stream_ordered(dev) && driver_version() >= v"11.3"
-        # XXX: this should be done by the allocator itself (NVIDIA bug #3503815).
+    used_bytes = if stream_ordered(dev) && driver_version() >= v"12.2"
+        # we configured the memory pool to do this for us
+        return false
+    elseif stream_ordered(dev) && driver_version() >= v"11.3"
         pool = memory_pool(dev)
         Int(attribute(UInt64, pool, MEMPOOL_ATTR_RESERVED_MEM_CURRENT))
     else
```
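The attribute read in the intermediate branch (driver ≥ 11.3 but < 12.2) can be reproduced at the REPL; a sketch, assuming a stream-ordered memory pool is active:

```julia
using CUDA

pool = memory_pool(device())
reserved = Int(attribute(UInt64, pool, CUDA.MEMPOOL_ATTR_RESERVED_MEM_CURRENT))
@info "pool currently reserves $reserved bytes of device memory"
```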
```diff
@@ -171,9 +174,20 @@ end
 function pool_mark(dev::CuDevice)
     status = pool_status(dev)
     if status[] === nothing
-        # allow the pool to use up all memory of this device
         limits = memory_limits()
-        attribute!(memory_pool(dev), MEMPOOL_ATTR_RELEASE_THRESHOLD,
+
+        # configure our memory pool
+        # XXX: is it OK to replace the pool like this? we need to, for setting limits.
+        #      how do we get other applications/libraries to use it?
+        pool = if limits.hard > 0 && CUDA.driver_version() >= v"12.2"
+            CuMemoryPool(dev; maxSize=limits.hard)
+        else
+            CuMemoryPool(dev)
+        end
+        memory_pool!(dev, pool)
+
+        # allow the pool to use up all memory of this device
+        attribute!(pool, MEMPOOL_ATTR_RELEASE_THRESHOLD,
                    limits.soft == 0 ? typemax(UInt64) : limits.soft)
 
         # launch a task to periodically trim the pool
@@ -184,9 +198,11 @@ function pool_mark(dev::CuDevice)
             errormonitor(Threads.@spawn pool_cleanup())
         end
-    end
+    else
+        pool = memory_pool(dev)
+    end
     status[] = true
-    return
+    return pool
 end
 
 # reclaim unused pool memory after a certain time
```
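Note the changed contract: `pool_mark` now returns the active `CuMemoryPool` (creating and installing it on first use) rather than `nothing`. A hypothetical REPL check of the resulting configuration, using this internal API and assuming no soft limit is set (so the release threshold defaults to `typemax(UInt64)`):

```julia
using CUDA

pool = CUDA.pool_mark(device())  # first call configures and installs the pool
threshold = attribute(UInt64, pool, CUDA.MEMPOOL_ATTR_RELEASE_THRESHOLD)
@assert threshold == typemax(UInt64)  # no soft memory limit configured
```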
```diff
@@ -415,18 +431,18 @@ an [`OutOfGPUMemoryError`](@ref) if the allocation request cannot be satisfied.
 end
 @inline function _alloc(::Type{Mem.DeviceBuffer}, sz; stream::Union{Nothing,CuStream})
     state = active_state()
+    stream = something(stream, state.stream)
 
     gctime = 0.0
     time = Base.@elapsed begin
         buf = if stream_ordered(state.device)
-            pool_mark(state.device) # mark the pool as active
-            stream = something(stream, state.stream)
+            pool = pool_mark(state.device) # mark the pool as active
             retry_reclaim(isnothing) do
-                actual_alloc(sz; async=true, stream)
+                actual_alloc(sz; async=true, stream, pool)
             end
         else
             retry_reclaim(isnothing) do
-                actual_alloc(sz; async=false, stream)
+                actual_alloc(sz; async=false)
             end
         end
         buf === nothing && throw(OutOfGPUMemoryError(sz))
```
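End to end, the user-visible behavior is unchanged across driver versions; only the enforcement mechanism differs (pool `maxSize` on 12.2+, a pre-allocation check before that). A sketch of the error path, with an illustrative limit value:

```julia
ENV["JULIA_CUDA_HARD_MEMORY_LIMIT"] = "1GiB"  # before CUDA.jl initializes

using CUDA

a = CuArray{UInt8}(undef, 256 << 20)    # 256 MiB: fits under the limit
try
    b = CuArray{UInt8}(undef, 2 << 30)  # 2 GiB: exceeds the hard limit
catch err
    @assert err isa CUDA.OutOfGPUMemoryError
end
```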
2 changes: 2 additions & 0 deletions test/core/cudadrv.jl
```diff
@@ -818,6 +818,8 @@ memory_pool!(dev, pool)
 attribute!(pool2, CUDA.MEMPOOL_ATTR_RELEASE_THRESHOLD, UInt64(2^30))
 @test attribute(UInt64, pool2, CUDA.MEMPOOL_ATTR_RELEASE_THRESHOLD) == 2^30
 
+CUDA.unsafe_destroy!(pool2)
+
 end
 
 end
```
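The explicit `unsafe_destroy!` follows from the lib/cudadrv/pool.jl change above: without a finalizer, replacement pools created in tests (or user code) must be torn down manually.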