From 5b9ea8ac25946c4eb96c7278ec4e8fd828e4eef1 Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Mon, 28 Nov 2022 19:55:04 +0100 Subject: [PATCH 1/2] kernels: Add scratch alloc limiter --- src/runtime/kernel.jl | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/runtime/kernel.jl b/src/runtime/kernel.jl index d27130948..93cb34e8d 100644 --- a/src/runtime/kernel.jl +++ b/src/runtime/kernel.jl @@ -115,9 +115,33 @@ function ROCKernel(kernel #= ::HostKernel =#; localmem::Int=0) group_segment_size = executable_symbol_kernel_group_segment_size(exec_symbol) group_segment_size = UInt32(max(group_segment_size, localmem)) private_segment_size = executable_symbol_kernel_private_segment_size(exec_symbol) + if private_segment_size > MAXIMUM_SCRATCH_ALLOCATION + @debug "Excessive scratch allocation requested\nReducing per-lane scratch to $(Int(MAXIMUM_SCRATCH_ALLOCATION)) bytes" + private_segment_size = MAXIMUM_SCRATCH_ALLOCATION + end kernel = ROCKernel(device, exe, symbol, kernel_object, kernarg_segment_size, group_segment_size, private_segment_size, Ptr{Cvoid}(0)) return kernel end + +"Sets the maximum amount of per-lane scratch memory that can be allocated for a +kernel. Consider setting this to a value below 2^14 if encountering +`QueueError`s with the `HSA.STATUS_ERROR_OUT_OF_RESOURCES` code." +set_max_scratch!(scratch::Integer) = + @set_preferences!("max_scratch"=>scratch) +const MAXIMUM_SCRATCH_ALLOCATION = let + if haskey(ENV, "JULIA_AMDGPU_MAX_SCRATCH") + scratch = ENV["JULIA_AMDGPU_MAX_SCRATCH"] + scratch = if uppercase(scratch) == "MAX" + typemax(UInt32) + else + parse(UInt32, scratch) + end + set_max_scratch!(scratch) + scratch + else + UInt32(@load_preference("max_scratch", 8192)) + end +end::UInt32 From b03d1558958293d23766aa3dc9d07ae42da2ae0d Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Wed, 7 Dec 2022 13:20:45 -0600 Subject: [PATCH 2/2] Mem: Add allocation limiter --- src/runtime/memory.jl | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/runtime/memory.jl b/src/runtime/memory.jl index 4009fc000..31135fe6b 100644 --- a/src/runtime/memory.jl +++ b/src/runtime/memory.jl @@ -294,6 +294,19 @@ const USE_HIP_MALLOC_OVERRIDE = let end end +"Sets a limit for total GPU memory allocations." +set_memory_alloc_limit!(limit::Integer) = + @set_preferences!("memory_alloc_limit" => limit) +const MEMORY_ALLOC_LIMIT = let + if haskey(ENV, "JULIA_AMDGPU_MEMORY_ALLOC_LIMIT") + limit = parse(Int, ENV["JULIA_AMDGPU_MEMORY_ALLOC_LIMIT"]) + set_memory_alloc_limit!(limit) + limit + else + @load_preference("memory_alloc_limit", typemax(Int)) + end +end + """ alloc(bytesize::Integer; coherent=false) -> Buffer @@ -388,20 +401,29 @@ function alloc_or_retry!(f) end end end +const ALL_ALLOCS = Threads.Atomic{Int64}(0) function alloc(device::ROCDevice, pool::ROCMemoryPool, bytesize::Integer) + if ALL_ALLOCS[] + bytesize > MEMORY_ALLOC_LIMIT + check(HSA.STATUS_ERROR_OUT_OF_RESOURCES) + end ptr_ref = Ref{Ptr{Cvoid}}() alloc_or_retry!() do HSA.amd_memory_pool_allocate(pool.pool, bytesize, 0, ptr_ref) end + Threads.atomic_add!(ALL_ALLOCS, Int64(bytesize)) AMDGPU.hsaref!() ptr = ptr_ref[] return Buffer(ptr, C_NULL, ptr, Int64(bytesize), device, Runtime.pool_accessible_by_all(pool), true) end function alloc(device::ROCDevice, region::ROCMemoryRegion, bytesize::Integer) + if ALL_ALLOCS[] + bytesize > MEMORY_ALLOC_LIMIT + check(HSA.STATUS_ERROR_OUT_OF_RESOURCES) + end ptr_ref = Ref{Ptr{Cvoid}}() alloc_or_retry!() do HSA.memory_allocate(region.region, bytesize, ptr_ref) end + Threads.atomic_add!(ALL_ALLOCS, Int64(bytesize)) AMDGPU.hsaref!() ptr = ptr_ref[] return Buffer(ptr, C_NULL, ptr, Int64(bytesize), device, Runtime.region_host_accessible(region), false) @@ -442,6 +464,7 @@ function free(buf::Buffer) else memory_check(HSA.amd_memory_pool_free(buf.base_ptr), buf.base_ptr) end + Threads.atomic_sub!(ALL_ALLOCS, Int64(buf.bytesize)) else memory_check(HSA.memory_free(buf.base_ptr), buf.base_ptr) end