From 5b9ea8ac25946c4eb96c7278ec4e8fd828e4eef1 Mon Sep 17 00:00:00 2001
From: Julian P Samaroo <jpsamaroo@jpsamaroo.me>
Date: Mon, 28 Nov 2022 19:55:04 +0100
Subject: [PATCH 1/2] kernels: Add scratch alloc limiter

---
 src/runtime/kernel.jl | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/src/runtime/kernel.jl b/src/runtime/kernel.jl
index d27130948..93cb34e8d 100644
--- a/src/runtime/kernel.jl
+++ b/src/runtime/kernel.jl
@@ -115,9 +115,33 @@ function ROCKernel(kernel #= ::HostKernel =#; localmem::Int=0)
     group_segment_size = executable_symbol_kernel_group_segment_size(exec_symbol)
     group_segment_size = UInt32(max(group_segment_size, localmem))
     private_segment_size = executable_symbol_kernel_private_segment_size(exec_symbol)
+    if private_segment_size > MAXIMUM_SCRATCH_ALLOCATION
+        @debug "Excessive scratch allocation requested\nReducing per-lane scratch to $(Int(MAXIMUM_SCRATCH_ALLOCATION)) bytes"
+        private_segment_size = MAXIMUM_SCRATCH_ALLOCATION
+    end
 
     kernel = ROCKernel(device, exe, symbol, kernel_object,
                        kernarg_segment_size, group_segment_size,
                        private_segment_size, Ptr{Cvoid}(0))
     return kernel
 end
+
+"Sets the maximum amount of per-lane scratch memory that can be allocated for a
+kernel. Consider setting this to a value below 2^14 if encountering
+`QueueError`s with the `HSA.STATUS_ERROR_OUT_OF_RESOURCES` code."
+set_max_scratch!(scratch::Integer) =
+    @set_preferences!("max_scratch"=>scratch) # same "max_scratch" key that MAXIMUM_SCRATCH_ALLOCATION loads
+const MAXIMUM_SCRATCH_ALLOCATION = let # per-lane scratch cap (bytes, per the @debug message in ROCKernel)
+    if haskey(ENV, "JULIA_AMDGPU_MAX_SCRATCH") # the environment variable takes priority over the stored preference
+        scratch = ENV["JULIA_AMDGPU_MAX_SCRATCH"]
+        scratch = if uppercase(scratch) == "MAX" # "max" in any letter case selects the largest UInt32
+            typemax(UInt32)
+        else
+            parse(UInt32, scratch) # any other value must parse as a UInt32
+        end
+        set_max_scratch!(scratch) # also write the env-provided value to the "max_scratch" preference
+        scratch
+    else
+        UInt32(@load_preference("max_scratch", 8192)) # no env override: stored preference, else 8192
+    end
+end::UInt32 # type-assert the value produced by either branch

From b03d1558958293d23766aa3dc9d07ae42da2ae0d Mon Sep 17 00:00:00 2001
From: Julian P Samaroo <jpsamaroo@jpsamaroo.me>
Date: Wed, 7 Dec 2022 13:20:45 -0600
Subject: [PATCH 2/2] Mem: Add allocation limiter

---
 src/runtime/memory.jl | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/src/runtime/memory.jl b/src/runtime/memory.jl
index 4009fc000..31135fe6b 100644
--- a/src/runtime/memory.jl
+++ b/src/runtime/memory.jl
@@ -294,6 +294,19 @@ const USE_HIP_MALLOC_OVERRIDE = let
     end
 end
 
+"Sets a limit for total GPU memory allocations."
+set_memory_alloc_limit!(limit::Integer) =
+    @set_preferences!("memory_alloc_limit" => limit) # same key that MEMORY_ALLOC_LIMIT loads
+const MEMORY_ALLOC_LIMIT = let # ceiling that alloc compares ALL_ALLOCS[] + bytesize against
+    if haskey(ENV, "JULIA_AMDGPU_MEMORY_ALLOC_LIMIT") # env var takes priority over the stored preference
+        limit = parse(Int, ENV["JULIA_AMDGPU_MEMORY_ALLOC_LIMIT"])
+        set_memory_alloc_limit!(limit) # also write the env-provided value to the preference
+        limit
+    else
+        @load_preference("memory_alloc_limit", typemax(Int)) # default is typemax(Int), presumably meaning "unlimited"
+    end
+end
+
 """
     alloc(bytesize::Integer; coherent=false) -> Buffer
 
@@ -388,20 +401,29 @@ function alloc_or_retry!(f)
         end
     end
 end
+const ALL_ALLOCS = Threads.Atomic{Int64}(0) # bytes counted toward MEMORY_ALLOC_LIMIT. NOTE(review): region allocs add to this, but the visible free hunk subtracts only on the amd_memory_pool_free path — confirm
 function alloc(device::ROCDevice, pool::ROCMemoryPool, bytesize::Integer)
+    if ALL_ALLOCS[] + bytesize > MEMORY_ALLOC_LIMIT # would this request exceed the configured limit?
+        check(HSA.STATUS_ERROR_OUT_OF_RESOURCES) # presumably throws for this status — confirm against check
+    end
     ptr_ref = Ref{Ptr{Cvoid}}()
     alloc_or_retry!() do
         HSA.amd_memory_pool_allocate(pool.pool, bytesize, 0, ptr_ref)
     end
+    Threads.atomic_add!(ALL_ALLOCS, Int64(bytesize)) # NOTE(review): separate from the check above, so concurrent callers may overshoot
     AMDGPU.hsaref!()
     ptr = ptr_ref[]
     return Buffer(ptr, C_NULL, ptr, Int64(bytesize), device, Runtime.pool_accessible_by_all(pool), true)
 end
 function alloc(device::ROCDevice, region::ROCMemoryRegion, bytesize::Integer)
+    if ALL_ALLOCS[] + bytesize > MEMORY_ALLOC_LIMIT
+        check(HSA.STATUS_ERROR_OUT_OF_RESOURCES)
+    end
     ptr_ref = Ref{Ptr{Cvoid}}()
     alloc_or_retry!() do
         HSA.memory_allocate(region.region, bytesize, ptr_ref)
     end
+    Threads.atomic_add!(ALL_ALLOCS, Int64(bytesize))
     AMDGPU.hsaref!()
     ptr = ptr_ref[]
     return Buffer(ptr, C_NULL, ptr, Int64(bytesize), device, Runtime.region_host_accessible(region), false)
@@ -442,6 +464,7 @@ function free(buf::Buffer)
             else
                 memory_check(HSA.amd_memory_pool_free(buf.base_ptr), buf.base_ptr)
             end
+            Threads.atomic_sub!(ALL_ALLOCS, Int64(buf.bytesize))
         else
             memory_check(HSA.memory_free(buf.base_ptr), buf.base_ptr)
         end