
Commit

Merge pull request #46 from CliMA/ck/parameter_mem
Add new broken test for parameter memory
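Fusing all 32 broadcast expressions below into a single kernel means the launch presumably carries every destination/broadcast pair in its parameters, exceeding CUDA's kernel parameter limit (historically about 4 KiB). The test therefore expects CUDA.jl to reject the launch with an error starting with "Kernel invocation uses too much parameter memory."; the case is "broken" in the sense that the fused kernel cannot currently be launched on a GPU.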
charleskawczynski authored Oct 9, 2024
2 parents bbc5c4a + 7e6e11d commit 96c32a0
Showing 2 changed files with 84 additions and 0 deletions.
83 changes: 83 additions & 0 deletions test/execution/parameter_memory.jl
@@ -0,0 +1,83 @@
#=
using TestEnv
TestEnv.activate()
using CUDA # (optional)
using Revise; include(joinpath("test", "execution", "parameter_memory.jl"))
=#

include("utils_test.jl")
include("utils_setup.jl")
include("utils_benchmark.jl")

import MultiBroadcastFusion as MBF

#! format: off
function perf_kernel_shared_reads_fused!(X, Y)
(; x1, x2, x3, x4) = X
(; y1, y2, y3, y4) = Y
# TODO: can we write this more compactly with `@fused_assemble`?

# Let's make sure that every broadcasted object is different,
# so that we use up a lot of parameter memory:
MBF.@fused_direct begin
@. y1 = x1
@. y2 = x1 + x2
@. y3 = x1 + x2 + x3
@. y4 = x1 * x2 + x3 + x4
@. y1 = x1 * x2 + x3 + x4 + x1
@. y2 = x1 * x2 + x3 + x4 + x1 + x2
@. y3 = x1 * x2 + x3 + x4 + x1 + x2 + x3
@. y4 = x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4
@. y1 = x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1
@. y2 = x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 + x2
@. y3 = x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 + x2 + x3
@. y4 = x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4
@. y1 = x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1
@. y2 = x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 + x2
@. y3 = x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 + x2 + x3
@. y4 = x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4
@. y1 = x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1
@. y2 = x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 + x2
@. y3 = x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 + x2 + x3
@. y4 = x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4
@. y1 = x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1
@. y2 = x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 + x2
@. y3 = x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 + x2 + x3
@. y4 = x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4
@. y1 = x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1
@. y2 = x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 + x2
@. y3 = x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 + x2 + x3
@. y4 = x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4
@. y1 = x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1
@. y2 = x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 + x2
@. y3 = x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 + x2 + x3
@. y4 = x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4 + x1 * x2 + x3 + x4
end
end
#! format: on
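# Each of the 32 fused expressions above captures its own array arguments in
# the broadcast object handed to the fused kernel, so every additional
# (distinct) expression grows the kernel's launch parameters.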

@static get(ENV, "USE_CUDA", nothing) == "true" && using CUDA
use_cuda = @isdefined(CUDA) && CUDA.has_cuda() # will be true if you first run `using CUDA`
AType = use_cuda ? CUDA.CuArray : Array
device_name = use_cuda ? CUDA.name(CUDA.device()) : "CPU"
bm = Benchmark(; device_name, float_type = Float32)
problem_size = (50, 5, 5, 6, 5400)

array_size = problem_size # array
X = get_arrays(:x, AType, bm.float_type, array_size)
Y = get_arrays(:y, AType, bm.float_type, array_size)
@testset "Test breaking case with parameter memory" begin
if use_cuda
try
perf_kernel_shared_reads_fused!(X, Y)
error("The above kernel should error")
catch e
@test startswith(
e.msg,
"Kernel invocation uses too much parameter memory.",
)
end
end
end
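#=
A rough gauge of the parameter cost, assuming CUDA.jl's `cudaconvert` (the
conversion `@cuda` applies to kernel arguments): each device-side array
descriptor is an isbits struct, so its `sizeof` is what every reference to
that array contributes to the fused launch parameters, e.g.

    sizeof(CUDA.cudaconvert(X.x1))

Summed over all array references captured by the 32 fused expressions, this
exceeds the limit that triggers the error tested above.
=#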

nothing
1 change: 1 addition & 0 deletions test/execution/runtests.jl
@@ -6,4 +6,5 @@ using Revise; include(joinpath("test", "execution", "runtests.jl"))
@safetestset "fused_shared_reads" begin; @time include("bm_fused_shared_reads.jl"); end
@safetestset "fused_shared_reads_writes" begin; @time include("bm_fused_shared_reads_writes.jl"); end
@safetestset "bm_fused_reads_vs_hard_coded" begin; @time include("bm_fused_reads_vs_hard_coded.jl"); end
@safetestset "parameter_memory" begin; @time include("parameter_memory.jl"); end
#! format: on
