[do not merge] Test #464 without KA commit #468

Closed · wants to merge 14 commits
19 changes: 14 additions & 5 deletions .buildkite/pipeline.yml
@@ -38,7 +38,7 @@ steps:
# soft_fail: true

# special tests
- group: ":eyes: Special"
- group: ":floppy_disk: Storage Modes"
depends_on: "julia"
steps:
- label: "{{matrix.storage}} array storage"
@@ -53,10 +53,10 @@
arch: "aarch64"
if: |
build.message =~ /\[only tests\]/ ||
build.message =~ /\[only special\]/ ||
build.message =~ /\[only storage\]/ ||
build.message !~ /\[only/ && !build.pull_request.draft &&
build.message !~ /\[skip tests\]/ &&
build.message !~ /\[skip special\]/
build.message !~ /\[skip storage\]/
timeout_in_minutes: 60
matrix:
setup:
@@ -65,13 +65,20 @@
- "managed"
commands: |
echo -e "[Metal]\ndefault_storage = \"{{matrix.storage}}\"" >LocalPreferences.toml

# special tests
- group: ":eyes: Special"
depends_on: "julia"
steps:
- label: "API validation"
soft_fail: true
plugins:
- JuliaCI/julia#v1:
version: "1.10"
- JuliaCI/julia-test#v1:
test_args: "--quickfail"
# only enabled for select tests due to JuliaGPU/Metal.jl#34
# test_args: "--quickfail"
test_args: ""
# Don't quickfail to see which ones fail
- JuliaCI/julia-coverage#v1:
codecov: true
dirs:
@@ -88,9 +95,11 @@
macos_version: "15.0"
if: |
build.message =~ /\[only tests\]/ ||
build.message =~ /\[only validation\]/ ||
build.message =~ /\[only special\]/ ||
build.message !~ /\[only/ && !build.pull_request.draft &&
build.message !~ /\[skip tests\]/ &&
build.message !~ /\[skip validation\]/ &&
build.message !~ /\[skip special\]/
timeout_in_minutes: 60
- label: "Opaque pointers"
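For reference, a minimal sketch of what the storage-mode step above writes to LocalPreferences.toml; the "shared" value is a hypothetical stand-in for the {{matrix.storage}} substitution.

```julia
# Reproduce the pipeline's `echo` step locally (sketch; "shared" is hypothetical,
# CI substitutes {{matrix.storage}}).
storage = "shared"
open("LocalPreferences.toml", "w") do io
    println(io, "[Metal]")
    println(io, "default_storage = \"$storage\"")
end
print(read("LocalPreferences.toml", String))
```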
2 changes: 1 addition & 1 deletion Project.toml
@@ -40,7 +40,7 @@ BFloat16s = "0.5"
CEnum = "0.4, 0.5"
CodecBzip2 = "0.8"
ExprTools = "0.1"
GPUArrays = "11"
GPUArrays = "10.1"
GPUCompiler = "0.26, 0.27, 1"
KernelAbstractions = "0.9.1"
LLVM = "7.2, 8, 9"
5 changes: 1 addition & 4 deletions lib/mtl/library.jl
@@ -23,13 +23,10 @@ end

function MTLLibraryFromFile(dev::MTLDevice, path::String)
err = Ref{id{NSError}}(nil)
handle = if Metal.macos_version() >= v"13"
handle = let
url = NSFileURL(path)
@objc [dev::id{MTLDevice} newLibraryWithURL:url::id{NSURL}
error:err::Ptr{id{NSError}}]::id{MTLLibrary}
else
@objc [dev::id{MTLDevice} newLibraryWithFile:path::id{NSString}
error:err::Ptr{id{NSError}}]::id{MTLLibrary}
end
err[] == nil || throw(NSError(err[]))

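A small usage sketch of the simplified loader above, which now always goes through the URL-based API; the module qualification and the "kernels.metallib" path are assumptions for illustration.

```julia
using Metal

dev = Metal.device()
# Load a precompiled Metal library from disk (path is hypothetical).
lib = Metal.MTL.MTLLibraryFromFile(dev, "kernels.metallib")
```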
6 changes: 2 additions & 4 deletions src/array.jl
@@ -532,10 +532,8 @@ fill(v::T, dims...; storage=DefaultStorageMode) where T = fill!(MtlArray{T,lengt

# optimized implementation of `fill!` for types that are directly supported by fillbuffer
function Base.fill!(A::MtlArray{T}, val) where T <: Union{UInt8,Int8}
if length(A) > 0
B = convert(T, val)
unsafe_fill!(device(A), pointer(A), B, length(A))
end
B = convert(T, val)
unsafe_fill!(device(A), pointer(A), B, length(A))
A
end

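A quick usage sketch of the byte-wide `fill!` fast path touched above; the sizes are arbitrary.

```julia
using Metal

a = MtlArray{UInt8}(undef, 16)
fill!(a, 0x01)               # dispatches to the unsafe_fill! fast path for byte types
b = Metal.fill(Int8(3), 16)  # the `fill` wrapper defined above this method
Array(a) == fill(0x01, 16) && Array(b) == fill(Int8(3), 16)
```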
5 changes: 1 addition & 4 deletions src/compiler/reflection.jl
@@ -40,9 +40,6 @@ end
if !job.config.kernel
error("Can only generate AGX code for kernel functions")
end
if macos_version() < v"13"
error("Native code reflection is only supported on OSX 13 or higher")
end

# compile the kernel
compiled = compile(job)
@@ -99,7 +96,7 @@ function extract_gpu_code(f, binary)
arch = findfirst(fat_handle) do arch
arch.header isa MachO.MachOHeader64 && GPUMachineType(arch.header.cputype) == AppleGPU
end
arch == nothing && error("Could not find GPU architecture in universal binary")
arch === nothing && error("Could not find GPU architecture in universal binary")

# the GPU binary contains several sections...
## ... extract the compute section, which is another Mach-O binary
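On the `== nothing` fix above, a brief generic illustration of why identity comparison is the idiomatic `nothing` check in Julia (unrelated to any Metal-specific API).

```julia
# `==` dispatches on its operands and can be overloaded (or return missing),
# whereas `===` is a plain identity test that always yields a Bool.
arch = findfirst(>(10), [1, 2, 3])   # no match, so arch is nothing
arch === nothing && error("Could not find GPU architecture in universal binary")
```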
32 changes: 6 additions & 26 deletions src/device/intrinsics/memory.jl
@@ -5,32 +5,12 @@ export MtlThreadGroupArray

Create an array local to each threadgroup launched during kernel execution.
"""
MtlThreadGroupArray

@static if Sys.isapple() && macos_version() >= v"13.0"
@inline function MtlThreadGroupArray(::Type{T}, dims) where {T}
len = prod(dims)
# NOTE: this relies on const-prop to forward the literal length to the generator.
# maybe we should include the size in the type, like StaticArrays does?
ptr = emit_threadgroup_memory(T, Val(len))
MtlDeviceArray(dims, ptr)
end
else
# on older macOS, shared memory with small types results in miscompilation (Metal.jl#26),
# so we use an array wrapper extending the element size to the minimum known to work.
# this was fixed in macOS 13 beta 4 (22A5311f).

@inline function MtlThreadGroupArray(::Type{T}, dims) where {T}
len = prod(dims)
if sizeof(T) >= 4
ptr = emit_threadgroup_memory(T, Val(len))
MtlDeviceArray(dims, ptr)
else
ptr = emit_threadgroup_memory(UInt32, Val(len))
arr = MtlDeviceArray(dims, ptr)
MtlLargerDeviceArray{T,ndims(arr),AS.ThreadGroup}(arr)
end
end
@inline function MtlThreadGroupArray(::Type{T}, dims) where {T}
len = prod(dims)
# NOTE: this relies on const-prop to forward the literal length to the generator.
# maybe we should include the size in the type, like StaticArrays does?
ptr = emit_threadgroup_memory(T, Val(len))
MtlDeviceArray(dims, ptr)
end

# get a pointer to threadgroup memory, with known (static) or zero length (dynamic)
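For context, a minimal kernel using the now-unconditional `MtlThreadGroupArray` path; the 32-thread size and kernel name are arbitrary, and the intrinsic names are the ones used elsewhere in this PR.

```julia
using Metal

function tg_copy(out)
    i = thread_position_in_threadgroup_1d()
    shared = MtlThreadGroupArray(Float32, 32)
    shared[i] = Float32(i)
    Metal.threadgroup_barrier(Metal.MemoryFlagThreadGroup)  # qualification assumed
    out[i] = shared[i]
    return
end

out = MtlArray{Float32}(undef, 32)
@metal threads=32 tg_copy(out)
Array(out) == Float32.(1:32)
```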
54 changes: 54 additions & 0 deletions src/gpuarrays.jl
@@ -1,5 +1,59 @@
## GPUArrays interfaces

## execution

struct mtlArrayBackend <: AbstractGPUBackend end

struct mtlKernelContext <: AbstractKernelContext end

@inline function GPUArrays.launch_heuristic(::mtlArrayBackend, f::F, args::Vararg{Any,N};
elements::Int, elements_per_thread::Int) where {F,N}
kernel = @metal launch=false f(mtlKernelContext(), args...)

# The pipeline state automatically computes occupancy stats
threads = min(elements, kernel.pipeline.maxTotalThreadsPerThreadgroup)
blocks = cld(elements, threads)

return (; threads=Int(threads), blocks=Int(blocks))
end

function GPUArrays.gpu_call(::mtlArrayBackend, f, args, threads::Int, groups::Int;
name::Union{String,Nothing})
@metal threads groups name f(mtlKernelContext(), args...)
end


## on-device

# indexing
GPUArrays.blockidx(ctx::mtlKernelContext) = threadgroup_position_in_grid_1d()
GPUArrays.blockdim(ctx::mtlKernelContext) = threads_per_threadgroup_1d()
GPUArrays.threadidx(ctx::mtlKernelContext) = thread_position_in_threadgroup_1d()
GPUArrays.griddim(ctx::mtlKernelContext) = threadgroups_per_grid_1d()
GPUArrays.global_index(ctx::mtlKernelContext) = thread_position_in_grid_1d()
GPUArrays.global_size(ctx::mtlKernelContext) = threads_per_grid_1d()

# memory

@inline function GPUArrays.LocalMemory(::mtlKernelContext, ::Type{T}, ::Val{dims}, ::Val{id}
) where {T, dims, id}
ptr = emit_threadgroup_memory(T, Val(prod(dims)))
MtlDeviceArray(dims, ptr)
end

# synchronization

@inline GPUArrays.synchronize_threads(::mtlKernelContext) =
threadgroup_barrier(MemoryFlagThreadGroup)



#
# Host abstractions
#

GPUArrays.backend(::Type{<:MtlArray}) = mtlArrayBackend()

const GLOBAL_RNGs = Dict{MTLDevice,GPUArrays.RNG}()
function GPUArrays.default_rng(::Type{<:MtlArray})
dev = device()
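To make the new backend's role concrete, a rough sketch of how an array-level operation reaches it; the broadcast is illustrative and the call path is paraphrased in the comments.

```julia
using Metal, GPUArrays

a = MtlArray(ones(Float32, 1024))
b = a .+ 1f0
# GPUArrays picks the backend via `GPUArrays.backend(typeof(a))`, which now
# returns `mtlArrayBackend()`, sizes the launch with `launch_heuristic`, and
# dispatches through `gpu_call`, i.e. roughly:
#   @metal threads groups name f(mtlKernelContext(), args...)
Array(b) == fill(2f0, 1024)
```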
6 changes: 2 additions & 4 deletions src/initialization.jl
@@ -40,10 +40,8 @@ function __init__()
ENV["MTL_DEBUG_LAYER_ERROR_MODE"] = "nslog"
ENV["MTL_DEBUG_LAYER_WARNING_MODE"] = "nslog"

if macos_version() >= v"13"
# enable Metal shader validation
ENV["MTL_SHADER_VALIDATION"] = "4"
end
# enable Metal shader validation
ENV["MTL_SHADER_VALIDATION"] = "4"
end

@autoreleasepool try
12 changes: 6 additions & 6 deletions test/array.jl
@@ -238,8 +238,8 @@ end
@test Array(A) == B
end

let M = Metal.fill(b, (10, 10, 10, 1000))
B = fill(b, (10, 10, 10, 1000))
let M = Metal.fill(b, (10, 10))
B = fill(b, (10, 10))
@test Array(M) == B
end

@@ -249,8 +249,8 @@
end

#Dims already unpacked
let A = Metal.fill(b, 10, 10, 10, 1000)
B = fill(b, 10, 10, 10, 1000)
let A = Metal.fill(b, 10, 1000, 1000)
B = fill(b, 10, 1000, 1000)
@test Array(A) == B
end

@@ -271,7 +271,7 @@
b = rand(T)

# Dims in tuple
let A = MtlArray{T,3}(undef, (10, 10, 10))
let A = MtlArray{T,3}(undef, (10, 1000, 1000))
fill!(A, b)
@test all(Array(A) .== b)
end
@@ -287,7 +287,7 @@
end

# Dims already unpacked
let A = MtlArray{T,3}(undef, 10, 10, 10)
let A = MtlArray{T,4}(undef, 10, 10, 10, 1000)
fill!(A, b)
@test all(Array(A) .== b)
end
8 changes: 4 additions & 4 deletions test/capturing.jl
@@ -26,7 +26,7 @@ manager = MTLCaptureManager()
# Capture Descriptor
desc = MTLCaptureDescriptor()
# Capture Object
@test desc.captureObject == nothing
@test desc.captureObject === nothing
cmdq = global_queue(device())
desc.captureObject = cmdq
@test desc.captureObject == cmdq
@@ -40,19 +40,19 @@ desc.destination = MTL.MTLCaptureDestinationGPUTraceDocument
@test desc.destination == MTL.MTLCaptureDestinationGPUTraceDocument

# Output URL
@test desc.outputURL == nothing
@test desc.outputURL === nothing
path = joinpath(tmpdir, "test.gputrace")
desc.outputURL = NSFileURL(path)
@test desc.outputURL == NSFileURL(path)

# Capture Scope
queue = MTLCommandQueue(device())
default_scope = manager.defaultCaptureScope
@test default_scope == nothing
@test default_scope === nothing
new_scope = MTLCaptureScope(@objc [manager::id{MTLCaptureManager} newCaptureScopeWithCommandQueue:queue::id{MTLCommandQueue}]::id{MTLCaptureScope})
@test new_scope.commandQueue == queue
@test new_scope.device == device()
@test new_scope.label == nothing
@test new_scope.label === nothing
new_label = "Metal.jl capturing test"
new_scope.label = new_label
@test new_scope.label == new_label
12 changes: 3 additions & 9 deletions test/execution.jl
@@ -59,17 +59,13 @@ end
Metal.code_typed(dummy, Tuple{})
Metal.code_warntype(devnull, dummy, Tuple{})
Metal.code_llvm(devnull, dummy, Tuple{})
if Metal.macos_version() >= v"13"
shader_validation || Metal.code_agx(devnull, dummy, Tuple{})
end
shader_validation || Metal.code_agx(devnull, dummy, Tuple{})

@device_code_lowered @metal dummy()
@device_code_typed @metal dummy()
@device_code_warntype io=devnull @metal dummy()
@device_code_llvm io=devnull @metal dummy()
if Metal.macos_version() >= v"13"
shader_validation || @device_code_agx io=devnull @metal dummy()
end
shader_validation || @device_code_agx io=devnull @metal dummy()

mktempdir() do dir
@device_code dir=dir @metal dummy()
@@ -80,9 +76,7 @@ end
# make sure kernel name aliases are preserved in the generated code
@test occursin("dummy", sprint(io->(@device_code_llvm io=io optimize=false @metal dummy())))
@test occursin("dummy", sprint(io->(@device_code_llvm io=io @metal dummy())))
if Metal.macos_version() >= v"13"
shader_validation || @test occursin("dummy", sprint(io->(@device_code_agx io=io @metal dummy())))
end
shader_validation || @test occursin("dummy", sprint(io->(@device_code_agx io=io @metal dummy())))

# make sure invalid kernels can be partially reflected upon
let