[do not merge] Test #464 without KA commit #468

Closed · wants to merge 14 commits
19 changes: 14 additions & 5 deletions .buildkite/pipeline.yml
@@ -38,7 +38,7 @@ steps:
# soft_fail: true

# special tests
- group: ":eyes: Special"
- group: ":floppy_disk: Storage Modes"
depends_on: "julia"
steps:
- label: "{{matrix.storage}} array storage"
@@ -53,10 +53,10 @@
arch: "aarch64"
if: |
build.message =~ /\[only tests\]/ ||
build.message =~ /\[only special\]/ ||
build.message =~ /\[only storage\]/ ||
build.message !~ /\[only/ && !build.pull_request.draft &&
build.message !~ /\[skip tests\]/ &&
build.message !~ /\[skip special\]/
build.message !~ /\[skip storage\]/
timeout_in_minutes: 60
matrix:
setup:
@@ -65,13 +65,20 @@
- "managed"
commands: |
echo -e "[Metal]\ndefault_storage = \"{{matrix.storage}}\"" >LocalPreferences.toml

# special tests
- group: ":eyes: Special"
depends_on: "julia"
steps:
- label: "API validation"
soft_fail: true
plugins:
- JuliaCI/julia#v1:
version: "1.10"
- JuliaCI/julia-test#v1:
test_args: "--quickfail"
# only enabled for select tests due to JuliaGPU/Metal.jl#34
# test_args: "--quickfail"
test_args: ""
# Don't quickfail to see which ones fail
- JuliaCI/julia-coverage#v1:
codecov: true
dirs:
@@ -88,9 +95,11 @@
macos_version: "15.0"
if: |
build.message =~ /\[only tests\]/ ||
build.message =~ /\[only validation\]/ ||
build.message =~ /\[only special\]/ ||
build.message !~ /\[only/ && !build.pull_request.draft &&
build.message !~ /\[skip tests\]/ &&
build.message !~ /\[skip validation\]/ &&
build.message !~ /\[skip special\]/
timeout_in_minutes: 60
- label: "Opaque pointers"
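For reference, a minimal sketch of what the storage-mode step above writes to LocalPreferences.toml; the "shared" value is a hypothetical stand-in for the {{matrix.storage}} substitution.

```julia
# Reproduce the pipeline's `echo` step locally (sketch; "shared" is hypothetical,
# CI substitutes {{matrix.storage}}).
storage = "shared"
open("LocalPreferences.toml", "w") do io
    println(io, "[Metal]")
    println(io, "default_storage = \"$storage\"")
end
print(read("LocalPreferences.toml", String))
```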
2 changes: 1 addition & 1 deletion Project.toml
@@ -40,7 +40,7 @@ BFloat16s = "0.5"
CEnum = "0.4, 0.5"
CodecBzip2 = "0.8"
ExprTools = "0.1"
GPUArrays = "11"
GPUArrays = "10.1"
GPUCompiler = "0.26, 0.27, 1"
KernelAbstractions = "0.9.1"
LLVM = "7.2, 8, 9"
5 changes: 1 addition & 4 deletions lib/mtl/library.jl
@@ -23,13 +23,10 @@ end

function MTLLibraryFromFile(dev::MTLDevice, path::String)
err = Ref{id{NSError}}(nil)
handle = if Metal.macos_version() >= v"13"
handle = let
url = NSFileURL(path)
@objc [dev::id{MTLDevice} newLibraryWithURL:url::id{NSURL}
error:err::Ptr{id{NSError}}]::id{MTLLibrary}
else
@objc [dev::id{MTLDevice} newLibraryWithFile:path::id{NSString}
error:err::Ptr{id{NSError}}]::id{MTLLibrary}
end
err[] == nil || throw(NSError(err[]))

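A small usage sketch of the simplified loader above, which now always goes through the URL-based API; the module qualification and the "kernels.metallib" path are assumptions for illustration.

```julia
using Metal

dev = Metal.device()
# Load a precompiled Metal library from disk (path is hypothetical).
lib = Metal.MTL.MTLLibraryFromFile(dev, "kernels.metallib")
```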
6 changes: 2 additions & 4 deletions src/array.jl
@@ -532,10 +532,8 @@ fill(v::T, dims...; storage=DefaultStorageMode) where T = fill!(MtlArray{T,lengt

# optimized implementation of `fill!` for types that are directly supported by fillbuffer
function Base.fill!(A::MtlArray{T}, val) where T <: Union{UInt8,Int8}
if length(A) > 0
B = convert(T, val)
unsafe_fill!(device(A), pointer(A), B, length(A))
end
B = convert(T, val)
unsafe_fill!(device(A), pointer(A), B, length(A))
A
end

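A quick usage sketch of the byte-wide `fill!` fast path touched above; the sizes are arbitrary.

```julia
using Metal

a = MtlArray{UInt8}(undef, 16)
fill!(a, 0x01)               # dispatches to the unsafe_fill! fast path for byte types
b = Metal.fill(Int8(3), 16)  # the `fill` wrapper defined above this method
Array(a) == fill(0x01, 16) && Array(b) == fill(Int8(3), 16)
```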
5 changes: 1 addition & 4 deletions src/compiler/reflection.jl
@@ -40,9 +40,6 @@ end
if !job.config.kernel
error("Can only generate AGX code for kernel functions")
end
if macos_version() < v"13"
error("Native code reflection is only supported on OSX 13 or higher")
end

# compile the kernel
compiled = compile(job)
@@ -99,7 +96,7 @@ function extract_gpu_code(f, binary)
arch = findfirst(fat_handle) do arch
arch.header isa MachO.MachOHeader64 && GPUMachineType(arch.header.cputype) == AppleGPU
end
arch == nothing && error("Could not find GPU architecture in universal binary")
arch === nothing && error("Could not find GPU architecture in universal binary")

# the GPU binary contains several sections...
## ... extract the compute section, which is another Mach-O binary
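On the `== nothing` fix above, a brief generic illustration of why identity comparison is the idiomatic `nothing` check in Julia (unrelated to any Metal-specific API).

```julia
# `==` dispatches on its operands and can be overloaded (or return missing),
# whereas `===` is a plain identity test that always yields a Bool.
arch = findfirst(>(10), [1, 2, 3])   # no match, so arch is nothing
arch === nothing && error("Could not find GPU architecture in universal binary")
```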
32 changes: 6 additions & 26 deletions src/device/intrinsics/memory.jl
@@ -5,32 +5,12 @@ export MtlThreadGroupArray

Create an array local to each threadgroup launched during kernel execution.
"""
MtlThreadGroupArray

@static if Sys.isapple() && macos_version() >= v"13.0"
@inline function MtlThreadGroupArray(::Type{T}, dims) where {T}
len = prod(dims)
# NOTE: this relies on const-prop to forward the literal length to the generator.
# maybe we should include the size in the type, like StaticArrays does?
ptr = emit_threadgroup_memory(T, Val(len))
MtlDeviceArray(dims, ptr)
end
else
# on older macOS, shared memory with small types results in miscompilation (Metal.jl#26),
# so we use an array wrapper extending the element size to the minimum known to work.
# this was fixed in macOS 13 beta 4 (22A5311f).

@inline function MtlThreadGroupArray(::Type{T}, dims) where {T}
len = prod(dims)
if sizeof(T) >= 4
ptr = emit_threadgroup_memory(T, Val(len))
MtlDeviceArray(dims, ptr)
else
ptr = emit_threadgroup_memory(UInt32, Val(len))
arr = MtlDeviceArray(dims, ptr)
MtlLargerDeviceArray{T,ndims(arr),AS.ThreadGroup}(arr)
end
end
@inline function MtlThreadGroupArray(::Type{T}, dims) where {T}
len = prod(dims)
# NOTE: this relies on const-prop to forward the literal length to the generator.
# maybe we should include the size in the type, like StaticArrays does?
ptr = emit_threadgroup_memory(T, Val(len))
MtlDeviceArray(dims, ptr)
end

# get a pointer to threadgroup memory, with known (static) or zero length (dynamic)
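For context, a minimal kernel using the now-unconditional `MtlThreadGroupArray` path; the 32-thread size and kernel name are arbitrary, and the intrinsic names are the ones used elsewhere in this PR.

```julia
using Metal

function tg_copy(out)
    i = thread_position_in_threadgroup_1d()
    shared = MtlThreadGroupArray(Float32, 32)
    shared[i] = Float32(i)
    Metal.threadgroup_barrier(Metal.MemoryFlagThreadGroup)  # qualification assumed
    out[i] = shared[i]
    return
end

out = MtlArray{Float32}(undef, 32)
@metal threads=32 tg_copy(out)
Array(out) == Float32.(1:32)
```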
54 changes: 54 additions & 0 deletions src/gpuarrays.jl
@@ -1,5 +1,59 @@
## GPUArrays interfaces

## execution

struct mtlArrayBackend <: AbstractGPUBackend end

struct mtlKernelContext <: AbstractKernelContext end

@inline function GPUArrays.launch_heuristic(::mtlArrayBackend, f::F, args::Vararg{Any,N};
elements::Int, elements_per_thread::Int) where {F,N}
kernel = @metal launch=false f(mtlKernelContext(), args...)

# The pipeline state automatically computes occupancy stats
threads = min(elements, kernel.pipeline.maxTotalThreadsPerThreadgroup)
blocks = cld(elements, threads)

return (; threads=Int(threads), blocks=Int(blocks))
end

function GPUArrays.gpu_call(::mtlArrayBackend, f, args, threads::Int, groups::Int;
name::Union{String,Nothing})
@metal threads groups name f(mtlKernelContext(), args...)
end


## on-device

# indexing
GPUArrays.blockidx(ctx::mtlKernelContext) = threadgroup_position_in_grid_1d()
GPUArrays.blockdim(ctx::mtlKernelContext) = threads_per_threadgroup_1d()
GPUArrays.threadidx(ctx::mtlKernelContext) = thread_position_in_threadgroup_1d()
GPUArrays.griddim(ctx::mtlKernelContext) = threadgroups_per_grid_1d()
GPUArrays.global_index(ctx::mtlKernelContext) = thread_position_in_grid_1d()
GPUArrays.global_size(ctx::mtlKernelContext) = threads_per_grid_1d()

# memory

@inline function GPUArrays.LocalMemory(::mtlKernelContext, ::Type{T}, ::Val{dims}, ::Val{id}
) where {T, dims, id}
ptr = emit_threadgroup_memory(T, Val(prod(dims)))
MtlDeviceArray(dims, ptr)
end

# synchronization

@inline GPUArrays.synchronize_threads(::mtlKernelContext) =
threadgroup_barrier(MemoryFlagThreadGroup)



#
# Host abstractions
#

GPUArrays.backend(::Type{<:MtlArray}) = mtlArrayBackend()

const GLOBAL_RNGs = Dict{MTLDevice,GPUArrays.RNG}()
function GPUArrays.default_rng(::Type{<:MtlArray})
dev = device()
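To make the new backend's role concrete, a rough sketch of how an array-level operation reaches it; the broadcast is illustrative and the call path is paraphrased in the comments.

```julia
using Metal, GPUArrays

a = MtlArray(ones(Float32, 1024))
b = a .+ 1f0
# GPUArrays picks the backend via `GPUArrays.backend(typeof(a))`, which now
# returns `mtlArrayBackend()`, sizes the launch with `launch_heuristic`, and
# dispatches through `gpu_call`, i.e. roughly:
#   @metal threads groups name f(mtlKernelContext(), args...)
Array(b) == fill(2f0, 1024)
```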
6 changes: 2 additions & 4 deletions src/initialization.jl
@@ -40,10 +40,8 @@ function __init__()
ENV["MTL_DEBUG_LAYER_ERROR_MODE"] = "nslog"
ENV["MTL_DEBUG_LAYER_WARNING_MODE"] = "nslog"

if macos_version() >= v"13"
# enable Metal shader validation
ENV["MTL_SHADER_VALIDATION"] = "4"
end
# enable Metal shader validation
ENV["MTL_SHADER_VALIDATION"] = "4"
end

@autoreleasepool try
12 changes: 6 additions & 6 deletions test/array.jl
@@ -238,8 +238,8 @@ end
@test Array(A) == B
end

let M = Metal.fill(b, (10, 10, 10, 1000))
B = fill(b, (10, 10, 10, 1000))
let M = Metal.fill(b, (10, 10))
B = fill(b, (10, 10))
@test Array(M) == B
end

@@ -249,8 +249,8 @@
end

#Dims already unpacked
let A = Metal.fill(b, 10, 10, 10, 1000)
B = fill(b, 10, 10, 10, 1000)
let A = Metal.fill(b, 10, 1000, 1000)
B = fill(b, 10, 1000, 1000)
@test Array(A) == B
end

@@ -271,7 +271,7 @@
b = rand(T)

# Dims in tuple
let A = MtlArray{T,3}(undef, (10, 10, 10))
let A = MtlArray{T,3}(undef, (10, 1000, 1000))
fill!(A, b)
@test all(Array(A) .== b)
end
@@ -287,7 +287,7 @@
end

# Dims already unpacked
let A = MtlArray{T,3}(undef, 10, 10, 10)
let A = MtlArray{T,4}(undef, 10, 10, 10, 1000)
fill!(A, b)
@test all(Array(A) .== b)
end
8 changes: 4 additions & 4 deletions test/capturing.jl
@@ -26,7 +26,7 @@ manager = MTLCaptureManager()
# Capture Descriptor
desc = MTLCaptureDescriptor()
# Capture Object
@test desc.captureObject == nothing
@test desc.captureObject === nothing
cmdq = global_queue(device())
desc.captureObject = cmdq
@test desc.captureObject == cmdq
@@ -40,19 +40,19 @@ desc.destination = MTL.MTLCaptureDestinationGPUTraceDocument
@test desc.destination == MTL.MTLCaptureDestinationGPUTraceDocument

# Output URL
@test desc.outputURL == nothing
@test desc.outputURL === nothing
path = joinpath(tmpdir, "test.gputrace")
desc.outputURL = NSFileURL(path)
@test desc.outputURL == NSFileURL(path)

# Capture Scope
queue = MTLCommandQueue(device())
default_scope = manager.defaultCaptureScope
@test default_scope == nothing
@test default_scope === nothing
new_scope = MTLCaptureScope(@objc [manager::id{MTLCaptureManager} newCaptureScopeWithCommandQueue:queue::id{MTLCommandQueue}]::id{MTLCaptureScope})
@test new_scope.commandQueue == queue
@test new_scope.device == device()
@test new_scope.label == nothing
@test new_scope.label === nothing
new_label = "Metal.jl capturing test"
new_scope.label = new_label
@test new_scope.label == new_label
12 changes: 3 additions & 9 deletions test/execution.jl
@@ -59,17 +59,13 @@ end
Metal.code_typed(dummy, Tuple{})
Metal.code_warntype(devnull, dummy, Tuple{})
Metal.code_llvm(devnull, dummy, Tuple{})
if Metal.macos_version() >= v"13"
shader_validation || Metal.code_agx(devnull, dummy, Tuple{})
end
shader_validation || Metal.code_agx(devnull, dummy, Tuple{})

@device_code_lowered @metal dummy()
@device_code_typed @metal dummy()
@device_code_warntype io=devnull @metal dummy()
@device_code_llvm io=devnull @metal dummy()
if Metal.macos_version() >= v"13"
shader_validation || @device_code_agx io=devnull @metal dummy()
end
shader_validation || @device_code_agx io=devnull @metal dummy()

mktempdir() do dir
@device_code dir=dir @metal dummy()
@@ -80,9 +76,7 @@ end
# make sure kernel name aliases are preserved in the generated code
@test occursin("dummy", sprint(io->(@device_code_llvm io=io optimize=false @metal dummy())))
@test occursin("dummy", sprint(io->(@device_code_llvm io=io @metal dummy())))
if Metal.macos_version() >= v"13"
shader_validation || @test occursin("dummy", sprint(io->(@device_code_agx io=io @metal dummy())))
end
shader_validation || @test occursin("dummy", sprint(io->(@device_code_agx io=io @metal dummy())))

# make sure invalid kernels can be partially reflected upon
let