From ac8a03e0a1408e4879473111205251b15f131e6d Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Wed, 27 Mar 2024 11:13:47 -0300
Subject: [PATCH 01/15] Add MPSMatrixRandom

---
 lib/mps/MPS.jl          |   1 +
 lib/mps/matrixrandom.jl | 146 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 147 insertions(+)
 create mode 100644 lib/mps/matrixrandom.jl

diff --git a/lib/mps/MPS.jl b/lib/mps/MPS.jl
index 2bb794187..e33a156b9 100644
--- a/lib/mps/MPS.jl
+++ b/lib/mps/MPS.jl
@@ -28,6 +28,7 @@ include("kernel.jl")
 include("images.jl")
 include("matrix.jl")
 include("vector.jl")
+include("matrixrandom.jl")
 include("decomposition.jl")
 include("copy.jl")
 
diff --git a/lib/mps/matrixrandom.jl b/lib/mps/matrixrandom.jl
new file mode 100644
index 000000000..0d5eb8741
--- /dev/null
+++ b/lib/mps/matrixrandom.jl
@@ -0,0 +1,146 @@
+@cenum MPSMatrixRandomDistribution::UInt begin
+    MPSMatrixRandomDistributionDefault = 1
+    MPSMatrixRandomDistributionUniform = 2
+    MPSMatrixRandomDistributionNormal  = 3
+end
+
+#
+# matrix random descriptor
+#
+
+export MPSMatrixRandomDistributionDescriptor
+
+@objcwrapper immutable=false MPSMatrixRandomDistributionDescriptor <: NSObject
+
+@objcproperties MPSMatrixRandomDistributionDescriptor begin
+    @autoproperty distributionType::MPSMatrixRandomDistribution
+    @autoproperty maximum::Float32 setter=setMaximum
+    @autoproperty mean::Float32 setter=setMean
+    @autoproperty minimum::Float32 setter=setMinimum
+    @autoproperty standardDeviation::Float32 setter=setStandardDeviation
+end
+
+
+function MPSMatrixRandomDefaultDistributionDescriptor()
+    desc = @objc [MPSMatrixRandomDistributionDescriptor defaultDistributionDescriptor]::id{MPSMatrixRandomDistributionDescriptor}
+    obj = MPSMatrixRandomDistributionDescriptor(desc)
+    # XXX: who releases this object?
+    return obj
+end
+
+# Default constructor
+MPSMatrixRandomDistributionDescriptor() = MPSMatrixRandomDefaultDistributionDescriptor()
+
+function MPSMatrixRandomNormalDistributionDescriptor(mean, standardDeviation)
+    desc = @objc [MPSMatrixRandomDistributionDescriptor normalDistributionDescriptorWithMean:mean::Float32
+                                                        standardDeviation:standardDeviation::Float32]::id{MPSMatrixRandomDistributionDescriptor}
+    obj = MPSMatrixRandomDistributionDescriptor(desc)
+    # XXX: who releases this object?
+    return obj
+end
+
+function MPSMatrixRandomNormalDistributionDescriptor(mean, standardDeviation, minimum, maximum)
+    desc = @objc [MPSMatrixRandomDistributionDescriptor normalDistributionDescriptorWithMean:mean::Float32
+                                                        standardDeviation:standardDeviation::Float32
+                                                        minimum:minimum::Float32
+                                                        maximum:maximum::Float32]::id{MPSMatrixRandomDistributionDescriptor}
+    obj = MPSMatrixRandomDistributionDescriptor(desc)
+    # XXX: who releases this object?
+    return obj
+end
+
+function MPSMatrixRandomUniformDistributionDescriptor(minimum, maximum)
+    desc = @objc [MPSMatrixRandomDistributionDescriptor uniformDistributionDescriptorWithMinimum:minimum::Float32
+                                                        maximum:maximum::Float32]::id{MPSMatrixRandomDistributionDescriptor}
+    obj = MPSMatrixRandomDistributionDescriptor(desc)
+    # XXX: who releases this object?
+    return obj
+end
+
+
+@objcwrapper immutable=false MPSMatrixRandom <: MPSKernel
+
+@objcproperties MPSMatrixRandom begin
+    @autoproperty batchSize::NSUInteger
+    @autoproperty batchStart::NSUInteger
+    @autoproperty destinationDataType::MPSDataType  # enum value, passed by value like in the initializers
+    @autoproperty distributionType::MPSMatrixRandomDistribution  # enum value, not a descriptor object
+end
+
+function encode!(cmdbuf::MTLCommandBuffer, kernel::K, destinationMatrix::MPSMatrix) where {K<:MPSMatrixRandom}
+    @objc [kernel::id{K} encodeToCommandBuffer:cmdbuf::id{MTLCommandBuffer}
+                         destinationMatrix:destinationMatrix::id{MPSMatrix}]::Nothing
+end
+function encode!(cmdbuf::MTLCommandBuffer, kernel::K, destinationVector::MPSVector) where {K<:MPSMatrixRandom}
+    @objc [kernel::id{K} encodeToCommandBuffer:cmdbuf::id{MTLCommandBuffer}
+                         destinationVector:destinationVector::id{MPSVector}]::Nothing
+end
+
+@objcwrapper immutable=false MPSMatrixRandomMTGP32 <: MPSMatrixRandom
+@objcwrapper immutable=false MPSMatrixRandomPhilox <: MPSMatrixRandom
+
+for R in [:MPSMatrixRandomMTGP32, :MPSMatrixRandomPhilox]
+    @eval begin
+        function $R(device)
+            kernel = @objc [$R alloc]::id{$R}
+            obj = $R(kernel)
+            finalizer(release, obj)
+            @objc [obj::id{$R} initWithDevice:device::id{MTLDevice}]::id{$R}
+            return obj
+        end
+        function $R(device, destinationDataType, seed)
+            kernel = @objc [$R alloc]::id{$R}
+            obj = $R(kernel)
+            finalizer(release, obj)
+            @objc [obj::id{$R} initWithDevice:device::id{MTLDevice}
+                                destinationDataType:destinationDataType::MPSDataType
+                                seed:seed::NSUInteger]::id{$R}
+            return obj
+        end
+        function $R(device, destinationDataType, seed, distributionDescriptor)
+            kernel = @objc [$R alloc]::id{$R}
+            obj = $R(kernel)
+            finalizer(release, obj)
+            @objc [obj::id{$R} initWithDevice:device::id{MTLDevice}
+                                destinationDataType:destinationDataType::MPSDataType
+                                seed:seed::NSUInteger
+                                distributionDescriptor:distributionDescriptor::id{MPSMatrixRandomDistributionDescriptor}]::id{$R}
+            return obj
+        end
+    end
+end
+
+synchronizeStateOnCommandBuffer(kern::MPSMatrixRandomMTGP32, cmdbuf::MTLCommandBuffer) =
+    @objc [obj::id{MPSMatrixRandomMTGP32} synchronizeStateOnCommandBuffer:cmdbuf::id{MTLCommandBuffer}]::Nothing
+
+
+
+@inline function _mpsmat_rand!(randkern::MPSMatrixRandom, dest::MtlArray{T}, ::Type{T2};
+                        queue::MTLCommandQueue = global_queue(current_device()),
+                        async::Bool=false) where {T,T2}
+    byteoffset = dest.offset * sizeof(T)
+    (byteoffset % 4 == 0) || error(lazy"Destination buffer offset ($(byteoffset)) must be a multiple of 4.")
+
+    srcbytes = sizeof(dest)
+
+    cmdbuf = if srcbytes % 16 == 0 && dest.offset == 0
+        MTLCommandBuffer(queue) do cmdbuf
+            vecDesc = MPSVectorDescriptor(srcbytes ÷ sizeof(T2), T2)
+            mpsdest = MPSVector(dest, vecDesc)
+            encode!(cmdbuf, randkern, mpsdest)
+        end
+    else
+        MTLCommandBuffer(queue) do cmdbuf
+            len = UInt(ceil(srcbytes / sizeof(T2)) * 4)
+            vecDesc = MPSVectorDescriptor(len, T2)
+            tempVec = MPSTemporaryVector(cmdbuf, vecDesc)
+            encode!(cmdbuf, randkern, tempVec)
+            MTLBlitCommandEncoder(cmdbuf) do enc
+                MTL.append_copy!(enc, dest.data[], byteoffset, tempVec.data, tempVec.offset, srcbytes)
+            end
+        end
+    end
+
+    async || wait_completed(cmdbuf)
+    return
+end

From f7bac7b04b67278a7fe38b30c44d204faa890a2c Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Tue, 16 Apr 2024 16:22:28 -0300
Subject: [PATCH 02/15] Support rand! and rand using MPS where appropriate

---
 docs/src/usage/array.md |  44 +++++++
 lib/mps/MPS.jl          |   1 +
 lib/mps/random.jl       | 109 ++++++++++++++++
 src/random.jl           |  61 +++++++--
 test/random.jl          | 282 +++++++++++++++++++++++++++++++++++-----
 5 files changed, 454 insertions(+), 43 deletions(-)
 create mode 100644 lib/mps/random.jl

diff --git a/docs/src/usage/array.md b/docs/src/usage/array.md
index 42e27db7b..0cf57f47a 100644
--- a/docs/src/usage/array.md
+++ b/docs/src/usage/array.md
@@ -3,6 +3,11 @@
 ```@meta
 DocTestSetup = quote
     using Metal
+
+    import Random
+    Random.seed!(0)
+
+    Metal.seed!(0)
 end
 ```
 
@@ -106,3 +111,42 @@ julia> Base.mapreducedim!(identity, +, b, a)
 1×1 MtlMatrix{Float32, Metal.PrivateStorage}:
  6.0
 ```
+
+## Random numbers
+
+Base's convenience functions for generating random numbers are available in Metal as well:
+
+```jldoctest
+julia> Metal.rand(2)
+2-element MtlVector{Float32, Private}:
+ 0.39904642
+ 0.8805201
+
+julia> Metal.randn(Float32, 2, 1)
+2×1 MtlMatrix{Float32, Private}:
+ -0.18797699
+ -0.006818078
+```
+
+Behind the scenes, these random numbers come from two different generators: one backed by
+[Metal Performance Shaders](https://developer.apple.com/documentation/metalperformanceshaders/mpsmatrixrandom?language=objc),
+another by using the GPUArrays.jl random methods. Operations on these generators are implemented using methods from the Random
+standard library:
+
+```jldoctest
+julia> using Random, GPUArrays
+
+julia> a = Random.rand(MPS.default_rng(), Float32, 1)
+1-element MtlVector{Float32, Private}:
+ 0.39904642
+
+julia> a = Random.rand!(GPUArrays.default_rng(MtlArray), a)
+1-element MtlVector{Float32, Private}:
+ 0.13394515
+```
+
+!!! note
+    `MPSMatrixRandom` functionality requires Metal.jl > v1.1
+
+!!! warning
+    Do not use `Random.rand!(::MPS.RNG, args...)` or `Random.randn!(::MPS.RNG, args...)` on views as you will most likely overwrite values outside of the view due to limitations in random number generation in the Metal Performance Shaders framework.
diff --git a/lib/mps/MPS.jl b/lib/mps/MPS.jl
index e33a156b9..7266eae9a 100644
--- a/lib/mps/MPS.jl
+++ b/lib/mps/MPS.jl
@@ -33,6 +33,7 @@ include("decomposition.jl")
 include("copy.jl")
 
 # integrations
+include("random.jl")
 include("linalg.jl")
 
 end
diff --git a/lib/mps/random.jl b/lib/mps/random.jl
new file mode 100644
index 000000000..e5b4c987d
--- /dev/null
+++ b/lib/mps/random.jl
@@ -0,0 +1,109 @@
+using Random
+using Metal: DefaultStorageMode
+
+"""
+    MPS.RNG()
+
+A random number generator backed by Metal Performance Shaders `MPSMatrixRandomPhilox` kernels.
+"""
+mutable struct RNG <: AbstractRNG
+    device::MTLDevice
+    uniformInteger::MPSMatrixRandomPhilox  # kernel used by `rand!` for integer element types
+    uniformFloat32::MPSMatrixRandomPhilox  # kernel used by `rand!` for Float32 arrays
+    normalFloat32::MPSMatrixRandomPhilox  # kernel used by `randn!` for Float32 arrays
+end
+
+
+make_seed() = Base.rand(RandomDevice(), UInt)
+
+function RNG(device::MTLDevice, seed::Integer)
+    seed = seed%UInt
+    RNG(device,
+        MPSMatrixRandomPhilox(device, UInt32, seed, MPSMatrixRandomDefaultDistributionDescriptor()),
+        MPSMatrixRandomPhilox(device, Float32, seed, MPSMatrixRandomUniformDistributionDescriptor(0, 1)),
+        MPSMatrixRandomPhilox(device, Float32, seed, MPSMatrixRandomNormalDistributionDescriptor(0, 1)),)
+end
+@autoreleasepool RNG(seed::Integer) = RNG(current_device(), seed)
+RNG(device::MTLDevice) = RNG(device, make_seed())
+
+@autoreleasepool RNG() = RNG(current_device(), make_seed())
+
+Base.copy(rng::RNG) = RNG(copy(rng.device), copy(rng.uniformInteger), copy(rng.uniformFloat32), copy(rng.normalFloat32))
+
+@autoreleasepool function Random.seed!(rng::RNG, seed::Integer)  # `seed % UInt` mirrors `RNG(device, seed)`
+    rng.uniformInteger = MPSMatrixRandomPhilox(rng.device, UInt32, seed % UInt, MPSMatrixRandomDefaultDistributionDescriptor())
+    rng.uniformFloat32 = MPSMatrixRandomPhilox(rng.device, Float32, seed % UInt, MPSMatrixRandomUniformDistributionDescriptor(0, 1))
+    rng.normalFloat32  = MPSMatrixRandomPhilox(rng.device, Float32, seed % UInt, MPSMatrixRandomNormalDistributionDescriptor(0, 1))
+    return rng
+end
+
+Random.seed!(rng::RNG) = Random.seed!(rng, make_seed())
+
+const GLOBAL_RNGs = Dict{MTLDevice,MPS.RNG}()
+@autoreleasepool function default_rng()
+    dev = current_device()
+    get!(GLOBAL_RNGs, dev) do
+        RNG(dev)
+    end
+end
+
+const UniformTypes = [Float32,UInt8,Int8,UInt16,Int16,UInt32,Int32,UInt64,Int64]
+const UniformType = Union{[Type{T} for T in UniformTypes]...}
+const UniformArray = MtlArray{<:Union{Float32,UInt8,Int8,UInt16,Int16,UInt32,Int32,UInt64,Int64}}
+@autoreleasepool function Random.rand!(rng::RNG, A::MtlArray{T}) where {T<:Union{UInt8,Int8,UInt16,Int16,UInt32,Int32,UInt64,Int64}}
+    isempty(A) && return A
+    _mpsmat_rand!(rng.uniformInteger, A, UInt32)
+    return A
+end
+
+@autoreleasepool function Random.rand!(rng::RNG, A::MtlArray{Float32})
+    isempty(A) && return A
+    _mpsmat_rand!(rng.uniformFloat32, A, Float32)
+    return A
+end
+
+const NormalType = Type{Float32}
+const NormalArray = MtlArray{<:Float32}
+@autoreleasepool function Random.randn!(rng::RNG, A::MtlArray{Float32})
+    isempty(A) && return A
+    _mpsmat_rand!(rng.normalFloat32, A, Float32)
+    return A
+end
+
+# CPU arrays
+function Random.rand!(rng::RNG, A::AbstractArray{T,N}) where {T <: Union{UniformTypes...}, N}
+    isempty(A) && return A
+    B = MtlArray{T,N,Shared}(undef, size(A))
+    rand!(rng, B)
+    copyto!(A, unsafe_wrap(Array{T},B))
+    return A
+end
+function Random.randn!(rng::RNG, A::AbstractArray{T,N}) where {T <: Float32, N}
+    isempty(A) && return A
+    B = MtlArray{T,N,Shared}(undef, size(A))
+    randn!(rng, B)
+    copyto!(A, unsafe_wrap(Array{T},B))
+    return A
+end
+
+# Out of place
+Random.rand(rng::RNG, T::UniformType, dims::Dims; storage=DefaultStorageMode) =
+    Random.rand!(rng, MtlArray{T,length(dims),storage}(undef, dims...))
+Random.randn(rng::RNG, T::NormalType, dims::Dims; storage=DefaultStorageMode) =
+    Random.randn!(rng, MtlArray{T,length(dims),storage}(undef, dims...))
+
+# support all dimension specifications
+Random.rand(rng::RNG, T::UniformType, dim1::Integer, dims::Integer...; storage=DefaultStorageMode) =
+    Random.rand!(rng, MtlArray{T,length(dims) + 1,storage}(undef, dim1, dims...))
+Random.randn(rng::RNG, T::NormalType, dim1::Integer, dims::Integer...; storage=DefaultStorageMode) =
+    Random.randn!(rng, MtlArray{T,length(dims) + 1,storage}(undef, dim1, dims...))
+
+# untyped out-of-place
+Random.rand(rng::RNG, dim1::Integer, dims::Integer...; storage=DefaultStorageMode) =
+    Random.rand!(rng, MtlArray{Float32,length(dims) + 1,storage}(undef, dim1, dims...))
+Random.randn(rng::RNG, dim1::Integer, dims::Integer...; storage=DefaultStorageMode) =
+    Random.randn!(rng, MtlArray{Float32,length(dims) + 1,storage}(undef, dim1, dims...))
+
+# scalars
+Random.rand(rng::RNG, T::UniformType=Float32; storage=Shared) = rand(rng, T, 1; storage)[]
+Random.randn(rng::RNG, T::NormalType=Float32; storage=Shared) = randn(rng, T, 1; storage)[]
diff --git a/src/random.jl b/src/random.jl
index 81cc48c00..bc6458253 100644
--- a/src/random.jl
+++ b/src/random.jl
@@ -1,24 +1,69 @@
 using Random
+using ..MPS: MPSVector, _mpsmat_rand!, MPSMatrixRandomUniformDistributionDescriptor,
+             MPSMatrixRandomNormalDistributionDescriptor
 
 gpuarrays_rng() = GPUArrays.default_rng(MtlArray)
+mpsrand_rng() = MPS.default_rng()
 
 # GPUArrays in-place
 Random.rand!(A::MtlArray) = Random.rand!(gpuarrays_rng(), A)
 Random.randn!(A::MtlArray) = Random.randn!(gpuarrays_rng(), A)
 
+@inline function can_use_mpsrandom(A::MtlArray{T}) where {T}
+    return A.offset * sizeof(T) % 4 == 0 && sizeof(A) % 4 == 0
+end
+
+# Use MPS random functionality where possible
+function Random.rand!(A::MPS.UniformArray)
+    if can_use_mpsrandom(A)
+        @inline Random.rand!(mpsrand_rng(), A)
+    else
+        @inline Random.rand!(gpuarrays_rng(), A)
+    end
+    return A
+end
+function Random.randn!(A::MPS.NormalArray)
+    if can_use_mpsrandom(A)
+        @inline Random.randn!(mpsrand_rng(), A)
+    else
+        @inline Random.randn!(gpuarrays_rng(), A)
+    end
+    return A
+end
+
 # GPUArrays out-of-place
-rand(T::Type, dims::Dims; storage=DefaultStorageMode) = Random.rand!(MtlArray{T,length(dims),storage}(undef, dims...))
-randn(T::Type, dims::Dims; storage=DefaultStorageMode, kwargs...) = Random.randn!(MtlArray{T,length(dims),storage}(undef, dims...); kwargs...)
+rand(T::MPS.UniformType, dims::Dims; storage=DefaultStorageMode) =
+    Random.rand!(mpsrand_rng(), MtlArray{T,length(dims),storage}(undef, dims...))
+randn(T::MPS.NormalType, dims::Dims; storage=DefaultStorageMode) =
+    Random.randn!(mpsrand_rng(), MtlArray{T,length(dims),storage}(undef, dims...))
+rand(T::Type, dims::Dims; storage=DefaultStorageMode) =
+    Random.rand!(gpuarrays_rng(), MtlArray{T,length(dims),storage}(undef, dims...))
+randn(T::Type, dims::Dims; storage=DefaultStorageMode) =
+    Random.randn!(gpuarrays_rng(), MtlArray{T,length(dims),storage}(undef, dims...))
 
 # support all dimension specifications
+rand(T::MPS.UniformType, dim1::Integer, dims::Integer...; storage=DefaultStorageMode) =
+    Random.rand!(mpsrand_rng(), MtlArray{T,length(dims) + 1,storage}(undef, dim1, dims...))
+randn(T::MPS.NormalType, dim1::Integer, dims::Integer...; storage=DefaultStorageMode) =
+    Random.randn!(mpsrand_rng(), MtlArray{T,length(dims) + 1,storage}(undef, dim1, dims...))
+
 rand(T::Type, dim1::Integer, dims::Integer...; storage=DefaultStorageMode) =
-    Random.rand!(MtlArray{T,length(dims)+1,storage}(undef, dim1, dims...))
-randn(T::Type, dim1::Integer, dims::Integer...; storage=DefaultStorageMode, kwargs...) =
-    Random.randn!(MtlArray{T,length(dims)+1,storage}(undef, dim1, dims...); kwargs...)
+    Random.rand!(gpuarrays_rng(), MtlArray{T,length(dims) + 1,storage}(undef, dim1, dims...))
+randn(T::Type, dim1::Integer, dims::Integer...; storage=DefaultStorageMode) =
+    Random.randn!(gpuarrays_rng(), MtlArray{T,length(dims) + 1,storage}(undef, dim1, dims...))
 
 # untyped out-of-place
-rand(dim1::Integer, dims::Integer...; storage=DefaultStorageMode) = Random.rand!(MtlArray{Float32,length(dims)+1,storage}(undef, dim1, dims...))
-randn(dim1::Integer, dims::Integer...; storage=DefaultStorageMode, kwargs...) = Random.randn!(MtlArray{Float32,length(dims)+1,storage}(undef, dim1, dims...); kwargs...)
+rand(dim1::Integer, dims::Integer...; storage=DefaultStorageMode) =
+    Random.rand!(mpsrand_rng(), MtlArray{Float32,length(dims) + 1,storage}(undef, dim1, dims...))
+randn(dim1::Integer, dims::Integer...; storage=DefaultStorageMode) =
+    Random.randn!(mpsrand_rng(), MtlArray{Float32,length(dims) + 1,storage}(undef, dim1, dims...))
+
+# scalars
+rand(T::Type=Float32; storage=Shared) = rand(T, 1; storage)[]
+randn(T::Type=Float32; storage=Shared) = randn(T, 1; storage)[]
 
 # seeding
-seed!(seed=Base.rand(UInt64)) = Random.seed!(gpuarrays_rng(), seed)
+function seed!(seed=Base.rand(UInt64))
+    Random.seed!(gpuarrays_rng(), seed)
+    Random.seed!(mpsrand_rng(), seed)
+end
diff --git a/test/random.jl b/test/random.jl
index 89c771bca..a8f3c3186 100644
--- a/test/random.jl
+++ b/test/random.jl
@@ -1,39 +1,251 @@
 using Random
 
-@testset "rand" begin
-
-# in-place
-for (f,T) in ((rand!,Float16),
-              (rand!,Float32),
-              (randn!,Float16),
-              (randn!,Float32)),
-    d in (2, (2,2), (2,2,2), 3, (3,3), (3,3,3))
-    A = MtlArray{T}(undef, d)
-    fill!(A, T(0))
-    f(A)
-    @test !iszero(collect(A))
-end
-
-# out-of-place, with implicit type
-for (f,T) in ((Metal.rand,Float32), (Metal.randn,Float32)),
-    args in ((2,), (2, 2), (3,), (3, 3))
-    A = f(args...)
-    @test eltype(A) == T
-end
-
-# out-of-place, with type specified
-for (f,T) in ((Metal.rand,Float32), (Metal.randn,Float32),
-              (rand,Float32), (randn,Float32)),
-    args in ((T, 2), (T, 2, 2), (T, (2, 2)), (T, 3), (T, 3, 3), (T, (3, 3)))
-    A = f(args...)
-    @test eltype(A) == T
-end
-
-## seeding
-Metal.seed!(1)
-a = Metal.rand(Int32, 1)
-Metal.seed!(1)
-b = Metal.rand(Int32, 1)
-@test iszero(collect(a) - collect(b))
+const RAND_TYPES = [Float16, Float32, Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64,
+                    UInt64]
+const RANDN_TYPES = [Float16, Float32]
+const INPLACE_TUPLES = [[(rand!, T) for T in RAND_TYPES];
+                        [(randn!, T) for T in RANDN_TYPES]]
+const OOPLACE_TUPLES = [[(Metal.rand, rand, T) for T in RAND_TYPES];
+                        [(Metal.randn, randn, T) for T in RANDN_TYPES]]
 
+@testset "random" begin
+    # in-place
+    @testset "in-place" begin
+        rng = Metal.MPS.RNG()
+
+        # Seed the default generators to work around value of 0 being
+        #  randomly generated in the size 1 Int8 Array test in 1.11
+        Metal.seed!(123)
+
+        @testset "$f with $T" for (f, T) in INPLACE_TUPLES
+            @testset "$d" for d in (1, 3, (3, 3), (3, 3, 3), 16, (16, 16), (16, 16, 16), (1000,), (1000,1000))
+                A = MtlArray{T}(undef, d)
+
+                # default_rng
+                fill!(A, T(0))
+                f(A)
+                @test !iszero(collect(A))
+
+                # specified MPS rng
+                if T != Float16
+                    fill!(A, T(0))
+                    f(rng, A)
+                    @test !iszero(collect(A))
+                end
+            end
+
+            @testset "0" begin
+                A = MtlArray{T}(undef, 0)
+
+                # default_rng
+                f(A)
+                @test A isa MtlArray{T,1}
+                @test Array(A) == fill(1, 0)
+
+                # specified MPS rng
+                if T != Float16
+                    fill!(A, T(0))
+                    f(rng, A)
+                    @test Array(A) == fill(1, 0)
+                end
+            end
+        end
+    end
+
+    # in-place contiguous views
+    @testset "in-place for views" begin
+        @testset "$f with $T" for (f, T) in INPLACE_TUPLES
+            alen = 100
+            A = MtlArray{T}(undef, alen)
+            function test_view!(X::MtlArray{T}, idx) where {T}
+                fill!(X, T(0))
+                view_X = @view X[idx]
+                f(view_X)
+                cpuX = collect(X)
+                not_zero_in_view = !iszero(cpuX[idx])
+                rest_of_array_untouched = iszero(cpuX[1:alen .∉ Ref(idx)])
+                return not_zero_in_view, rest_of_array_untouched
+            end
+
+            # Test when view offset is 0 and buffer size not multiple of 4
+            @testset "Off == 0, buf % 4 != 0" begin
+                not_zero_in_view, rest_of_array_untouched = test_view!(A, 1:51)
+                @test not_zero_in_view
+                @test rest_of_array_untouched
+            end
+
+            # Test when view offset is 0 and buffer size is multiple of 16
+            @testset "Off == 0, buf % 16 == 0" begin
+                not_zero_in_view, rest_of_array_untouched = test_view!(A, 1:32)
+                @test not_zero_in_view
+                @test rest_of_array_untouched
+            end
+
+            # Test when view offset is 0 and buffer size is multiple of 4
+            @testset "Off == 0, buf % 4 == 0" begin
+                not_zero_in_view, rest_of_array_untouched = test_view!(A, 1:36)
+                @test not_zero_in_view
+                @test rest_of_array_untouched
+            end
+
+            # Test when view offset is not 0 nor multiple of 4 and buffer size not multiple of 16
+            @testset "Off != 0, buf % 4 != 0" begin
+                not_zero_in_view, rest_of_array_untouched = test_view!(A, 3:51)
+                @test not_zero_in_view
+                @test rest_of_array_untouched
+            end
+
+            # Test when view offset is multiple of 4 and buffer size not multiple of 4
+            @testset "Off % 4 == 0, buf % 4 != 0" begin
+                not_zero_in_view, rest_of_array_untouched = test_view!(A, 17:51)
+                @test not_zero_in_view
+                @test rest_of_array_untouched
+            end
+
+            # Test when view offset is multiple of 4 and buffer size multiple of 16
+            @testset "Off % 4 == 0, buf % 16 == 0" begin
+                not_zero_in_view, rest_of_array_untouched = test_view!(A, 9:40)
+                @test not_zero_in_view
+                @test rest_of_array_untouched
+            end
+
+            # Test when view offset is multiple of 4 and buffer size multiple of 4
+            @testset "Off % 16 == 0, buf % 4 == 0" begin
+                not_zero_in_view, rest_of_array_untouched = test_view!(A, 9:32)
+                @test not_zero_in_view
+                @test rest_of_array_untouched
+            end
+        end
+
+        # Test when views try to use rand!(rng, args..)
+        @testset "MPS.RNG with views" begin
+            rng = Metal.MPS.RNG()
+            @testset "$f with $T" for (f, T) in ((randn!, Float32),(rand!, Int64),(rand!, Float32), (rand!, UInt16), (rand!,Int8))
+                A = MtlArray{T}(undef, 100)
+
+                ## Offset > 0
+                fill!(A, T(0))
+                idx = 4:51
+                view_A = @view A[idx]
+
+                # Errors in Julia before crashing whole process
+                if view_A.offset * sizeof(T) % 4 != 0
+                    @test_throws "Destination buffer offset ($(view_A.offset*sizeof(T)))" f(rng, view_A)
+                else
+                    f(rng, view_A)
+
+                    cpuA = collect(A)
+                    @test !iszero(cpuA[idx])
+
+                    @test iszero(cpuA[1:100 .∉ Ref(idx)]) broken=(sizeof(view_A) % 4 != 0)
+                end
+
+                ## Offset == 0
+                fill!(A, T(0))
+                idx = 1:51
+                view_A = @view A[idx]
+                f(rng, view_A)
+
+                cpuA = collect(A)
+                @test !iszero(cpuA[idx])
+
+                # XXX: Why are the 8-bit and 16-bit type tests not broken?
+                @test iszero(cpuA[1:100 .∉ Ref(idx)])# broken=(sizeof(view_A) % 4 != 0)
+            end
+        end
+    end
+    # out-of-place
+    @testset "out-of-place" begin
+        @testset "$fr with implicit type" for (fm, fr, T) in
+                                             ((Metal.rand, rand, Float32), (Metal.randn, randn, Float32))
+            rng = Metal.MPS.RNG()
+            @testset "args" for args in ((0,), (1,), (3,), (3, 3), (16,), (16, 16), (1000,), (1000,1000))
+                # default_rng
+                A = fm(args...)
+                @test eltype(A) == T
+
+                # specified MPS rng
+                B = fr(rng, args...)
+                @test eltype(B) == T
+            end
+
+            @testset "scalar" begin
+                a = fm()
+                @test typeof(a) == T
+                b = fr(rng)
+                @test typeof(b) == T
+            end
+        end
+
+        # out-of-place, with type specified
+        @testset "$fr with $T" for (fm, fr, T) in OOPLACE_TUPLES
+            rng = Metal.MPS.RNG()
+            @testset "$args" for args in ((T, 0),
+                                          (T, 1),
+                                          (T, 3),
+                                          (T, 3, 3),
+                                          (T, (3, 3)),
+                                          (T, 16),
+                                          (T, 16, 16),
+                                          (T, (16, 16)),
+                                          (T, 1000),
+                                          (T, 1000, 1000),)
+                # default_rng
+                A = fm(args...)
+                @test eltype(A) == T
+
+                # specified MPS rng
+                if T != Float16
+                    B = fr(rng, args...)
+                    @test eltype(B) == T
+                end
+            end
+
+            @testset "scalar" begin
+                a = fm(T)
+                @test typeof(a) == T
+                b = fr(rng, T)
+                @test typeof(b) == T
+            end
+        end
+    end
+
+    ## CPU Arrays with MPS rng
+    @testset "CPU Arrays" begin
+        MPS_TUPLES = filter(INPLACE_TUPLES) do tup
+            tup[2] != Float16
+        end
+        rng = Metal.MPS.RNG()
+        @testset "$f with $T" for (f, T) in MPS_TUPLES
+
+            @testset "$d" for d in (1, 3, (3, 3), (3, 3, 3), 16, (16, 16), (16, 16, 16), (1000,), (1000,1000))
+                A = zeros(T, d)
+                f(rng, A)
+                @test !iszero(collect(A))
+            end
+
+            @testset "0" begin
+                A = rand(T, 0)
+                b = rand(T)
+                fill!(A, b)
+                @test A isa Array{T,1}
+                @test Array(A) == fill(b, 0)
+            end
+        end
+    end
+
+    ## seeding
+    @testset "Seeding $L" for (f,T,L) in [(Metal.rand,UInt32,"Uniform Integers MPS"),
+                                          (Metal.rand,Float32,"Uniform Float32 MPS"),
+                                          (Metal.randn,Float32,"Normal Float32 MPS"),
+                                          (Metal.randn,Float16,"Float16 GPUArrays")]
+        @testset "$d" for d in (1, 3, (3, 3, 3), 16, (16, 16), (16, 16, 16), (1000,), (1000,1000))
+            Metal.seed!(1)
+            a = f(T, d)
+            Metal.seed!(1)
+            b = f(T, d)
+            # TODO: Remove once https://github.com/JuliaGPU/Metal.jl/issues/331 is fixed
+            @test iszero(collect(a) - collect(b)) broken = (T == Float16 && d == (1000,1000))
+        end
+    end
 end # testset

From e129b39c60b9ef66378dd0562ac49aab524cfe25 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Tue, 21 May 2024 13:27:03 -0300
Subject: [PATCH 03/15] Comply with Metal documentation by preventing copies
 between buffer sizes that are not divisible by 4

---
 docs/src/usage/array.md |  2 +-
 lib/mps/matrixrandom.jl | 17 +++++-----
 lib/mps/random.jl       |  4 +--
 src/random.jl           | 44 +++++++++++++-------------
 test/random.jl          | 69 +++++++++++++++++++++++------------------
 5 files changed, 74 insertions(+), 62 deletions(-)

diff --git a/docs/src/usage/array.md b/docs/src/usage/array.md
index 0cf57f47a..dc1a44e10 100644
--- a/docs/src/usage/array.md
+++ b/docs/src/usage/array.md
@@ -149,4 +149,4 @@ julia> a = Random.rand!(GPUArrays.default_rng(MtlArray), a)
     `MPSMatrixRandom` functionality requires Metal.jl > v1.1
 
 !!! warning
-    Do not use `Random.rand!(::MPS.RNG, args...)` or `Random.randn!(::MPS.RNG, args...)` on views as you will most likely overwrite values outside of the view due to limitations in random number generation in the Metal Performance Shaders framework.
+    `Random.rand!(::MPS.RNG, args...)` and `Random.randn!(::MPS.RNG, args...)` have a framework limitation that requires the byte offset and byte size of the destination array to be a multiple of 4.
diff --git a/lib/mps/matrixrandom.jl b/lib/mps/matrixrandom.jl
index 0d5eb8741..6c624021b 100644
--- a/lib/mps/matrixrandom.jl
+++ b/lib/mps/matrixrandom.jl
@@ -114,29 +114,32 @@ synchronizeStateOnCommandBuffer(kern::MPSMatrixRandomMTGP32, cmdbuf::MTLCommandB
     @objc [obj::id{MPSMatrixRandomMTGP32} synchronizeStateOnCommandBuffer:cmdbuf::id{MTLCommandBuffer}]::Nothing
 
 
-
 @inline function _mpsmat_rand!(randkern::MPSMatrixRandom, dest::MtlArray{T}, ::Type{T2};
                         queue::MTLCommandQueue = global_queue(current_device()),
                         async::Bool=false) where {T,T2}
     byteoffset = dest.offset * sizeof(T)
-    (byteoffset % 4 == 0) || error(lazy"Destination buffer offset ($(byteoffset)) must be a multiple of 4.")
+    bytesize = sizeof(dest)
 
-    srcbytes = sizeof(dest)
+    # Even though `append_copy!` seems to work with any size or offset values, the documentation at
+    # https://developer.apple.com/documentation/metal/mtlblitcommandencoder/1400767-copyfrombuffer?language=objc
+    # mentions that both must be multiples of 4 bytes on macOS, so error when they are not
+    (bytesize % 4 == 0) || error(lazy"Destination buffer bytesize ($(bytesize)) must be a multiple of 4.")
+    (byteoffset % 4 == 0) || error(lazy"Destination buffer offset ($(byteoffset)) must be a multiple of 4.")
 
-    cmdbuf = if srcbytes % 16 == 0 && dest.offset == 0
+    cmdbuf = if bytesize % 16 == 0 && dest.offset == 0
         MTLCommandBuffer(queue) do cmdbuf
-            vecDesc = MPSVectorDescriptor(srcbytes ÷ sizeof(T2), T2)
+            vecDesc = MPSVectorDescriptor(bytesize ÷ sizeof(T2), T2)
             mpsdest = MPSVector(dest, vecDesc)
             encode!(cmdbuf, randkern, mpsdest)
         end
     else
         MTLCommandBuffer(queue) do cmdbuf
-            len = UInt(ceil(srcbytes / sizeof(T2)) * 4)
+            len = UInt(ceil(bytesize / sizeof(T2)) * 4)
             vecDesc = MPSVectorDescriptor(len, T2)
             tempVec = MPSTemporaryVector(cmdbuf, vecDesc)
             encode!(cmdbuf, randkern, tempVec)
             MTLBlitCommandEncoder(cmdbuf) do enc
-                MTL.append_copy!(enc, dest.data[], byteoffset, tempVec.data, tempVec.offset, srcbytes)
+                MTL.append_copy!(enc, dest.data[], byteoffset, tempVec.data, tempVec.offset, bytesize)
             end
         end
     end
diff --git a/lib/mps/random.jl b/lib/mps/random.jl
index e5b4c987d..7287a1471 100644
--- a/lib/mps/random.jl
+++ b/lib/mps/random.jl
@@ -105,5 +105,5 @@ Random.randn(rng::RNG, dim1::Integer, dims::Integer...; storage=DefaultStorageMo
     Random.randn!(rng, MtlArray{Float32,length(dims) + 1,storage}(undef, dim1, dims...))
 
 # scalars
-Random.rand(rng::RNG, T::UniformType=Float32; storage=Shared) = rand(rng, T, 1; storage)[]
-Random.randn(rng::RNG, T::NormalType=Float32; storage=Shared) = randn(rng, T, 1; storage)[]
+Random.rand(rng::RNG, T::UniformType=Float32; storage=Shared) = rand(rng, T, 4; storage)[1]
+Random.randn(rng::RNG, T::NormalType=Float32; storage=Shared) = randn(rng, T, 4; storage)[1]
diff --git a/src/random.jl b/src/random.jl
index bc6458253..25783dcc7 100644
--- a/src/random.jl
+++ b/src/random.jl
@@ -15,37 +15,37 @@ end
 
 # Use MPS random functionality where possible
 function Random.rand!(A::MPS.UniformArray)
-    if can_use_mpsrandom(A)
-        @inline Random.rand!(mpsrand_rng(), A)
-    else
-        @inline Random.rand!(gpuarrays_rng(), A)
-    end
-    return A
+    rng = can_use_mpsrandom(A) ? mpsrand_rng() : gpuarrays_rng()
+    return Random.rand!(rng, A)
 end
 function Random.randn!(A::MPS.NormalArray)
-    if can_use_mpsrandom(A)
-        @inline Random.randn!(mpsrand_rng(), A)
-    else
-        @inline Random.randn!(gpuarrays_rng(), A)
-    end
-    return A
+    rng = can_use_mpsrandom(A) ? mpsrand_rng() : gpuarrays_rng()
+    return Random.randn!(rng, A)
 end
 
 # GPUArrays out-of-place
-rand(T::MPS.UniformType, dims::Dims; storage=DefaultStorageMode) =
-    Random.rand!(mpsrand_rng(), MtlArray{T,length(dims),storage}(undef, dims...))
-randn(T::MPS.NormalType, dims::Dims; storage=DefaultStorageMode) =
-    Random.randn!(mpsrand_rng(), MtlArray{T,length(dims),storage}(undef, dims...))
+function rand(T::MPS.UniformType, dims::Dims; storage=DefaultStorageMode)
+    rng =  prod(dims) * sizeof(T) % 4 == 0 ? mpsrand_rng() : gpuarrays_rng()
+    return Random.rand!(rng, MtlArray{T,length(dims),storage}(undef, dims...))
+end
+function randn(T::MPS.NormalType, dims::Dims; storage=DefaultStorageMode)
+    rng =  prod(dims) * sizeof(T) % 4 == 0 ? mpsrand_rng() : gpuarrays_rng()
+    return Random.randn!(rng, MtlArray{T,length(dims),storage}(undef, dims...))
+end
 rand(T::Type, dims::Dims; storage=DefaultStorageMode) =
     Random.rand!(gpuarrays_rng(), MtlArray{T,length(dims),storage}(undef, dims...))
 randn(T::Type, dims::Dims; storage=DefaultStorageMode) =
     Random.randn!(gpuarrays_rng(), MtlArray{T,length(dims),storage}(undef, dims...))
 
 # support all dimension specifications
-rand(T::MPS.UniformType, dim1::Integer, dims::Integer...; storage=DefaultStorageMode) =
-    Random.rand!(mpsrand_rng(), MtlArray{T,length(dims) + 1,storage}(undef, dim1, dims...))
-randn(T::MPS.NormalType, dim1::Integer, dims::Integer...; storage=DefaultStorageMode) =
-    Random.randn!(mpsrand_rng(), MtlArray{T,length(dims) + 1,storage}(undef, dim1, dims...))
+function rand(T::MPS.UniformType, dim1::Integer, dims::Integer...; storage=DefaultStorageMode)
+    rng = (dim1 * prod(dims) * sizeof(T)) % 4 == 0 ? mpsrand_rng() : gpuarrays_rng()
+    return Random.rand!(rng, MtlArray{T,length(dims) + 1,storage}(undef, dim1, dims...))
+end
+function randn(T::MPS.NormalType, dim1::Integer, dims::Integer...; storage=DefaultStorageMode)
+    rng = (dim1 * prod(dims) * sizeof(T)) % 4 == 0 ? mpsrand_rng() : gpuarrays_rng()
+    return Random.randn!(rng, MtlArray{T,length(dims) + 1,storage}(undef, dim1, dims...))
+end
 
 rand(T::Type, dim1::Integer, dims::Integer...; storage=DefaultStorageMode) =
     Random.rand!(gpuarrays_rng(), MtlArray{T,length(dims) + 1,storage}(undef, dim1, dims...))
@@ -59,8 +59,8 @@ randn(dim1::Integer, dims::Integer...; storage=DefaultStorageMode) =
     Random.randn!(mpsrand_rng(), MtlArray{Float32,length(dims) + 1,storage}(undef, dim1, dims...))
 
 # scalars
-rand(T::Type=Float32; storage=Shared) = rand(T, 1; storage)[]
-randn(T::Type=Float32; storage=Shared) = randn(T, 1; storage)[]
+rand(T::Type=Float32; storage=Shared) = rand(T, 4; storage)[1]
+randn(T::Type=Float32; storage=Shared) = randn(T, 4; storage)[1]
 
 # seeding
 function seed!(seed=Base.rand(UInt64))
diff --git a/test/random.jl b/test/random.jl
index a8f3c3186..f0e94edef 100644
--- a/test/random.jl
+++ b/test/random.jl
@@ -1,4 +1,5 @@
 using Random
+using Metal: can_use_mpsrandom
 
 const RAND_TYPES = [Float16, Float32, Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64,
                     UInt64]
@@ -29,8 +30,12 @@ const OOPLACE_TUPLES = [[(Metal.rand, rand, T) for T in RAND_TYPES];
                 # specified MPS rng
                 if T != Float16
                     fill!(A, T(0))
-                    f(rng, A)
-                    @test !iszero(collect(A))
+                    if can_use_mpsrandom(A)
+                        f(rng, A)
+                        @test !iszero(collect(A))
+                    else
+                        @test_throws "Destination buffer" f(rng, A)
+                    end
                 end
             end
 
@@ -45,8 +50,12 @@ const OOPLACE_TUPLES = [[(Metal.rand, rand, T) for T in RAND_TYPES];
                 # specified MPS rng
                 if T != Float16
                     fill!(A, T(0))
-                    f(rng, A)
-                    @test Array(A) == fill(1, 0)
+                    if can_use_mpsrandom(A)
+                        f(rng, A)
+                        @test Array(A) == fill(1, 0)
+                    else
+                        @test_throws "Destination buffer" f(rng, A)
+                    end
                 end
             end
         end
@@ -125,32 +134,33 @@ const OOPLACE_TUPLES = [[(Metal.rand, rand, T) for T in RAND_TYPES];
 
                 ## Offset > 0
                 fill!(A, T(0))
-                idx = 4:51
+                idx = 4:50
                 view_A = @view A[idx]
 
                 # Errors in Julia before crashing whole process
-                if view_A.offset * sizeof(T) % 4 != 0
-                    @test_throws "Destination buffer offset ($(view_A.offset*sizeof(T)))" f(rng, view_A)
-                else
+                if can_use_mpsrandom(view_A)
                     f(rng, view_A)
 
                     cpuA = collect(A)
                     @test !iszero(cpuA[idx])
-
                     @test iszero(cpuA[1:100 .∉ Ref(idx)]) broken=(sizeof(view_A) % 4 != 0)
+                else
+                    @test_throws "Destination buffer" f(rng, view_A)
                 end
 
                 ## Offset == 0
                 fill!(A, T(0))
                 idx = 1:51
                 view_A = @view A[idx]
-                f(rng, view_A)
-
-                cpuA = collect(A)
-                @test !iszero(cpuA[idx])
+                if can_use_mpsrandom(view_A)
+                    f(rng, view_A)
 
-                # XXX: Why are the 8-bit and 16-bit type tests not broken?
-                @test iszero(cpuA[1:100 .∉ Ref(idx)])# broken=(sizeof(view_A) % 4 != 0)
+                    cpuA = collect(A)
+                    @test !iszero(cpuA[idx])
+                    @test iszero(cpuA[1:100 .∉ Ref(idx)])
+                else
+                    @test_throws "Destination buffer" f(rng, view_A)
+                end
             end
         end
     end
@@ -196,8 +206,12 @@ const OOPLACE_TUPLES = [[(Metal.rand, rand, T) for T in RAND_TYPES];
 
                 # specified MPS rng
                 if T != Float16
-                    B = fr(rng, args...)
-                    @test eltype(B) == T
+                    if length(zeros(args...)) * sizeof(T) % 4 == 0
+                        B = fr(rng, args...)
+                        @test eltype(B) == T
+                    else
+                        @test_throws "Destination buffer" fr(rng, args...)
+                    end
                 end
             end
 
@@ -212,24 +226,19 @@ const OOPLACE_TUPLES = [[(Metal.rand, rand, T) for T in RAND_TYPES];
 
     ## CPU Arrays with MPS rng
     @testset "CPU Arrays" begin
-        MPS_TUPLES = filter(INPLACE_TUPLES) do tup
+        mps_tuples = filter(INPLACE_TUPLES) do tup
             tup[2] != Float16
         end
         rng = Metal.MPS.RNG()
-        @testset "$f with $T" for (f, T) in MPS_TUPLES
-
+        @testset "$f with $T" for (f, T) in mps_tuples
             @testset "$d" for d in (1, 3, (3, 3), (3, 3, 3), 16, (16, 16), (16, 16, 16), (1000,), (1000,1000))
                 A = zeros(T, d)
-                f(rng, A)
-                @test !iszero(collect(A))
-            end
-
-            @testset "0" begin
-                A = rand(T, 0)
-                b = rand(T)
-                fill!(A, b)
-                @test A isa Array{T,1}
-                @test Array(A) == fill(b, 0)
+                if (prod(d) * sizeof(T)) % 4 == 0
+                    f(rng, A)
+                    @test !iszero(collect(A))
+                else
+                    @test_throws "Destination buffer" f(rng, A)
+                end
             end
         end
     end

From 29b56a4a726ea6c433ca0896a2b17c9a5ee4ddee Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Fri, 24 May 2024 12:55:18 -0300
Subject: [PATCH 04/15] Address review comments

---
 docs/src/usage/array.md | 2 +-
 lib/mps/matrixrandom.jl | 5 +----
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/docs/src/usage/array.md b/docs/src/usage/array.md
index dc1a44e10..40b53966e 100644
--- a/docs/src/usage/array.md
+++ b/docs/src/usage/array.md
@@ -149,4 +149,4 @@ julia> a = Random.rand!(GPUArrays.default_rng(MtlArray), a)
     `MPSMatrixRandom` functionality requires Metal.jl > v1.1
 
 !!! warning
-    `Random.rand!(::MPS.RNG, args...)` andc `Random.randn!(::MPS.RNG, args...)` have a framework limitation that requires the byte offset and byte size of the destination array to be a multiple of 4.
+    `Random.rand!(::MPS.RNG, args...)` and `Random.randn!(::MPS.RNG, args...)` have a framework limitation that requires the byte offset and byte size of the destination array to be a multiple of 4.
diff --git a/lib/mps/matrixrandom.jl b/lib/mps/matrixrandom.jl
index 6c624021b..10366c26b 100644
--- a/lib/mps/matrixrandom.jl
+++ b/lib/mps/matrixrandom.jl
@@ -24,7 +24,7 @@ end
 function MPSMatrixRandomDefaultDistributionDescriptor()
     desc = @objc [MPSMatrixRandomDistributionDescriptor defaultDistributionDescriptor]::id{MPSMatrixRandomDistributionDescriptor}
     obj = MPSMatrixRandomDistributionDescriptor(desc)
-    # XXX: who releases this object?
+    finalizer(release, obj)
     return obj
 end
 
@@ -35,7 +35,6 @@ function MPSMatrixRandomNormalDistributionDescriptor(mean, standardDeviation)
     desc = @objc [MPSMatrixRandomDistributionDescriptor normalDistributionDescriptorWithMean:mean::Float32
                                                         standardDeviation:standardDeviation::Float32]::id{MPSMatrixRandomDistributionDescriptor}
     obj = MPSMatrixRandomDistributionDescriptor(desc)
-    # XXX: who releases this object?
     return obj
 end
 
@@ -45,7 +44,6 @@ function MPSMatrixRandomNormalDistributionDescriptor(mean, standardDeviation, mi
                                                         minimum:minimum::Float32
                                                         maximum:maximum::Float32]::id{MPSMatrixRandomDistributionDescriptor}
     obj = MPSMatrixRandomDistributionDescriptor(desc)
-    # XXX: who releases this object?
     return obj
 end
 
@@ -53,7 +51,6 @@ function MPSMatrixRandomUniformDistributionDescriptor(minimum, maximum)
     desc = @objc [MPSMatrixRandomDistributionDescriptor uniformDistributionDescriptorWithMinimum:minimum::Float32
                                                         maximum:maximum::Float32]::id{MPSMatrixRandomDistributionDescriptor}
     obj = MPSMatrixRandomDistributionDescriptor(desc)
-    # XXX: who releases this object?
     return obj
 end
 

From 423569784eaf65ceb385849b7fadc91314c20689 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Fri, 24 May 2024 13:13:28 -0300
Subject: [PATCH 05/15] Update compat note

---
 docs/src/usage/array.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/usage/array.md b/docs/src/usage/array.md
index 40b53966e..74e0e4c34 100644
--- a/docs/src/usage/array.md
+++ b/docs/src/usage/array.md
@@ -146,7 +146,7 @@ julia> a = Random.rand!(GPUArrays.default_rng(MtlArray), a)
 ```
 
 !!! note
-    `MPSMatrixRandom` functionality requires Metal.jl > v1.1
+    `MPSMatrixRandom` functionality requires Metal.jl >= v1.2
 
 !!! warning
     `Random.rand!(::MPS.RNG, args...)` and `Random.randn!(::MPS.RNG, args...)` have a framework limitation that requires the byte offset and byte size of the destination array to be a multiple of 4.

From 2595914143215c4faac65dd7953737879110ca78 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Thu, 30 May 2024 10:34:12 -0300
Subject: [PATCH 06/15] Remove reliance on GPU random functionality in copy
 tests.

---
 test/mps/copy.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/mps/copy.jl b/test/mps/copy.jl
index ac20f533e..0a7bef801 100644
--- a/test/mps/copy.jl
+++ b/test/mps/copy.jl
@@ -33,7 +33,8 @@ end
     Ts = Ts[.!(Ts .<: IGNORE_UNION)]
     @testset "$T" for T in Ts
         for dim in ((16,16), (10,500), (500,10), (256,512))
-            srcMat = Metal.rand(T, dim)
+
+            srcMat = MtlArray(rand(T, dim))
 
             dstMat = copytest(srcMat, false, false)
             @test dstMat == srcMat

From c4771382d26cf675c93b167269e05563a201b8f4 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Thu, 6 Jun 2024 15:39:23 -0300
Subject: [PATCH 07/15] Fix segmentation fault

---
 lib/mps/matrixrandom.jl | 1 -
 test/random.jl          | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/mps/matrixrandom.jl b/lib/mps/matrixrandom.jl
index 10366c26b..da41a705f 100644
--- a/lib/mps/matrixrandom.jl
+++ b/lib/mps/matrixrandom.jl
@@ -24,7 +24,6 @@ end
 function MPSMatrixRandomDefaultDistributionDescriptor()
     desc = @objc [MPSMatrixRandomDistributionDescriptor defaultDistributionDescriptor]::id{MPSMatrixRandomDistributionDescriptor}
     obj = MPSMatrixRandomDistributionDescriptor(desc)
-    finalizer(release, obj)
     return obj
 end
 
diff --git a/test/random.jl b/test/random.jl
index f0e94edef..40aab582a 100644
--- a/test/random.jl
+++ b/test/random.jl
@@ -1,4 +1,5 @@
 using Random
+using Metal
 using Metal: can_use_mpsrandom
 
 const RAND_TYPES = [Float16, Float32, Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64,

From 65067a0939dfe560451b61ab2e807e3b83c684da Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Fri, 7 Jun 2024 13:11:33 -0300
Subject: [PATCH 08/15] Fix doctests

---
 docs/src/usage/array.md | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/docs/src/usage/array.md b/docs/src/usage/array.md
index 74e0e4c34..2b13322b4 100644
--- a/docs/src/usage/array.md
+++ b/docs/src/usage/array.md
@@ -3,11 +3,12 @@
 ```@meta
 DocTestSetup = quote
     using Metal
+    using GPUArrays
 
     import Random
-    Random.seed!(0)
+    Random.seed!(1)
 
-    Metal.seed!(0)
+    Metal.seed!(1)
 end
 ```
 
@@ -119,13 +120,13 @@ Base's convenience functions for generating random numbers are available in Meta
 ```jldoctest
 julia> Metal.rand(2)
 2-element MtlVector{Float32, Private}:
- 0.39904642
- 0.8805201
+ 0.89025915
+ 0.8946847
 
 julia> Metal.randn(Float32, 2, 1)
 2×1 MtlMatrix{Float32, Private}:
- -0.18797699
- -0.006818078
+ 1.2279074
+ 1.2518331
 ```
 
 Behind the scenes, these random numbers come from two different generators: one backed by
@@ -138,11 +139,11 @@ julia> using Random, GPUArrays
 
 julia> a = Random.rand(MPS.default_rng(), Float32, 1)
 1-element MtlVector{Float32, Private}:
- 0.39904642
+ 0.89025915
 
 julia> a = Random.rand!(GPUArrays.default_rng(MtlArray), a)
 1-element MtlVector{Float32, Private}:
- 0.13394515
+ 0.0705002
 ```
 
 !!! note

From c952493677a81d82729f6e81437394902a8a19fc Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sat, 13 Jul 2024 18:18:37 -0300
Subject: [PATCH 09/15] Docstring

---
 docs/src/usage/array.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/usage/array.md b/docs/src/usage/array.md
index 2b13322b4..27c865c12 100644
--- a/docs/src/usage/array.md
+++ b/docs/src/usage/array.md
@@ -147,7 +147,7 @@ julia> a = Random.rand!(GPUArrays.default_rng(MtlArray), a)
 ```
 
 !!! note
-    `MPSMatrixRandom` functionality requires Metal.jl >= v1.2
+    `MPSMatrixRandom` functionality requires Metal.jl >= v1.3
 
 !!! warning
     `Random.rand!(::MPS.RNG, args...)` and `Random.randn!(::MPS.RNG, args...)` have a framework limitation that requires the byte offset and byte size of the destination array to be a multiple of 4.

From 47cbf7e7ea28e4a7d4d61ddfb055f09ffa88473f Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Tue, 16 Jul 2024 16:00:33 -0300
Subject: [PATCH 10/15] `current_device()` -> `device()`

---
 lib/mps/matrixrandom.jl | 2 +-
 lib/mps/random.jl       | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/mps/matrixrandom.jl b/lib/mps/matrixrandom.jl
index da41a705f..bd0e602bd 100644
--- a/lib/mps/matrixrandom.jl
+++ b/lib/mps/matrixrandom.jl
@@ -111,7 +111,7 @@ synchronizeStateOnCommandBuffer(kern::MPSMatrixRandomMTGP32, cmdbuf::MTLCommandB
 
 
 @inline function _mpsmat_rand!(randkern::MPSMatrixRandom, dest::MtlArray{T}, ::Type{T2};
-                        queue::MTLCommandQueue = global_queue(current_device()),
+                        queue::MTLCommandQueue = global_queue(device()),
                         async::Bool=false) where {T,T2}
     byteoffset = dest.offset * sizeof(T)
     bytesize = sizeof(dest)
diff --git a/lib/mps/random.jl b/lib/mps/random.jl
index 7287a1471..7c7982bbe 100644
--- a/lib/mps/random.jl
+++ b/lib/mps/random.jl
@@ -23,10 +23,10 @@ function RNG(device::MTLDevice, seed::Integer)
         MPSMatrixRandomPhilox(device, Float32, seed, MPSMatrixRandomUniformDistributionDescriptor(0, 1)),
         MPSMatrixRandomPhilox(device, Float32, seed, MPSMatrixRandomNormalDistributionDescriptor(0, 1)),)
 end
-@autoreleasepool RNG(seed::Integer) = RNG(current_device(), seed)
+@autoreleasepool RNG(seed::Integer) = RNG(device(), seed)
 RNG(device::MTLDevice) = RNG(device, make_seed())
 
-@autoreleasepool RNG() = RNG(current_device(), make_seed())
+@autoreleasepool RNG() = RNG(device(), make_seed())
 
 Base.copy(rng::RNG) = RNG(copy(rng.device), copy(rng.uniformInteger), copy(rng.uniformFloat32), copy(rng.normalFloat32))
 
@@ -41,7 +41,7 @@ Random.seed!(rng::RNG) = Random.seed!(rng, make_seed())
 
 const GLOBAL_RNGs = Dict{MTLDevice,MPS.RNG}()
 @autoreleasepool function default_rng()
-    dev = current_device()
+    dev = device()
     get!(GLOBAL_RNGs, dev) do
         RNG(dev)
     end

From 0220ee5d441bce6b213466158e958597e504eb6b Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Mon, 22 Jul 2024 12:45:36 -0300
Subject: [PATCH 11/15] Adapt to storage changes

---
 lib/mps/random.jl | 8 ++++----
 src/random.jl     | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/lib/mps/random.jl b/lib/mps/random.jl
index 7c7982bbe..81ce58585 100644
--- a/lib/mps/random.jl
+++ b/lib/mps/random.jl
@@ -73,14 +73,14 @@ end
 # CPU arrays
 function Random.rand!(rng::RNG, A::AbstractArray{T,N}) where {T <: Union{UniformTypes...}, N}
     isempty(A) && return A
-    B = MtlArray{T,N,Shared}(undef, size(A))
+    B = MtlArray{T,N,SharedStorage}(undef, size(A))
     rand!(rng, B)
     copyto!(A, unsafe_wrap(Array{T},B))
     return A
 end
 function Random.randn!(rng::RNG, A::AbstractArray{T,N}) where {T <: Float32, N}
     isempty(A) && return A
-    B = MtlArray{T,N,Shared}(undef, size(A))
+    B = MtlArray{T,N,SharedStorage}(undef, size(A))
     randn!(rng, B)
     copyto!(A, unsafe_wrap(Array{T},B))
     return A
@@ -105,5 +105,5 @@ Random.randn(rng::RNG, dim1::Integer, dims::Integer...; storage=DefaultStorageMo
     Random.randn!(rng, MtlArray{Float32,length(dims) + 1,storage}(undef, dim1, dims...))
 
 # scalars
-Random.rand(rng::RNG, T::UniformType=Float32; storage=Shared) = rand(rng, T, 4; storage)[1]
-Random.randn(rng::RNG, T::NormalType=Float32; storage=Shared) = randn(rng, T, 4; storage)[1]
+Random.rand(rng::RNG, T::UniformType=Float32; storage=SharedStorage) = rand(rng, T, 4; storage)[1]
+Random.randn(rng::RNG, T::NormalType=Float32; storage=SharedStorage) = randn(rng, T, 4; storage)[1]
diff --git a/src/random.jl b/src/random.jl
index 25783dcc7..acb456766 100644
--- a/src/random.jl
+++ b/src/random.jl
@@ -59,8 +59,8 @@ randn(dim1::Integer, dims::Integer...; storage=DefaultStorageMode) =
     Random.randn!(mpsrand_rng(), MtlArray{Float32,length(dims) + 1,storage}(undef, dim1, dims...))
 
 # scalars
-rand(T::Type=Float32; storage=Shared) = rand(T, 4; storage)[1]
-randn(T::Type=Float32; storage=Shared) = randn(T, 4; storage)[1]
+rand(T::Type=Float32; storage=SharedStorage) = rand(T, 4; storage)[1]
+randn(T::Type=Float32; storage=SharedStorage) = randn(T, 4; storage)[1]
 
 # seeding
 function seed!(seed=Base.rand(UInt64))

From 05f4752cfcf39eac4a8331f646ff415ff3ee9751 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Wed, 24 Jul 2024 09:08:13 -0300
Subject: [PATCH 12/15] Update docs and clean up tests

---
 docs/src/usage/array.md | 10 +++++-----
 test/random.jl          | 12 ++++--------
 2 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/docs/src/usage/array.md b/docs/src/usage/array.md
index 27c865c12..bd1a9832e 100644
--- a/docs/src/usage/array.md
+++ b/docs/src/usage/array.md
@@ -119,12 +119,12 @@ Base's convenience functions for generating random numbers are available in Meta
 
 ```jldoctest
 julia> Metal.rand(2)
-2-element MtlVector{Float32, Private}:
+2-element MtlVector{Float32, Metal.PrivateStorage}:
  0.89025915
  0.8946847
 
 julia> Metal.randn(Float32, 2, 1)
-2×1 MtlMatrix{Float32, Private}:
+2×1 MtlMatrix{Float32, Metal.PrivateStorage}:
  1.2279074
  1.2518331
 ```
@@ -138,16 +138,16 @@ standard library:
 julia> using Random, GPUArrays
 
 julia> a = Random.rand(MPS.default_rng(), Float32, 1)
-1-element MtlVector{Float32, Private}:
+1-element MtlVector{Float32, Metal.PrivateStorage}:
  0.89025915
 
 julia> a = Random.rand!(GPUArrays.default_rng(MtlArray), a)
-1-element MtlVector{Float32, Private}:
+1-element MtlVector{Float32, Metal.PrivateStorage}:
  0.0705002
 ```
 
 !!! note
-    `MPSMatrixRandom` functionality requires Metal.jl >= v1.3
+    `MPSMatrixRandom` functionality requires Metal.jl >= v2.0
 
 !!! warning
     `Random.rand!(::MPS.RNG, args...)` and `Random.randn!(::MPS.RNG, args...)` have a framework limitation that requires the byte offset and byte size of the destination array to be a multiple of 4.
diff --git a/test/random.jl b/test/random.jl
index 40aab582a..8889de8ad 100644
--- a/test/random.jl
+++ b/test/random.jl
@@ -1,7 +1,3 @@
-using Random
-using Metal
-using Metal: can_use_mpsrandom
-
 const RAND_TYPES = [Float16, Float32, Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64,
                     UInt64]
 const RANDN_TYPES = [Float16, Float32]
@@ -31,7 +27,7 @@ const OOPLACE_TUPLES = [[(Metal.rand, rand, T) for T in RAND_TYPES];
                 # specified MPS rng
                 if T != Float16
                     fill!(A, T(0))
-                    if can_use_mpsrandom(A)
+                    if Metal.can_use_mpsrandom(A)
                         f(rng, A)
                         @test !iszero(collect(A))
                     else
@@ -51,7 +47,7 @@ const OOPLACE_TUPLES = [[(Metal.rand, rand, T) for T in RAND_TYPES];
                 # specified MPS rng
                 if T != Float16
                     fill!(A, T(0))
-                    if can_use_mpsrandom(A)
+                    if Metal.can_use_mpsrandom(A)
                         f(rng, A)
                         @test Array(A) == fill(1, 0)
                     else
@@ -139,7 +135,7 @@ const OOPLACE_TUPLES = [[(Metal.rand, rand, T) for T in RAND_TYPES];
                 view_A = @view A[idx]
 
                 # Errors in Julia before crashing whole process
-                if can_use_mpsrandom(view_A)
+                if Metal.can_use_mpsrandom(view_A)
                     f(rng, view_A)
 
                     cpuA = collect(A)
@@ -153,7 +149,7 @@ const OOPLACE_TUPLES = [[(Metal.rand, rand, T) for T in RAND_TYPES];
                 fill!(A, T(0))
                 idx = 1:51
                 view_A = @view A[idx]
-                if can_use_mpsrandom(view_A)
+                if Metal.can_use_mpsrandom(view_A)
                     f(rng, view_A)
 
                     cpuA = collect(A)

From 69f73ce2843f75c7a2f82b7a727ce20c6ad2b2fc Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 27 Aug 2024 11:20:30 +0200
Subject: [PATCH 13/15] NFC.

---
 lib/mps/matrixrandom.jl | 2 +-
 test/mps/copy.jl        | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/mps/matrixrandom.jl b/lib/mps/matrixrandom.jl
index bd0e602bd..f166159de 100644
--- a/lib/mps/matrixrandom.jl
+++ b/lib/mps/matrixrandom.jl
@@ -106,7 +106,7 @@ for R in [:MPSMatrixRandomMTGP32, :MPSMatrixRandomPhilox]
     end
 end
 
-synchronizeStateOnCommandBuffer(kern::MPSMatrixRandomMTGP32, cmdbuf::MTLCommandBuffer) =
+synchronize_state(kern::MPSMatrixRandomMTGP32, cmdbuf::MTLCommandBuffer) =
     @objc [obj::id{MPSMatrixRandomMTGP32} synchronizeStateOnCommandBuffer:cmdbuf::id{MTLCommandBuffer}]::Nothing
 
 
diff --git a/test/mps/copy.jl b/test/mps/copy.jl
index 0a7bef801..3c3f2ea15 100644
--- a/test/mps/copy.jl
+++ b/test/mps/copy.jl
@@ -33,7 +33,6 @@ end
     Ts = Ts[.!(Ts .<: IGNORE_UNION)]
     @testset "$T" for T in Ts
         for dim in ((16,16), (10,500), (500,10), (256,512))
-
             srcMat = MtlArray(rand(T, dim))
 
             dstMat = copytest(srcMat, false, false)

From 994cb08716bb1def48d1e2257ea695edc83b32ef Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Tue, 27 Aug 2024 14:44:23 -0300
Subject: [PATCH 14/15] Address review comments

---
 test/random.jl | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/test/random.jl b/test/random.jl
index 8889de8ad..608f03b08 100644
--- a/test/random.jl
+++ b/test/random.jl
@@ -11,10 +11,6 @@ const OOPLACE_TUPLES = [[(Metal.rand, rand, T) for T in RAND_TYPES];
     @testset "in-place" begin
         rng = Metal.MPS.RNG()
 
-        # Seed the default generators to work around value of 0 being
-        #  randomly generated in the size 1 Int8 Array test in 1.11
-        Metal.seed!(123)
-
         @testset "$f with $T" for (f, T) in INPLACE_TUPLES
             @testset "$d" for d in (1, 3, (3, 3), (3, 3, 3), 16, (16, 16), (16, 16, 16), (1000,), (1000,1000))
                 A = MtlArray{T}(undef, d)
@@ -250,8 +246,8 @@ const OOPLACE_TUPLES = [[(Metal.rand, rand, T) for T in RAND_TYPES];
             a = f(T, d)
             Metal.seed!(1)
             b = f(T, d)
-            # TODO: Remove once https://github.com/JuliaGPU/Metal.jl/issues/331 is fixed
-            @test iszero(collect(a) - collect(b)) broken = (T == Float16 && d == (1000,1000))
+            # TODO: Remove broken parameter once https://github.com/JuliaGPU/GPUArrays.jl/issues/530 is fixed
+            @test Array(a) == Array(b) broken = (T == Float16 && d == (1000,1000))
         end
     end
 end # testset

From b729034f4518c0b27346284be1e1f0c7812512e6 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Tue, 27 Aug 2024 16:54:02 -0300
Subject: [PATCH 15/15] Correct version in docstring (and trigger CI)

---
 docs/src/usage/array.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/usage/array.md b/docs/src/usage/array.md
index bd1a9832e..0a121df75 100644
--- a/docs/src/usage/array.md
+++ b/docs/src/usage/array.md
@@ -147,7 +147,7 @@ julia> a = Random.rand!(GPUArrays.default_rng(MtlArray), a)
 ```
 
 !!! note
-    `MPSMatrixRandom` functionality requires Metal.jl >= v2.0
+    `MPSMatrixRandom` functionality requires Metal.jl >= v1.4
 
 !!! warning
     `Random.rand!(::MPS.RNG, args...)` and `Random.randn!(::MPS.RNG, args...)` have a framework limitation that requires the byte offset and byte size of the destination array to be a multiple of 4.