diff --git a/src/device/intrinsics/math.jl b/src/device/intrinsics/math.jl
index e7544c1f..125f41bf 100644
--- a/src/device/intrinsics/math.jl
+++ b/src/device/intrinsics/math.jl
@@ -418,7 +418,7 @@ end
     j = fma(1.442695f0, a, 12582912.0f0)
     j = j - 12582912.0f0
     i = unsafe_trunc(Int32, j)
-    f = fma(j, -6.93145752f-1, a) # log_2_hi 
+    f = fma(j, -6.93145752f-1, a) # log_2_hi
     f = fma(j, -1.42860677f-6, f) # log_2_lo
 
     # approximate r = exp(f)-1 on interval [-log(2)/2, +log(2)/2]
@@ -460,4 +460,4 @@ end
     end
 
     return r
-end
\ No newline at end of file
+end
diff --git a/src/device/intrinsics/simd.jl b/src/device/intrinsics/simd.jl
index e8815797..8d83e92b 100644
--- a/src/device/intrinsics/simd.jl
+++ b/src/device/intrinsics/simd.jl
@@ -7,7 +7,7 @@ function convert_origin(origin::NTuple{2, Int64})
     return (VecElement{Int64}(origin[1]-1), VecElement{Int64}(origin[2]-1))
 end
 
-for (jltype, suffix) in ((:Float16, "f16"), (:Float32, "f32"), (:BFloat16, "bf18"))
+for (jltype, suffix) in ((:Float16, "f16"), (:Float32, "f32"), (:BFloat16, "bf16"))
     for as in (AS.Device, AS.ThreadGroup)
         @eval begin
             @device_function simdgroup_load(
diff --git a/test/device/intrinsics.jl b/test/device/intrinsics.jl
index 3b5155b7..a9081224 100644
--- a/test/device/intrinsics.jl
+++ b/test/device/intrinsics.jl
@@ -275,9 +275,9 @@ end
 end
 
 @testset "parametrically typed" begin
-    typs = [Int32, Int64, Float32]
+    types = [Int32, Int64, Float32]
     metal_support() >= v"3.1" && push!(types, BFloat16)
-    @testset for typ in typs
+    @testset for typ in types
         function kernel(d::MtlDeviceArray{T}, n) where {T}
             t = thread_position_in_threadgroup_1d()
             tr = n-t+1
@@ -405,8 +405,9 @@ end
             return
         end
 
-        a = MtlArray(rand(typ, 8, 8))
-        b = MtlArray(rand(typ, 8, 8))
+        # Use `ones` for debugging issues
+        a = MtlArray(ones(typ, 8, 8))
+        b = MtlArray(ones(typ, 8, 8))
         c = MtlArray(zeros(typ, 8, 8))
         @metal threads=(8, 8) kernel(a, b, c)
         @test Array(a) * Array(b) ≈ Array(c)
diff --git a/test/runtests.jl b/test/runtests.jl
index 0554e868..584dc318 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -81,6 +81,12 @@ const gpuarr_eltypes = [Int16, Int32, Int64,
                         ComplexF16, ComplexF32]
 const gpuarr_eltypes_nobf16 = copy(gpuarr_eltypes)
 
+# don't test BFloat16 for unsupported operations
+nobf16_tests = ["random", "reductions/reducedim!",
+                "reductions/mapreducedim!_large", "reductions/mapreduce",
+                "reductions/== isequal", "reductions/minimum maximum extrema",
+                "reductions/sum prod", "reductions/mapreducedim!", "reductions/reduce"]
+
 # Add BFloat16 for tests that use it
 Metal.metal_support() >= v"3.1" && push!(gpuarr_eltypes, BFloat16)
 
@@ -90,7 +96,7 @@ for name in keys(TestSuite.tests)
         continue
     end
 
-    tmp_eltypes = name in ["random"] ? gpuarr_eltypes_nobf16 : gpuarr_eltypes
+    tmp_eltypes = name in nobf16_tests ? gpuarr_eltypes_nobf16 : gpuarr_eltypes
 
     push!(tests, "gpuarrays$(Base.Filesystem.path_separator)$name")
     test_runners["gpuarrays$(Base.Filesystem.path_separator)$name"] = ()->TestSuite.tests[name](MtlArray;eltypes=tmp_eltypes)