diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 6d488597..a6d13a85 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -170,4 +170,6 @@ steps: - "Boltz" env: + RETESTITEMS_NWORKERS: 4 + RETESTITEMS_NWORKER_THREADS: 2 SECRET_CODECOV_TOKEN: "wMpDLaAVEHe6EJAc+LZBl4jF3wADVN6F+15vr/ONJHOv/XXbtYovuc1PCQwhz0AzZjWpSO12IDTyKfwVgYvqaGYfQ9yGyplJtSu2MiL2k44B/IY+wEZhsfkBIhXlG89si5A/I+/f8T8QuwxBqBLh8fYq7oxC+gNzKhbj8vIT4n5hCusvYYGufgKRC2U9P4ij0Sf40egQ5B+StaTykqJNq1163UARjNBypHIVDbYE0HUHiF7WB4eI5LxBBzlcHmsUkuGp6ZlqAu/8C83k65lwDnyHDfjvBM24q9GQTDFA5r7RUfYKHElQEBPk3GhoJn7XGIfD2pC0VNcw5jYCwsX2mw==;U2FsdGVkX1+euKMib66zno5Kkw7OxXo6v4RnkAA/HElJM46qfX17VgZ9iVLg45jOOWRgghmyYuy2WQ8RcVbuOg==" diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 9b52f3e8..92a52376 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -38,9 +38,15 @@ jobs: - uses: julia-actions/julia-runtest@v1 env: GROUP: "CPU" + RETESTITEMS_NWORKERS: 4 + RETESTITEMS_NWORKER_THREADS: 2 - uses: julia-actions/julia-processcoverage@v1 with: directories: src,ext - uses: codecov/codecov-action@v4 with: files: lcov.info + token: ${{ secrets.CODECOV_TOKEN }} + verbose: true + fail_ci_if_error: true + diff --git a/.github/workflows/Downgrade.yml b/.github/workflows/Downgrade.yml new file mode 100644 index 00000000..afeac18b --- /dev/null +++ b/.github/workflows/Downgrade.yml @@ -0,0 +1,41 @@ +name: Downgrade +on: + pull_request: + branches: + - main + paths-ignore: + - 'docs/**' + push: + branches: + - master + paths-ignore: + - 'docs/**' +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + version: ['1.9'] + steps: + - uses: actions/checkout@v4 + - uses: julia-actions/setup-julia@v1 + with: + version: ${{ matrix.version }} + - uses: cjdoris/julia-downgrade-compat-action@v1 + with: + skip: Pkg,TOML + - uses: julia-actions/julia-buildpkg@v1 + - uses: julia-actions/julia-runtest@v1 + env: + GROUP: "CPU" + RETESTITEMS_NWORKERS: 4 + RETESTITEMS_NWORKER_THREADS: 2 + - uses: julia-actions/julia-processcoverage@v1 + with: + directories: src,ext + - uses: codecov/codecov-action@v4 + with: + files: lcov.info + token: ${{ secrets.CODECOV_TOKEN }} + verbose: true + fail_ci_if_error: true diff --git a/.github/workflows/Downstream.yml b/.github/workflows/Downstream.yml index edd131d1..16223f28 100644 --- a/.github/workflows/Downstream.yml +++ b/.github/workflows/Downstream.yml @@ -54,9 +54,15 @@ jobs: @info "Not compatible with this release. No problem." 
exception=err exit(0) # Exit immediately, as a success end + env: + RETESTITEMS_NWORKERS: 4 + RETESTITEMS_NWORKER_THREADS: 2 - uses: julia-actions/julia-processcoverage@v1 with: directories: src,ext - uses: codecov/codecov-action@v4 with: - files: lcov.info \ No newline at end of file + files: lcov.info + token: ${{ secrets.CODECOV_TOKEN }} + verbose: true + fail_ci_if_error: true \ No newline at end of file diff --git a/test/LocalPreferences.toml b/LocalPreferences.toml similarity index 100% rename from test/LocalPreferences.toml rename to LocalPreferences.toml diff --git a/Project.toml b/Project.toml index 38f0ed20..5797e3bf 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "LuxLib" uuid = "82251201-b29d-42c6-8e01-566dec8acb11" authors = ["Avik Pal and contributors"] -version = "0.3.9" +version = "0.3.10" [deps] ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" @@ -27,22 +27,43 @@ LuxLibReverseDiffExt = "ReverseDiff" LuxLibTrackerExt = "Tracker" [compat] -ChainRulesCore = "1" +Aqua = "0.8" +ChainRulesCore = "1.20" +ComponentArrays = "0.15" ForwardDiff = "0.10" -KernelAbstractions = "0.9" -LuxCUDA = "0.2, 0.3" -Markdown = "1" -NNlib = "0.8, 0.9" -PrecompileTools = "1" -Random = "1" +KernelAbstractions = "0.9.2" +LuxAMDGPU = "0.2" +LuxCUDA = "0.3" +LuxTestUtils = "0.1.15" +Markdown = "1.9" +NNlib = "0.9" +PrecompileTools = "1.2" +Random = "1.9" +ReTestItems = "1" Reexport = "1" ReverseDiff = "1" -Statistics = "1" +StableRNGs = "1" +Statistics = "1.9" +Test = "1.9" Tracker = "0.2" +Zygote = "0.6" julia = "1.9" [extras] +Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" +ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" +ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" +LuxAMDGPU = "83120cb1-ca15-4f04-bf3b-6967d2e6b60b" LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda" -ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267" -Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" +LuxTestUtils = "ac9de150-d08f-4546-94fb-7472b5760531" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +ReTestItems = "817f1d60-ba6b-4fd5-9520-3cf149f6a823" +Reexport = "189a3867-3050-52da-a836-e630ba90ab69" +StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" + +[targets] +test = ["Aqua", "ChainRulesCore", "ComponentArrays", "ForwardDiff", "LuxAMDGPU", "LuxCUDA", "LuxTestUtils", "Random", "ReTestItems", "Reexport", "StableRNGs", "Statistics", "Test", "Zygote"] diff --git a/ext/LuxLibForwardDiffExt.jl b/ext/LuxLibForwardDiffExt.jl index e6c52330..36818419 100644 --- a/ext/LuxLibForwardDiffExt.jl +++ b/ext/LuxLibForwardDiffExt.jl @@ -5,9 +5,7 @@ import ForwardDiff: Dual import LuxLib: AA # dropout -function LuxLib._dropout_fptype(x::AA{<:Dual}) - return ForwardDiff.valtype(eltype(x)) -end +LuxLib._dropout_fptype(x::AA{<:Dual}) = ForwardDiff.valtype(eltype(x)) # Convolutions: We might want to capture these furthur down in `conv!` # NOTE: In principle we can concatenate all of the partials along the batch dimension @@ -45,10 +43,14 @@ for op in [:conv, :depthwiseconv] y = $(op)(x_, w_, cdims; kwargs...) 
- dys₁ = ntuple(_ -> similar(x_, Vₓ, NNlib.output_size(cdims)..., - NNlib.channels_out(cdims), size(x, N)), P) - dys₂ = ntuple(_ -> similar(x_, Vₓ, NNlib.output_size(cdims)..., - NNlib.channels_out(cdims), size(x, N)), P) + dys₁ = ntuple( + _ -> similar(x_, Vₓ, NNlib.output_size(cdims)..., + NNlib.channels_out(cdims), size(x, N)), + P) + dys₂ = ntuple( + _ -> similar(x_, Vₓ, NNlib.output_size(cdims)..., + NNlib.channels_out(cdims), size(x, N)), + P) for i in 1:P $(op!)(dys₁[i], ForwardDiff.partials.(x, i), w_, cdims; kwargs...) $(op!)(dys₂[i], x_, ForwardDiff.partials.(w, i), cdims; kwargs...) diff --git a/ext/LuxLibLuxCUDAExt/LuxLibLuxCUDAExt.jl b/ext/LuxLibLuxCUDAExt/LuxLibLuxCUDAExt.jl index 78c347d1..e388950f 100644 --- a/ext/LuxLibLuxCUDAExt/LuxLibLuxCUDAExt.jl +++ b/ext/LuxLibLuxCUDAExt/LuxLibLuxCUDAExt.jl @@ -2,9 +2,8 @@ module LuxLibLuxCUDAExt using LuxCUDA, LuxLib import ChainRulesCore as CRC -import LuxLib: batchnorm, - batchnorm_cudnn, ∇batchnorm_cudnn, _get_batchnorm_statistics, - FP_32_64, ∂∅ +import LuxLib: batchnorm, batchnorm_cudnn, ∇batchnorm_cudnn, _get_batchnorm_statistics, + FP_32_64, ∂∅ include("batchnorm.jl") diff --git a/ext/LuxLibLuxCUDAExt/batchnorm.jl b/ext/LuxLibLuxCUDAExt/batchnorm.jl index dd4c68c2..14e9de58 100644 --- a/ext/LuxLibLuxCUDAExt/batchnorm.jl +++ b/ext/LuxLibLuxCUDAExt/batchnorm.jl @@ -1,8 +1,9 @@ using LuxCUDA using .cuDNN: CUDNN_BN_MIN_EPSILON, cudnnBatchNormalizationBackward, - cudnnBatchNormalizationForwardInference, CUDNN_BATCHNORM_SPATIAL, - cudnnBatchNormalizationForwardTraining, cudnnTensorDescriptor, CUDNN_TENSOR_NCHW, - cudnnDataType, dim4, scalingParameter, handle + cudnnBatchNormalizationForwardInference, CUDNN_BATCHNORM_SPATIAL, + cudnnBatchNormalizationForwardTraining, cudnnTensorDescriptor, + CUDNN_TENSOR_NCHW, + cudnnDataType, dim4, scalingParameter, handle import LuxLib: FP_32_64 # NOTE: This can be upstreamed to LuxCUDA once we drop support for v1.6 @@ -169,7 +170,8 @@ function cudnnBNBackward!(∂g::DenseCuArray{T}, g::DenseCuArray{T}, ∂b::Dense xd = cudnnTensorDescriptor(x) ∂yd = cudnnTensorDescriptor(∂y) ∂xd = cudnnTensorDescriptor(∂x) - gd = cudnnTensorDescriptor(CUDNN_TENSOR_NCHW, cudnnDataType(T), Cint(length(_wsize(x))), + gd = cudnnTensorDescriptor( + CUDNN_TENSOR_NCHW, cudnnDataType(T), Cint(length(_wsize(x))), dim4(_wsize(x), Val(CUDNN_TENSOR_NCHW))) xmean = xmean === nothing ? 
CU_NULL : xmean diff --git a/ext/LuxLibLuxCUDATrackerExt.jl b/ext/LuxLibLuxCUDATrackerExt.jl index 06f45a8a..782f0c08 100644 --- a/ext/LuxLibLuxCUDATrackerExt.jl +++ b/ext/LuxLibLuxCUDATrackerExt.jl @@ -2,9 +2,9 @@ module LuxLibLuxCUDATrackerExt using LuxCUDA, LuxLib, Tracker import Tracker: @grad, - data, nobacksies, track, TrackedArray, TrackedVector, TrackedReal + data, nobacksies, track, TrackedArray, TrackedVector, TrackedReal import LuxLib: AA, AV, batchnorm_cudnn, ∇batchnorm_cudnn, _get_batchnorm_statistics, - FP_32_64, ∂∅, __is_tracked + FP_32_64, ∂∅, __is_tracked # api/batchnorm.jl const TR_CUDNN_BN_ARRAY_TYPE = Union{TrackedArray{<:Any, <:Any, <:CuArray{<:FP_32_64, 2}}, diff --git a/ext/LuxLibReverseDiffExt.jl b/ext/LuxLibReverseDiffExt.jl index 129282cd..d9ae9088 100644 --- a/ext/LuxLibReverseDiffExt.jl +++ b/ext/LuxLibReverseDiffExt.jl @@ -3,8 +3,8 @@ module LuxLibReverseDiffExt using ChainRulesCore, LuxLib, ReverseDiff import ChainRulesCore as CRC import LuxLib: AA, __is_tracked -import ReverseDiff: TrackedArray, - TrackedReal, decrement_deriv!, increment_deriv!, value, @grad_from_chainrules +import ReverseDiff: TrackedArray, TrackedReal, decrement_deriv!, increment_deriv!, value, + @grad_from_chainrules # Patches: Needs upstreaming @inline function increment_deriv!(t::Union{TrackedArray, TrackedReal}, ::NoTangent, i) diff --git a/src/LuxLib.jl b/src/LuxLib.jl index 799f4ed3..b4068fdf 100644 --- a/src/LuxLib.jl +++ b/src/LuxLib.jl @@ -23,7 +23,7 @@ include("api/groupnorm.jl") include("api/instancenorm.jl") include("api/layernorm.jl") -export batchnorm, groupnorm, instancenorm, layernorm -export alpha_dropout, dropout +export batchnorm, groupnorm, instancenorm, layernorm, + alpha_dropout, dropout end diff --git a/src/impl/groupnorm.jl b/src/impl/groupnorm.jl index facbf38d..fcf96c15 100644 --- a/src/impl/groupnorm.jl +++ b/src/impl/groupnorm.jl @@ -1,7 +1,7 @@ # Low-Level Kernels ## Original Implementation: https://github.com/pytorch/pytorch/blob/master/caffe2/operators/group_norm_op.cu -@kernel function _compute_fused_params_kernel!(scale, bias, @Const(C), @Const(K), @Const(μ), - @Const(σ⁻¹), @Const(γ), @Const(β)) +@kernel function _compute_fused_params_kernel!(scale, bias, @Const(C), @Const(K), + @Const(μ), @Const(σ⁻¹), @Const(γ), @Const(β)) idx = @index(Global) ng = _div_idx(idx, K) c = _mod_idx(idx, C) @@ -27,8 +27,8 @@ end @inbounds dY_dscale[idx] = γ[c] * σ⁻¹[ng] end -@kernel function _groupnorm_xscale_and_bias_kernel!(X_scale, bias, @Const(alpha), @Const(μ), - @Const(σ⁻¹), @Const(ds_sum), @Const(db_sum)) +@kernel function _groupnorm_xscale_and_bias_kernel!(X_scale, bias, @Const(alpha), + @Const(μ), @Const(σ⁻¹), @Const(ds_sum), @Const(db_sum)) idx = @index(Global) @inbounds x = (db_sum[idx] * μ[idx] - ds_sum[idx]) * (σ⁻¹[idx]^3) * alpha @inbounds X_scale[idx] = x diff --git a/test/Project.toml b/test/Project.toml deleted file mode 100644 index 892c199a..00000000 --- a/test/Project.toml +++ /dev/null @@ -1,18 +0,0 @@ -[deps] -Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" -ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" -ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66" -ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" -JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b" -LuxAMDGPU = "83120cb1-ca15-4f04-bf3b-6967d2e6b60b" -LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda" -LuxLib = "82251201-b29d-42c6-8e01-566dec8acb11" -LuxTestUtils = "ac9de150-d08f-4546-94fb-7472b5760531" -Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" -Random = 
"9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" -StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" -Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" -Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" -TestSetExtensions = "98d24dd4-01ad-11ea-1b02-c9a08f80db04" -Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" diff --git a/test/api/batchnorm.jl b/test/api/batchnorm.jl deleted file mode 100644 index cc739f69..00000000 --- a/test/api/batchnorm.jl +++ /dev/null @@ -1,56 +0,0 @@ -using LuxLib, Test - -include("../test_utils.jl") - -rng = get_stable_rng(12345) - -function _setup_batchnorm(aType, T, sz; affine::Bool=true, track_stats::Bool) - x = randn(T, sz) |> aType - scale = affine ? aType(randn(T, sz[end - 1])) : nothing - bias = affine ? aType(randn(T, sz[end - 1])) : nothing - - if track_stats - running_mean = randn(T, sz[end - 1]) |> aType - running_var = abs2.(randn(T, sz[end - 1])) |> aType - return x, scale, bias, running_mean, running_var - else - return x, scale, bias, nothing, nothing - end -end - -@testset "$mode: Batch Normalization" for (mode, aType, on_gpu) in MODES - for T in (Float16, Float32, Float64), - sz in ((4, 4, 6, 2), (8, 2), (4, 4, 4, 3, 2)), - training in (Val(true), Val(false)), - affine in (true, false), - track_stats in (true, false) - - T === Float16 && mode == "AMDGPU" && continue - - _f = (args...) -> batchnorm(args...; epsilon, training, momentum=T(0.9)) - - epsilon = T(1e-5) - x, scale, bias, rm, rv = _setup_batchnorm(aType, T, sz; track_stats, affine) - - y, nt = batchnorm(x, scale, bias, rm, rv; epsilon, training, momentum=T(0.9)) - - @inferred batchnorm(x, scale, bias, rm, rv; epsilon, training, momentum=T(0.9)) - - @jet _f(x, scale, bias, rm, rv) - - @test y isa aType{T, length(sz)} - @test size(y) == sz - - if rm !== nothing - @test size(nt.running_mean) == (size(x, length(sz) - 1),) - @test size(nt.running_var) == (size(x, length(sz) - 1),) - end - - if __istraining(training) && affine - fp16 = T == Float16 - __f = (args...) -> sum(first(batchnorm(x, args..., rm, rv; epsilon, - training, momentum=T(0.9)))) - @eval @test_gradients $__f $scale $bias gpu_testing=$on_gpu soft_fail=$fp16 atol=1.0f-2 rtol=1.0f-2 - end - end -end diff --git a/test/api/batchnorm_tests.jl b/test/api/batchnorm_tests.jl new file mode 100644 index 00000000..581e1a59 --- /dev/null +++ b/test/api/batchnorm_tests.jl @@ -0,0 +1,54 @@ +@testitem "Batch Normalization" setup=[SharedTestSetup] begin + rng = get_stable_rng(12345) + + function _setup_batchnorm(aType, T, sz; affine::Bool=true, track_stats::Bool) + x = randn(T, sz) |> aType + scale = affine ? aType(randn(T, sz[end - 1])) : nothing + bias = affine ? aType(randn(T, sz[end - 1])) : nothing + + if track_stats + running_mean = randn(T, sz[end - 1]) |> aType + running_var = abs2.(randn(T, sz[end - 1])) |> aType + return x, scale, bias, running_mean, running_var + else + return x, scale, bias, nothing, nothing + end + end + + @testset "$mode" for (mode, aType, on_gpu) in MODES + for T in (Float16, Float32, Float64), + sz in ((4, 4, 6, 2), (8, 2), (4, 4, 4, 3, 2)), + training in (Val(true), Val(false)), + affine in (true, false), + track_stats in (true, false) + + T === Float16 && mode == "AMDGPU" && continue + + _f = (args...) 
-> batchnorm(args...; epsilon, training, momentum=T(0.9)) + + epsilon = T(1e-5) + x, scale, bias, rm, rv = _setup_batchnorm(aType, T, sz; track_stats, affine) + + y, nt = batchnorm(x, scale, bias, rm, rv; epsilon, training, momentum=T(0.9)) + + @inferred batchnorm(x, scale, bias, rm, rv; epsilon, training, momentum=T(0.9)) + + @jet _f(x, scale, bias, rm, rv) + + @test y isa aType{T, length(sz)} + @test size(y) == sz + + if rm !== nothing + @test size(nt.running_mean) == (size(x, length(sz) - 1),) + @test size(nt.running_var) == (size(x, length(sz) - 1),) + end + + if __istraining(training) && affine + fp16 = T == Float16 + __f = (args...) -> sum(first(batchnorm(x, args..., rm, rv; epsilon, + training, momentum=T(0.9)))) + @eval @test_gradients $__f $scale $bias gpu_testing=$on_gpu soft_fail=$fp16 atol=1.0f-2 rtol=1.0f-2 + end + end + end +end diff --git a/test/api/dropout.jl b/test/api/dropout.jl deleted file mode 100644 index 34bba846..00000000 --- a/test/api/dropout.jl +++ /dev/null @@ -1,156 +0,0 @@ -using Statistics, Test, LuxLib - -include("../test_utils.jl") - -rng = get_stable_rng(12345) - -@testset "$mode: Dropout" for (mode, aType, on_gpu) in MODES - for T in (Float16, Float32, Float64), - x_shape in ((2, 3), (2, 2, 3), (2, 2, 3, 1), (2, 2, 1, 3, 1)) - - T === Float16 && mode == "AMDGPU" && continue - - x = randn(rng, T, x_shape) |> aType - - @inferred dropout(rng, x, T(0.5), Val(true); dims=Colon()) - - y, mask_, rng_ = dropout(rng, x, T(0.5), Val(true); dims=Colon()) - - @test y isa aType{T, length(x_shape)} - @test size(y) == x_shape - @test mask_ isa aType{T, length(x_shape)} - @test size(mask_) == x_shape - @test rng != rng_ - - __f = x -> sum(first(dropout(rng, x, T(0.5), Val(true); dims=Colon()))) - - fp16 = T == Float16 - @eval @test_gradients $__f $x atol=1.0f-2 rtol=1.0f-2 soft_fail=$fp16 gpu_testing=$on_gpu - @jet sum(first(dropout(rng, x, T(0.5), Val(true); dims=Colon()))) - - @inferred dropout(rng, x, T(0.5), Val(true); dims=Colon()) - - y, mask_, rng_ = dropout(rng, x, T(0.5), Val(false); dims=Colon()) - - @test y isa aType{T, length(x_shape)} - @test size(y) == x_shape - @test rng == rng_ - @test y == x - end -end - -@testset "$mode: Dropout with Preset Mask" for (mode, aType, on_gpu) in MODES - for T in (Float16, Float32, Float64), - x_shape in ((2, 3), (2, 2, 3), (2, 2, 3, 1), (2, 2, 1, 3, 1)) - - T === Float16 && mode == "AMDGPU" && continue - - x = randn(rng, T, x_shape) |> aType - mask = rand(T, x_shape) |> aType - - # Update mask - @inferred dropout(rng, x, mask, T(0.5), Val(true), Val(true); dims=Colon()) - - y, mask_, rng_ = dropout(rng, x, mask, T(0.5), Val(true), Val(true); dims=Colon()) - - @test y isa aType{T, length(x_shape)} - @test size(y) == x_shape - @test mask_ isa aType{T, length(x_shape)} - @test size(mask_) == x_shape - @test rng != rng_ - @test mask != mask_ - - __f = x -> sum(first(dropout(rng, x, mask, T(0.5), Val(true), Val(true); - dims=Colon()))) - - fp16 = T == Float16 - @eval @test_gradients $__f $x atol=1.0f-2 rtol=1.0f-2 soft_fail=$fp16 gpu_testing=$on_gpu - @jet sum(first(dropout(rng, x, mask, T(0.5), Val(true), Val(true); dims=Colon()))) - - # Try using mask if possible (possible!!) 
- @inferred dropout(rng, x, mask, T(0.5), Val(true), Val(false); dims=Colon()) - - y, mask_, rng_ = dropout(rng, x, mask, T(0.5), Val(true), Val(false); dims=Colon()) - - @test y isa aType{T, length(x_shape)} - @test size(y) == x_shape - @test mask_ isa aType{T, length(x_shape)} - @test size(mask_) == x_shape - @test rng == rng_ - @test mask == mask_ - - __f = x -> sum(first(dropout(rng, x, mask, T(0.5), Val(true), Val(false); - dims=Colon()))) - - fp16 = T == Float16 - @eval @test_gradients $__f $x atol=1.0f-2 rtol=1.0f-2 soft_fail=$fp16 gpu_testing=$on_gpu - @jet sum(first(dropout(rng, x, mask, T(0.5), Val(true), Val(false); dims=Colon()))) - - mask = rand(T, (x_shape[1:(end - 1)]..., 13)) |> aType - - # Try using mask if possible (not possible!!) - @inferred dropout(rng, x, mask, T(0.5), Val(true), Val(false); dims=Colon()) - - y, mask_, rng_ = dropout(rng, x, mask, T(0.5), Val(true), Val(false); dims=Colon()) - - @test y isa aType{T, length(x_shape)} - @test size(y) == x_shape - @test mask_ isa aType{T, length(x_shape)} - @test size(mask_) == x_shape - @test rng != rng_ - @test mask != mask_ - - __f = x -> sum(first(dropout(rng, x, mask, T(0.5), Val(true), Val(false); - dims=Colon()))) - - fp16 = T == Float16 - @eval @test_gradients $__f $x atol=1.0f-2 rtol=1.0f-2 soft_fail=$fp16 gpu_testing=$on_gpu - @jet sum(first(dropout(rng, x, mask, T(0.5), Val(true), Val(false); dims=Colon()))) - - # Testing Mode - @inferred dropout(rng, x, mask, T(0.5), Val(false), Val(false); dims=Colon()) - - y, mask_, rng_ = dropout(rng, x, mask, T(0.5), Val(false), Val(false); dims=Colon()) - - @test y isa aType{T, length(x_shape)} - @test size(y) == x_shape - @test mask_ isa aType{T, length(x_shape)} - @test mask_ == mask - @test rng == rng_ - end -end - -@testset "$mode: Alpha Dropout" for (mode, aType, on_gpu) in MODES - for T in (Float16, Float32, Float64), - x_shape in ((2, 3), (2, 2, 3), (2, 2, 3, 1), (2, 2, 1, 3, 1)) - - T === Float16 && mode == "AMDGPU" && continue - - x = randn(rng, T, x_shape) |> aType - - @inferred alpha_dropout(rng, x, T(0.5), Val(true)) - - y, rng_ = alpha_dropout(rng, x, T(0.5), Val(true)) - - @test y isa aType{T, length(x_shape)} - @test size(y) == x_shape - @test rng != rng_ - - @test_broken isapprox(std(y), std(x); atol=1.0f-2, rtol=1.0f-2) - - __f = x -> sum(first(alpha_dropout(rng, x, T(0.5), Val(true)))) - - fp16 = T == Float16 - @eval @test_gradients $__f $x atol=1.0f-2 rtol=1.0f-2 soft_fail=$fp16 gpu_testing=$on_gpu - @jet sum(first(alpha_dropout(rng, x, T(0.5), Val(true)))) - - @inferred alpha_dropout(rng, x, T(0.5), Val(false)) - - y, rng_ = alpha_dropout(rng, x, T(0.5), Val(false)) - - @test y isa aType{T, length(x_shape)} - @test size(y) == x_shape - @test rng == rng_ - @test y == x - end -end diff --git a/test/api/dropout_tests.jl b/test/api/dropout_tests.jl new file mode 100644 index 00000000..816156b8 --- /dev/null +++ b/test/api/dropout_tests.jl @@ -0,0 +1,171 @@ +@testitem "Dropout" setup=[SharedTestSetup] begin + using Statistics + + rng = get_stable_rng(12345) + + @testset "$mode" for (mode, aType, on_gpu) in MODES + for T in (Float16, Float32, Float64), + x_shape in ((2, 3), (2, 2, 3), (2, 2, 3, 1), (2, 2, 1, 3, 1)) + + T === Float16 && mode == "AMDGPU" && continue + + x = randn(rng, T, x_shape) |> aType + + @inferred dropout(rng, x, T(0.5), Val(true); dims=Colon()) + + y, mask_, rng_ = dropout(rng, x, T(0.5), Val(true); dims=Colon()) + + @test y isa aType{T, length(x_shape)} + @test size(y) == x_shape + @test mask_ isa aType{T, length(x_shape)} + @test 
size(mask_) == x_shape + @test rng != rng_ + + __f = x -> sum(first(dropout(rng, x, T(0.5), Val(true); dims=Colon()))) + + fp16 = T == Float16 + @eval @test_gradients $__f $x atol=1.0f-2 rtol=1.0f-2 soft_fail=$fp16 gpu_testing=$on_gpu + @jet sum(first(dropout(rng, x, T(0.5), Val(true); dims=Colon()))) + + @inferred dropout(rng, x, T(0.5), Val(true); dims=Colon()) + + y, mask_, rng_ = dropout(rng, x, T(0.5), Val(false); dims=Colon()) + + @test y isa aType{T, length(x_shape)} + @test size(y) == x_shape + @test rng == rng_ + @test y == x + end + end +end + +@testitem "Dropout with Preset Mask" setup=[SharedTestSetup] begin + using Statistics + + rng = get_stable_rng(12345) + + @testset "$mode" for (mode, aType, on_gpu) in MODES + for T in (Float16, Float32, Float64), + x_shape in ((2, 3), (2, 2, 3), (2, 2, 3, 1), (2, 2, 1, 3, 1)) + + T === Float16 && mode == "AMDGPU" && continue + + x = randn(rng, T, x_shape) |> aType + mask = rand(T, x_shape) |> aType + + # Update mask + @inferred dropout(rng, x, mask, T(0.5), Val(true), Val(true); dims=Colon()) + + y, mask_, rng_ = dropout( + rng, x, mask, T(0.5), Val(true), Val(true); dims=Colon()) + + @test y isa aType{T, length(x_shape)} + @test size(y) == x_shape + @test mask_ isa aType{T, length(x_shape)} + @test size(mask_) == x_shape + @test rng != rng_ + @test mask != mask_ + + __f = x -> sum(first(dropout(rng, x, mask, T(0.5), Val(true), Val(true); + dims=Colon()))) + + fp16 = T == Float16 + @eval @test_gradients $__f $x atol=1.0f-2 rtol=1.0f-2 soft_fail=$fp16 gpu_testing=$on_gpu + @jet sum(first(dropout( + rng, x, mask, T(0.5), Val(true), Val(true); dims=Colon()))) + + # Try using mask if possible (possible!!) + @inferred dropout(rng, x, mask, T(0.5), Val(true), Val(false); dims=Colon()) + + y, mask_, rng_ = dropout( + rng, x, mask, T(0.5), Val(true), Val(false); dims=Colon()) + + @test y isa aType{T, length(x_shape)} + @test size(y) == x_shape + @test mask_ isa aType{T, length(x_shape)} + @test size(mask_) == x_shape + @test rng == rng_ + @test mask == mask_ + + __f = x -> sum(first(dropout(rng, x, mask, T(0.5), Val(true), Val(false); + dims=Colon()))) + fp16 = T == Float16 + @eval @test_gradients $__f $x atol=1.0f-2 rtol=1.0f-2 soft_fail=$fp16 gpu_testing=$on_gpu + @jet sum(first(dropout( + rng, x, mask, T(0.5), Val(true), Val(false); dims=Colon()))) + mask = rand(T, (x_shape[1:(end - 1)]..., 13)) |> aType + + # Try using mask if possible (not possible!!) 
+ @inferred dropout(rng, x, mask, T(0.5), Val(true), Val(false); dims=Colon()) + + y, mask_, rng_ = dropout( + rng, x, mask, T(0.5), Val(true), Val(false); dims=Colon()) + + @test y isa aType{T, length(x_shape)} + @test size(y) == x_shape + @test mask_ isa aType{T, length(x_shape)} + @test size(mask_) == x_shape + @test rng != rng_ + @test mask != mask_ + + __f = x -> sum(first(dropout(rng, x, mask, T(0.5), Val(true), Val(false); + dims=Colon()))) + fp16 = T == Float16 + @eval @test_gradients $__f $x atol=1.0f-2 rtol=1.0f-2 soft_fail=$fp16 gpu_testing=$on_gpu + @jet sum(first(dropout( + rng, x, mask, T(0.5), Val(true), Val(false); dims=Colon()))) + # Testing Mode + @inferred dropout(rng, x, mask, T(0.5), Val(false), Val(false); dims=Colon()) + + y, mask_, rng_ = dropout( + rng, x, mask, T(0.5), Val(false), Val(false); dims=Colon()) + + @test y isa aType{T, length(x_shape)} + @test size(y) == x_shape + @test mask_ isa aType{T, length(x_shape)} + @test mask_ == mask + @test rng == rng_ + end + end +end + +@testitem "Alpha Dropout" setup=[SharedTestSetup] begin + using Statistics + + rng = get_stable_rng(12345) + + @testset "$mode" for (mode, aType, on_gpu) in MODES + for T in (Float16, Float32, Float64), + x_shape in ((2, 3), (2, 2, 3), (2, 2, 3, 1), (2, 2, 1, 3, 1)) + + T === Float16 && mode == "AMDGPU" && continue + + x = randn(rng, T, x_shape) |> aType + + @inferred alpha_dropout(rng, x, T(0.5), Val(true)) + + y, rng_ = alpha_dropout(rng, x, T(0.5), Val(true)) + + @test y isa aType{T, length(x_shape)} + @test size(y) == x_shape + @test rng != rng_ + + @test_broken isapprox(std(y), std(x); atol=1.0f-2, rtol=1.0f-2) + + __f = x -> sum(first(alpha_dropout(rng, x, T(0.5), Val(true)))) + + fp16 = T == Float16 + @eval @test_gradients $__f $x atol=1.0f-2 rtol=1.0f-2 soft_fail=$fp16 gpu_testing=$on_gpu + @jet sum(first(alpha_dropout(rng, x, T(0.5), Val(true)))) + + @inferred alpha_dropout(rng, x, T(0.5), Val(false)) + + y, rng_ = alpha_dropout(rng, x, T(0.5), Val(false)) + + @test y isa aType{T, length(x_shape)} + @test size(y) == x_shape + @test rng == rng_ + @test y == x + end + end +end diff --git a/test/api/groupnorm.jl b/test/api/groupnorm.jl deleted file mode 100644 index 55931fe8..00000000 --- a/test/api/groupnorm.jl +++ /dev/null @@ -1,89 +0,0 @@ -using LuxLib, Test - -include("../test_utils.jl") - -function _setup_groupnorm(aType, T, sz, groups) - x = randn(T, sz) |> aType - scale = randn(T, sz[end - 1]) |> aType - bias = randn(T, sz[end - 1]) |> aType - return x, scale, bias -end - -function _groupnorm_generic_fallback(x, scale, bias, epsilon, groups) - sz = size(x) - N = ndims(x) - x_reshaped = reshape(x, sz[1:(N - 2)]..., sz[N - 1] ÷ groups, groups, sz[N]) - x_, xmean, xvar = LuxLib._normalization(x_reshaped, nothing, nothing, scale, bias, - Val(Tuple(collect(1:(N - 1)))), Val(false), nothing, epsilon) - - return reshape(x_, sz) -end - -@testset "$mode: GroupNorm KernelAbstractions" for (mode, aType, on_gpu) in MODES - @testset "eltype $T, size $sz, ngroups $groups" for T in (Float32, - Float64), sz in ((16, 16, 6, 4), (32, 32, 6, 4), (64, 64, 12, 4)), - groups in (2, 3) - - T === Float16 && mode == "AMDGPU" && continue - - _f = (args...) 
-> groupnorm(args...; groups, epsilon) - - epsilon = T(1e-5) - x, scale, bias = _setup_groupnorm(aType, T, sz, groups) - - y = _f(x, scale, bias) - - gs_x, gs_scale, gs_bias = Zygote.gradient(sum ∘ _f, x, scale, bias) - - @inferred groupnorm(x, scale, bias; groups, epsilon) - - # @jet _f(x, scale, bias) # test_call throws exception - LuxTestUtils.JET.@test_opt target_modules=(LuxLib,) _f(x, scale, bias) - - @test y isa aType{T, length(sz)} - @test size(y) == sz - - # Use the generic implementation to compare against - __f = (args...) -> _groupnorm_generic_fallback(args..., epsilon, groups) - - y_ = __f(x, scale, bias) - - gs_x_, gs_scale_, gs_bias_ = Zygote.gradient(sum ∘ __f, x, scale, bias) - - # The KA implementation reorders operations manually for maximal - # performance. Hence equality cannot be guaranteed. - @test check_approx(y, y_; atol=1.0f-3, rtol=1.0f-3) - @test check_approx(gs_x, gs_x_; atol=1.0f-3, rtol=1.0f-3) - @test check_approx(gs_scale, gs_scale_; atol=1.0f-3, rtol=1.0f-3) - @test check_approx(gs_bias, gs_bias_; atol=1.0f-3, rtol=1.0f-3) - - fp16 = T == Float16 - __f = (args...) -> sum(groupnorm(x, args...; groups, epsilon)) - @eval @test_gradients $__f $scale $bias gpu_testing=$on_gpu atol=1.0f-3 rtol=1.0f-3 soft_fail=$fp16 - end -end - -@testset "$mode: GroupNorm Generic Fallback" for (mode, aType, on_gpu) in MODES - @testset "eltype $T, size $sz, ngroups $groups" for T in (Float16, - Float32, Float64), sz in ((4, 6, 2), (8, 8, 8, 6, 2), (3, 16, 16, 12, 2)), - groups in (2, 3) - - T === Float16 && mode == "AMDGPU" && continue - - _f = (args...) -> groupnorm(args...; groups, epsilon) - - epsilon = T(1e-5) - x, scale, bias = _setup_groupnorm(aType, T, sz, groups) - y = _f(x, scale, bias) - - @inferred groupnorm(x, scale, bias; groups, epsilon) - @jet _f(x, scale, bias) - - @test y isa aType{T, length(sz)} - @test size(y) == sz - - fp16 = T == Float16 - __f = (args...) -> sum(groupnorm(x, args...; groups, epsilon)) - @eval @test_gradients $__f $scale $bias gpu_testing=$on_gpu atol=1.0f-2 rtol=1.0f-2 soft_fail=$fp16 - end -end diff --git a/test/api/groupnorm_tests.jl b/test/api/groupnorm_tests.jl new file mode 100644 index 00000000..64fdc2fe --- /dev/null +++ b/test/api/groupnorm_tests.jl @@ -0,0 +1,95 @@ +@testsetup module GroupNormSetup +using LuxLib + +function _setup_groupnorm(aType, T, sz, groups) + x = randn(T, sz) |> aType + scale = randn(T, sz[end - 1]) |> aType + bias = randn(T, sz[end - 1]) |> aType + return x, scale, bias +end + +function _groupnorm_generic_fallback(x, scale, bias, epsilon, groups) + sz = size(x) + N = ndims(x) + x_reshaped = reshape(x, sz[1:(N - 2)]..., sz[N - 1] ÷ groups, groups, sz[N]) + x_, xmean, xvar = LuxLib._normalization(x_reshaped, nothing, nothing, scale, bias, + Val(Tuple(collect(1:(N - 1)))), Val(false), nothing, epsilon) + + return reshape(x_, sz) +end + +export _setup_groupnorm, _groupnorm_generic_fallback +end + +@testitem "Group Normalization KernelAbstractions" setup=[SharedTestSetup, GroupNormSetup] begin + @testset "$mode" for (mode, aType, on_gpu) in MODES + @testset "eltype $T, size $sz, ngroups $groups" for T in (Float32, Float64), + sz in ((16, 16, 6, 4), (32, 32, 6, 4), (64, 64, 12, 4)), + groups in (2, 3) + + T === Float16 && mode == "AMDGPU" && continue + + _f = (args...) 
-> groupnorm(args...; groups, epsilon) + + epsilon = T(1e-5) + x, scale, bias = _setup_groupnorm(aType, T, sz, groups) + + y = _f(x, scale, bias) + + gs_x, gs_scale, gs_bias = Zygote.gradient(sum ∘ _f, x, scale, bias) + + @inferred groupnorm(x, scale, bias; groups, epsilon) + + # @jet _f(x, scale, bias) # test_call throws exception + LuxTestUtils.JET.@test_opt target_modules=(LuxLib,) _f(x, scale, bias) + + @test y isa aType{T, length(sz)} + @test size(y) == sz + + # Use the generic implementation to compare against + __f = (args...) -> _groupnorm_generic_fallback(args..., epsilon, groups) + + y_ = __f(x, scale, bias) + + gs_x_, gs_scale_, gs_bias_ = Zygote.gradient(sum ∘ __f, x, scale, bias) + + # The KA implementation reorders operations manually for maximal + # performance. Hence equality cannot be guaranteed. + @test check_approx(y, y_; atol=1.0f-3, rtol=1.0f-3) + @test check_approx(gs_x, gs_x_; atol=1.0f-3, rtol=1.0f-3) + @test check_approx(gs_scale, gs_scale_; atol=1.0f-3, rtol=1.0f-3) + @test check_approx(gs_bias, gs_bias_; atol=1.0f-3, rtol=1.0f-3) + + fp16 = T == Float16 + __f = (args...) -> sum(groupnorm(x, args...; groups, epsilon)) + @eval @test_gradients $__f $scale $bias gpu_testing=$on_gpu atol=1.0f-3 rtol=1.0f-3 soft_fail=$fp16 + end + end +end + +@testitem "Group Normalization Generic Fallback" setup=[SharedTestSetup, GroupNormSetup] begin + @testset "$mode" for (mode, aType, on_gpu) in MODES + @testset "eltype $T, size $sz, ngroups $groups" for T in (Float16, + Float32, Float64), sz in ((4, 6, 2), (8, 8, 8, 6, 2), (3, 16, 16, 12, 2)), + groups in (2, 3) + + T === Float16 && mode == "AMDGPU" && continue + + _f = (args...) -> groupnorm(args...; groups, epsilon) + + epsilon = T(1e-5) + x, scale, bias = _setup_groupnorm(aType, T, sz, groups) + y = _f(x, scale, bias) + + @inferred groupnorm(x, scale, bias; groups, epsilon) + @jet _f(x, scale, bias) + + @test y isa aType{T, length(sz)} + @test size(y) == sz + + fp16 = T == Float16 + __f = (args...) -> sum(groupnorm(x, args...; groups, epsilon)) + @eval @test_gradients $__f $scale $bias gpu_testing=$on_gpu atol=1.0f-2 rtol=1.0f-2 soft_fail=$fp16 + end + end +end diff --git a/test/api/instancenorm.jl b/test/api/instancenorm.jl deleted file mode 100644 index e318a095..00000000 --- a/test/api/instancenorm.jl +++ /dev/null @@ -1,45 +0,0 @@ -using LuxLib, Statistics, Test - -include("../test_utils.jl") - -rng = get_stable_rng(12345) - -function _setup_instancenorm(aType, T, sz; affine::Bool=true) - x = randn(T, sz) |> aType - scale = affine ? aType(ones(T, sz[end - 1])) : nothing - bias = affine ? aType(zeros(T, sz[end - 1])) : nothing - return x, scale, bias -end - -@testset "$mode: Instance Norm" for (mode, aType, on_gpu) in MODES - for T in (Float16, Float32, Float64), - sz in ((4, 4, 6, 2), (3, 4, 2), (4, 4, 4, 3, 2)), - training in (Val(true), Val(false)), - affine in (true, false) - - T === Float16 && mode == "AMDGPU" && continue - - _f = (args...) -> instancenorm(args...; epsilon, training) - - epsilon = T(1e-5) - x, scale, bias = _setup_instancenorm(aType, T, sz; affine) - - y, nt = instancenorm(x, scale, bias; epsilon, training) - - @inferred instancenorm(x, scale, bias; epsilon, training) - @jet _f(x, scale, bias) - @test y isa aType{T, length(sz)} - @test size(y) == sz - - _target_std = ones(ntuple(_ -> 1, length(sz) - 2)..., size(x)[(end - 1):end]...) 
- @eval @test check_approx(std(Array($y); dims=1:($(length(sz) - 2))), - $_target_std; atol=0.2, rtol=0.2) - @test std(y; dims=1:(length(sz) - 2)) != std(x; dims=1:(length(sz) - 2)) - - if __istraining(training) && affine - fp16 = T == Float16 - __f = (args...) -> sum(first(instancenorm(x, args...; epsilon, training))) - @eval @test_gradients $__f $scale $bias soft_fail=$fp16 atol=1.0f-2 rtol=1.0f-2 gpu_testing=$on_gpu - end - end -end diff --git a/test/api/instancenorm_tests.jl b/test/api/instancenorm_tests.jl new file mode 100644 index 00000000..8c8ea1e5 --- /dev/null +++ b/test/api/instancenorm_tests.jl @@ -0,0 +1,45 @@ +@testitem "Instance Normalization" setup=[SharedTestSetup] begin + using Statistics + + rng = get_stable_rng(12345) + + function _setup_instancenorm(aType, T, sz; affine::Bool=true) + x = randn(T, sz) |> aType + scale = affine ? aType(ones(T, sz[end - 1])) : nothing + bias = affine ? aType(zeros(T, sz[end - 1])) : nothing + return x, scale, bias + end + + @testset "$mode" for (mode, aType, on_gpu) in MODES + for T in (Float16, Float32, Float64), + sz in ((4, 4, 6, 2), (3, 4, 2), (4, 4, 4, 3, 2)), + training in (Val(true), Val(false)), + affine in (true, false) + + T === Float16 && mode == "AMDGPU" && continue + + _f = (args...) -> instancenorm(args...; epsilon, training) + + epsilon = T(1e-5) + x, scale, bias = _setup_instancenorm(aType, T, sz; affine) + + y, nt = instancenorm(x, scale, bias; epsilon, training) + + @inferred instancenorm(x, scale, bias; epsilon, training) + @jet _f(x, scale, bias) + @test y isa aType{T, length(sz)} + @test size(y) == sz + + _target_std = ones(ntuple(_ -> 1, length(sz) - 2)..., size(x)[(end - 1):end]...) + @eval @test check_approx(std(Array($y); dims=1:($(length(sz) - 2))), + $_target_std; atol=0.2, rtol=0.2) + @test std(y; dims=1:(length(sz) - 2)) != std(x; dims=1:(length(sz) - 2)) + + if __istraining(training) && affine + fp16 = T == Float16 + __f = (args...) -> sum(first(instancenorm(x, args...; epsilon, training))) + @eval @test_gradients $__f $scale $bias soft_fail=$fp16 atol=1.0f-2 rtol=1.0f-2 gpu_testing=$on_gpu + end + end + end +end diff --git a/test/api/layernorm.jl b/test/api/layernorm.jl deleted file mode 100644 index 1e4282e6..00000000 --- a/test/api/layernorm.jl +++ /dev/null @@ -1,48 +0,0 @@ -using LuxLib, Statistics, Test - -include("../test_utils.jl") - -function _setup_layernorm(aType, T, x_size, affine_shape) - x = randn(T, x_size) |> aType - if affine_shape !== nothing - scale = randn(T, affine_shape..., 1) |> aType - bias = randn(T, affine_shape..., 1) |> aType - return x, scale, bias - else - return x, nothing, nothing - end -end - -@testset "$mode: LayerNorm" for (mode, aType, on_gpu) in MODES - for T in (Float16, Float32, Float64), - x_shape in ((3, 3, 2, 1), (2, 2, 2, 1), (2, 3, 2, 2)), - affine_shape in (nothing, x_shape[1:3], (1, 1, 1), (1, 1, x_shape[3])) - - T === Float16 && mode == "AMDGPU" && continue - - dims = Colon() - epsilon = T(1e-5) - _f = (args...) -> layernorm(args...; dims, epsilon) - - x, scale, bias = _setup_layernorm(aType, T, x_shape, affine_shape) - - @inferred _f(x, scale, bias) - @jet _f(x, scale, bias) - - y = _f(x, scale, bias) - - @test y isa aType{T, length(x_shape)} - @test size(y) == x_shape - - if affine_shape === nothing - @test check_approx(mean(y; dims), 0; atol=1e-3, rtol=1e-3) - @test check_approx(std(y; dims), 1; atol=1e-1, rtol=1e-1) - end - - fp16 = T == Float16 - if affine_shape !== nothing - __f = (args...) 
-> sum(_f(x, args...)) - @eval @test_gradients $__f $scale $bias soft_fail=$fp16 atol=1.0f-2 rtol=1.0f-2 gpu_testing=$on_gpu - end - end -end diff --git a/test/api/layernorm_tests.jl b/test/api/layernorm_tests.jl new file mode 100644 index 00000000..4cd2d9d4 --- /dev/null +++ b/test/api/layernorm_tests.jl @@ -0,0 +1,48 @@ +@testitem "Layer Normalization" setup=[SharedTestSetup] begin + using Statistics + + function _setup_layernorm(aType, T, x_size, affine_shape) + x = randn(T, x_size) |> aType + if affine_shape !== nothing + scale = randn(T, affine_shape..., 1) |> aType + bias = randn(T, affine_shape..., 1) |> aType + return x, scale, bias + else + return x, nothing, nothing + end + end + + @testset "$mode" for (mode, aType, on_gpu) in MODES + for T in (Float16, Float32, Float64), + x_shape in ((3, 3, 2, 1), (2, 2, 2, 1), (2, 3, 2, 2)), + affine_shape in (nothing, x_shape[1:3], (1, 1, 1), (1, 1, x_shape[3])) + + T === Float16 && mode == "AMDGPU" && continue + + dims = Colon() + epsilon = T(1e-5) + _f = (args...) -> layernorm(args...; dims, epsilon) + + x, scale, bias = _setup_layernorm(aType, T, x_shape, affine_shape) + + @inferred _f(x, scale, bias) + @jet _f(x, scale, bias) + + y = _f(x, scale, bias) + + @test y isa aType{T, length(x_shape)} + @test size(y) == x_shape + + if affine_shape === nothing + @test check_approx(mean(y; dims), 0; atol=1e-3, rtol=1e-3) + @test check_approx(std(y; dims), 1; atol=1e-1, rtol=1e-1) + end + + fp16 = T == Float16 + if affine_shape !== nothing + __f = (args...) -> sum(_f(x, args...)) + @eval @test_gradients $__f $scale $bias soft_fail=$fp16 atol=1.0f-2 rtol=1.0f-2 gpu_testing=$on_gpu + end + end + end +end diff --git a/test/aqua.jl b/test/aqua.jl deleted file mode 100644 index efe7d1e8..00000000 --- a/test/aqua.jl +++ /dev/null @@ -1,10 +0,0 @@ -using Aqua, ChainRulesCore, LuxLib, Test - -@testset "All Tests (except Ambiguity)" begin - Aqua.test_all(LuxLib; ambiguities=false) -end - -@testset "Ambiguity Tests" begin - # The exclusions are due to CRC.@nondifferentiable - Aqua.test_ambiguities(LuxLib; exclude=[ChainRulesCore.frule, Core.kwcall]) -end diff --git a/test/aqua_tests.jl b/test/aqua_tests.jl new file mode 100644 index 00000000..f339224a --- /dev/null +++ b/test/aqua_tests.jl @@ -0,0 +1,4 @@ +@testitem "Aqua: Quality Assurance" begin + using Aqua + Aqua.test_all(LuxLib) +end diff --git a/test/ext/LuxLibForwardDiffExt.jl b/test/ext/LuxLibForwardDiffExt.jl deleted file mode 100644 index a76e29be..00000000 --- a/test/ext/LuxLibForwardDiffExt.jl +++ /dev/null @@ -1,17 +0,0 @@ -using LuxLib, ForwardDiff, Test - -include("../test_utils.jl") - -rng = get_stable_rng(12345) - -@testset "$mode: dropout" for (mode, aType, on_gpu) in MODES - x = randn(rng, Float32, 10, 2) |> aType - x_dual = ForwardDiff.Dual.(x) - - @test_nowarn dropout(rng, x_dual, 0.5f0, Val(true); dims=:) - - x_dropout = dropout(rng, x, 0.5f0, Val(true); dims=:)[1] - x_dual_dropout = ForwardDiff.value.(dropout(rng, x_dual, 0.5f0, Val(true); dims=:)[1]) - - @test check_approx(x_dropout, x_dual_dropout) -end diff --git a/test/forwarddiff_tests.jl b/test/forwarddiff_tests.jl new file mode 100644 index 00000000..63139883 --- /dev/null +++ b/test/forwarddiff_tests.jl @@ -0,0 +1,95 @@ +@testitem "Efficient JVPs" setup=[SharedTestSetup] begin + using ForwardDiff, Zygote, ComponentArrays + + struct LuxLibTestTag end + + # Computes (∂f/∂x)u + function jvp_forwarddiff(f, x, u) + uu = reshape(u, axes(x)) + y = ForwardDiff.Dual{ + typeof(ForwardDiff.Tag(LuxLibTestTag(), eltype(x))), eltype(x), + 
1}.(x, ForwardDiff.Partials.(tuple.(uu))) + return vec(ForwardDiff.partials.(vec(f(y)), 1)) + end + + function jvp_forwarddiff(f, x::ComponentArray, u) + xx = getdata(x) + uu = vec(u) + y = ComponentArray( + ForwardDiff.Dual{ + typeof(ForwardDiff.Tag(LuxLibTestTag(), eltype(x))), + eltype(x), 1}.(xx, ForwardDiff.Partials.(tuple.(uu))), + getaxes(x)) + return vec(ForwardDiff.partials.(vec(f(y)), 1)) + end + + ## This exists exclusively for testing. It has horrifying performance implications + function jvp_forwarddiff_concrete(f, x, u) + Jₓ = ForwardDiff.jacobian(f, x) + return Jₓ * vec(u) + end + + function jvp_zygote(f, x, u) + Jₓ = only(Zygote.jacobian(f, x)) + return Jₓ * vec(u) + end + + function test_jvp_computation(f, x, u, on_gpu) + jvp₁ = jvp_forwarddiff(f, x, u) + if !(x isa ComponentArray && on_gpu) + # ComponentArray + ForwardDiff on GPU don't play nice + jvp₂ = jvp_forwarddiff_concrete(f, x, u) + @test check_approx(jvp₁, jvp₂; atol=1e-5, rtol=1e-5) + + jvp₃ = jvp_zygote(f, x, u) + @test check_approx(jvp₁, jvp₃; atol=1e-5, rtol=1e-5) + end + end + + @testset "$(mode): Jacobian Vector Products" for (mode, aType, on_gpu) in MODES + @testset "$(op)(; flipped = $flipped)" for flipped in (true, false), + op in (depthwiseconv, conv) + + op === depthwiseconv && on_gpu && continue + + input_dims = [(2, 4, 2, 1, 3), (4, 4, 1, 3), (4, 4, 3, 2), (4, 1, 3), (4, 3, 2)] + weight_dims = if op === conv + [(2, 2, 2, 1, 4), (3, 3, 1, 4), (3, 3, 3, 2), (3, 1, 4), (3, 3, 2)] + else + [(2, 2, 2, 1, 1), (3, 3, 1, 1), (3, 3, 3, 3), (3, 1, 1), (3, 3, 3)] + end + + @testset "Input Dims: $(in_dims) | Weight Dims: $(w_dims)" for (in_dims, w_dims) in zip( + input_dims, weight_dims) + x = randn(Float32, in_dims...) |> aType + w = randn(Float32, w_dims...) |> aType + ux = randn(Float32, size(x)...) |> aType + uw = randn(Float32, size(w)...) 
|> aType + u = randn(Float32, length(x) + length(w)) |> aType + + test_jvp_computation(x -> op(x, w; flipped), x, ux, on_gpu) + test_jvp_computation(w -> op(x, w; flipped), w, uw, on_gpu) + test_jvp_computation(xw -> op(xw.x, xw.w; flipped), ComponentArray(; x, w), + u, on_gpu) + end + end + end +end + +@testitem "ForwardDiff dropout" setup=[SharedTestSetup] begin + using ForwardDiff + + rng = get_stable_rng(12345) + + @testset "$mode: dropout" for (mode, aType, on_gpu) in MODES + x = randn(rng, Float32, 10, 2) |> aType + x_dual = ForwardDiff.Dual.(x) + + @test_nowarn dropout(rng, x_dual, 0.5f0, Val(true); dims=:) + + x_dropout = dropout(rng, x, 0.5f0, Val(true); dims=:)[1] + x_dual_dropout = ForwardDiff.value.(dropout(rng, x_dual, 0.5f0, Val(true); dims=:)[1]) + + @test check_approx(x_dropout, x_dual_dropout) + end +end diff --git a/test/jvp.jl b/test/jvp.jl deleted file mode 100644 index 17e72363..00000000 --- a/test/jvp.jl +++ /dev/null @@ -1,75 +0,0 @@ -using LuxLib, ForwardDiff, Zygote, Test -using ComponentArrays - -include("test_utils.jl") - -struct LuxLibTestTag end - -# Computes (∂f/∂x)u -function jvp_forwarddiff(f, x, u) - uu = reshape(u, axes(x)) - y = ForwardDiff.Dual{typeof(ForwardDiff.Tag(LuxLibTestTag(), eltype(x))), eltype(x), - 1}.(x, ForwardDiff.Partials.(tuple.(uu))) - return vec(ForwardDiff.partials.(vec(f(y)), 1)) -end - -function jvp_forwarddiff(f, x::ComponentArray, u) - xx = getdata(x) - uu = vec(u) - y = ComponentArray(ForwardDiff.Dual{typeof(ForwardDiff.Tag(LuxLibTestTag(), - eltype(x))), eltype(x), 1}.(xx, ForwardDiff.Partials.(tuple.(uu))), - getaxes(x)) - return vec(ForwardDiff.partials.(vec(f(y)), 1)) -end - -## This exists exclusively for testing. It has horrifying performance implications -function jvp_forwarddiff_concrete(f, x, u) - Jₓ = ForwardDiff.jacobian(f, x) - return Jₓ * vec(u) -end - -function jvp_zygote(f, x, u) - Jₓ = only(Zygote.jacobian(f, x)) - return Jₓ * vec(u) -end - -function test_jvp_computation(f, x, u, on_gpu) - jvp₁ = jvp_forwarddiff(f, x, u) - if !(x isa ComponentArray && on_gpu) - # ComponentArray + ForwardDiff on GPU don't play nice - jvp₂ = jvp_forwarddiff_concrete(f, x, u) - @test check_approx(jvp₁, jvp₂; atol=1e-5, rtol=1e-5) - - jvp₃ = jvp_zygote(f, x, u) - @test check_approx(jvp₁, jvp₃; atol=1e-5, rtol=1e-5) - end -end - -@testset "$mode: Jacobian Vector Products" for (mode, aType, on_gpu) in MODES - @testset "$(op)(; flipped = $flipped)" for flipped in (true, false), - op in (depthwiseconv, conv) - - op === depthwiseconv && on_gpu && continue - - input_dims = [(2, 4, 2, 1, 3), (4, 4, 1, 3), (4, 4, 3, 2), (4, 1, 3), (4, 3, 2)] - weight_dims = if op === conv - [(2, 2, 2, 1, 4), (3, 3, 1, 4), (3, 3, 3, 2), (3, 1, 4), (3, 3, 2)] - else - [(2, 2, 2, 1, 1), (3, 3, 1, 1), (3, 3, 3, 3), (3, 1, 1), (3, 3, 3)] - end - - @testset "Input Dims: $(in_dims) | Weight Dims: $(w_dims)" for (in_dims, w_dims) in zip(input_dims, - weight_dims) - x = randn(Float32, in_dims...) |> aType - w = randn(Float32, w_dims...) |> aType - ux = randn(Float32, size(x)...) |> aType - uw = randn(Float32, size(w)...) 
|> aType - u = randn(Float32, length(x) + length(w)) |> aType - - test_jvp_computation(x -> op(x, w; flipped), x, ux, on_gpu) - test_jvp_computation(w -> op(x, w; flipped), w, uw, on_gpu) - test_jvp_computation(xw -> op(xw.x, xw.w; flipped), ComponentArray(; x, w), u, - on_gpu) - end - end -end diff --git a/test/runtests.jl b/test/runtests.jl index 56b1d384..8ba7978a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,18 +1,3 @@ -using SafeTestsets, Test, TestSetExtensions +using ReTestItems -@testset ExtendedTestSet "LuxLib" begin - @safetestset "Dropout" include("api/dropout.jl") - - @testset "Normalization" begin - @safetestset "BatchNorm" include("api/batchnorm.jl") - @safetestset "GroupNorm" include("api/groupnorm.jl") - @safetestset "InstanceNorm" include("api/instancenorm.jl") - @safetestset "LayerNorm" include("api/layernorm.jl") - end - - @safetestset "ForwardDiff Extension" include("ext/LuxLibForwardDiffExt.jl") - - @safetestset "Efficient Jacobian-Vector-Products" include("jvp.jl") - - @safetestset "Aqua Tests" include("aqua.jl") -end +ReTestItems.runtests(@__DIR__) diff --git a/test/test_utils.jl b/test/shared_testsetup.jl similarity index 67% rename from test/test_utils.jl rename to test/shared_testsetup.jl index f671252a..886b20d6 100644 --- a/test/test_utils.jl +++ b/test/shared_testsetup.jl @@ -1,8 +1,9 @@ -using LuxLib, LuxTestUtils, StableRNGs, Test, Zygote -using LuxCUDA, LuxAMDGPU -using LuxTestUtils: @jet, @test_gradients, check_approx +@testsetup module SharedTestSetup +import Reexport: @reexport -CUDA.allowscalar(false) +using LuxLib, LuxCUDA, LuxAMDGPU +@reexport using LuxTestUtils, StableRNGs, Test, Zygote +import LuxTestUtils: @jet, @test_gradients, check_approx const GROUP = get(ENV, "GROUP", "All") @@ -26,3 +27,7 @@ end get_stable_rng(seed=12345) = StableRNG(seed) __istraining(::Val{training}) where {training} = training + +export cpu_testing, cuda_testing, amdgpu_testing, MODES, get_stable_rng, __istraining, + check_approx, @jet, @test_gradients +end