diff --git a/Project.toml b/Project.toml
index 62caa680..aaaccc94 100644
--- a/Project.toml
+++ b/Project.toml
@@ -10,6 +10,7 @@ Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 ReliabilityDiagrams = "e5f51471-6270-49e4-a15a-f1cfbff4f856"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [compat]
 julia = "1"
diff --git a/examples/batchensemble.jl b/examples/batchensemble.jl
new file mode 100644
index 00000000..cb44d2a5
--- /dev/null
+++ b/examples/batchensemble.jl
@@ -0,0 +1,207 @@
+## Classification of the MNIST dataset
+## with the convolutional neural network known as LeNet5.
+## This script also combines various
+## packages from the Julia ecosystem with Flux.
+using Flux
+using Flux.Data: DataLoader
+using Flux.Optimise: Optimiser, WeightDecay
+using Flux: onehotbatch, onecold, glorot_normal, label_smoothing
+using Flux.Losses: logitcrossentropy
+using Statistics, Random
+using Logging: with_logger
+using TensorBoardLogger: TBLogger, tb_overwrite, set_step!, set_step_increment!
+using ProgressMeter: @showprogress
+import MLDatasets
+import BSON
+using CUDA
+using Formatting
+
+using DeepUncertainty
+
+# LeNet5 "constructor".
+# The model can be adapted to any image size
+# and any number of output classes.
+function LeNet5(args; imgsize = (28, 28, 1), nclasses = 10)
+    out_conv_size = (imgsize[1] ÷ 4 - 3, imgsize[2] ÷ 4 - 3, 16)
+
+    return Chain(
+        ConvBatchEnsemble((5, 5), imgsize[end] => 6, args.rank, args.ensemble_size, relu),
+        MaxPool((2, 2)),
+        ConvBatchEnsemble((5, 5), 6 => 16, args.rank, args.ensemble_size, relu),
+        MaxPool((2, 2)),
+        flatten,
+        DenseBatchEnsemble(prod(out_conv_size), 120, args.rank, args.ensemble_size, relu),
+        DenseBatchEnsemble(120, 84, args.rank, args.ensemble_size, relu),
+        DenseBatchEnsemble(84, nclasses, args.rank, args.ensemble_size),
+    )
+end
+
+function get_data(args)
+    xtrain, ytrain = MLDatasets.MNIST.traindata(Float32)
+    xtest, ytest = MLDatasets.MNIST.testdata(Float32)
+
+    xtrain = reshape(xtrain, 28, 28, 1, :)
+    xtest = reshape(xtest, 28, 28, 1, :)
+
+    ytrain, ytest = onehotbatch(ytrain, 0:9), onehotbatch(ytest, 0:9)
+
+    train_loader = DataLoader(
+        (xtrain, ytrain),
+        batchsize = args.batchsize,
+        shuffle = true,
+        partial = false,
+    )
+    test_loader = DataLoader((xtest, ytest), batchsize = args.batchsize, partial = false)
+
+    return train_loader, test_loader
+end
+
+loss(ŷ, y) = logitcrossentropy(ŷ, y)
+
+function accuracy(preds, labels)
+    acc = sum(onecold(preds |> cpu) .== onecold(labels |> cpu))
+    return acc
+end
+
+function eval_loss_accuracy(args, loader, model, device)
+    l = [0.0f0 for x = 1:args.ensemble_size]
+    acc = [0 for x = 1:args.ensemble_size]
+    ece_list = [0.0f0 for x = 1:args.ensemble_size]
+    ntot = 0
+    mean_l = 0
+    mean_acc = 0
+    mean_ece = 0
+    for (x, y) in loader
+        x = repeat(x, 1, 1, 1, args.ensemble_size)
+        x, y = x |> device, y |> device
+        # Perform the forward pass
+        ŷ = model(x)
+        ŷ = softmax(ŷ, dims = 1)
+        # Reshape the predictions into [classes, batch_size, ensemble_size]
+        reshaped_ŷ = reshape(ŷ, size(ŷ)[1], args.batchsize, args.ensemble_size)
+        # Loop through each model's predictions
+        for ensemble = 1:args.ensemble_size
+            model_predictions = reshaped_ŷ[:, :, ensemble]
+            # Calculate the individual loss
+            l[ensemble] += loss(model_predictions, y) * size(model_predictions)[end]
+            acc[ensemble] += accuracy(model_predictions, y)
+            ece_list[ensemble] +=
+                ExpectedCalibrationError(model_predictions |> cpu, onecold(y |> cpu)) *
+                args.batchsize
+        end
+        # Get the mean predictions
+        mean_predictions = mean(reshaped_ŷ, dims = ndims(reshaped_ŷ))
+        mean_predictions = dropdims(mean_predictions, dims = ndims(mean_predictions))
+        mean_l += loss(mean_predictions, y) * size(mean_predictions)[end]
+        mean_acc += accuracy(mean_predictions, y)
+        mean_ece +=
+            ExpectedCalibrationError(mean_predictions |> cpu, onecold(y |> cpu)) *
+            args.batchsize
+        ntot += size(mean_predictions)[end]
+    end
+    # Normalize the per-model metrics
+    losses = [loss / ntot |> round4 for loss in l]
+    acc = [a / ntot * 100 |> round4 for a in acc]
+    ece_list = [x / ntot |> round4 for x in ece_list]
+    # Calculate the mean metrics
+    mean_l = mean_l / ntot |> round4
+    mean_acc = mean_acc / ntot * 100 |> round4
+    mean_ece = mean_ece / ntot |> round4
+
+    # Print each ensemble member's loss, accuracy and ECE
+    for ensemble = 1:args.ensemble_size
+        @info (format(
+            "Model {} Loss: {} Accuracy: {} ECE: {}",
+            ensemble,
+            losses[ensemble],
+            acc[ensemble],
+            ece_list[ensemble],
+        ))
+    end
+    @info (format(
+        "Mean Loss: {} Mean Accuracy: {} Mean ECE: {}",
+        mean_l,
+        mean_acc,
+        mean_ece,
+    ))
+    @info "==========================================================="
+    return nothing
+end
+
+## utility functions
+num_params(model) = sum(length, Flux.params(model))
+round4(x) = round(x, digits = 4)
+
+# arguments for the `train` function
+Base.@kwdef mutable struct Args
+    η = 3e-4             # learning rate
+    λ = 0                # L2 regularizer param, implemented as weight decay
+    batchsize = 32       # batch size
+    epochs = 10          # number of epochs
+    seed = 0             # set seed > 0 for reproducibility
+    use_cuda = true      # if true use cuda (if available)
+    infotime = 1         # report every `infotime` epochs
+    checktime = 5        # Save the model every `checktime` epochs. Set to 0 for no checkpoints.
+    savepath = "runs/"   # results path
+    rank = 1
+    ensemble_size = 4
+end
+
+function train(; kws...)
+    args = Args(; kws...)
+    args.seed > 0 && Random.seed!(args.seed)
+    use_cuda = args.use_cuda && CUDA.functional()
+
+    if use_cuda
+        device = gpu
+        @info "Training on GPU"
+    else
+        device = cpu
+        @info "Training on CPU"
+    end
+
+    ## DATA
+    train_loader, test_loader = get_data(args)
+    @info "Dataset MNIST: $(train_loader.nobs) train and $(test_loader.nobs) test examples"
+
+    ## MODEL AND OPTIMIZER
+    model = LeNet5(args) |> device
+    @info "LeNet5 model: $(num_params(model)) trainable params"
+
+    ps = Flux.params(model)
+
+    opt = ADAM(args.η)
+    if args.λ > 0 # add weight decay, equivalent to L2 regularization
+        opt = Optimiser(WeightDecay(args.λ), opt)
+    end
+
+    function report(epoch)
+        # @info "Train Metrics"
+        # eval_loss_accuracy(args, train_loader, model, device)
+        @info "Test metrics"
+        eval_loss_accuracy(args, test_loader, model, device)
+    end
+
+    ## TRAINING
+    @info "Start Training"
+    report(0)
+    for epoch = 1:args.epochs
+        @showprogress for (x, y) in train_loader
+            # Make copies of the batch, one per ensemble member
+            x = repeat(x, 1, 1, 1, args.ensemble_size)
+            y = repeat(y, 1, args.ensemble_size)
+            x, y = x |> device, y |> device
+            gs = Flux.gradient(ps) do
+                ŷ = model(x)
+                loss(ŷ, y)
+            end
+
+            Flux.Optimise.update!(opt, ps, gs)
+        end
+
+        ## Printing and logging
+        epoch % args.infotime == 0 && report(epoch)
+    end
+end
+
+train()
diff --git a/src/DeepUncertainty.jl b/src/DeepUncertainty.jl
index 5aacd657..74de5589 100644
--- a/src/DeepUncertainty.jl
+++ b/src/DeepUncertainty.jl
@@ -1,10 +1,17 @@
 module DeepUncertainty
+using Flux
+using Random
+using Flux: @functor, glorot_normal, create_bias
+
 # Export layers
 export MCLayer, MCDense, MCConv
+export DenseBatchEnsemble, ConvBatchEnsemble
 
 export mean_loglikelihood, brier_score, ExpectedCalibrationError, prediction_metrics
 
 include("metrics.jl")
 include("layers/mclayers.jl")
+include("layers/BatchEnsemble/dense.jl")
+include("layers/BatchEnsemble/conv.jl")
 
 end
diff --git a/src/layers/BatchEnsemble/conv.jl b/src/layers/BatchEnsemble/conv.jl
new file mode 100644
index 00000000..564c943b
--- /dev/null
+++ b/src/layers/BatchEnsemble/conv.jl
@@ -0,0 +1,145 @@
+"""
+    ConvBatchEnsemble(filter, in => out, rank,
+                      ensemble_size, σ = identity;
+                      stride = 1, pad = 0, dilation = 1,
+                      groups = 1, [bias, weight, init])
+    ConvBatchEnsemble(layer, alpha, gamma, ensemble_bias, ensemble_act, rank)
+
+Creates a convolutional BatchEnsemble layer. BatchEnsemble is a memory-efficient
+alternative to deep ensembles: a deep ensemble of size N trains N separate models,
+making the time and memory complexity O(N * complexity of one network).
+BatchEnsemble instead derives each member's weights from a shared weight matrix W
+and a pair of rank-1 vectors R (alpha) and S (gamma); the outer product RS' is
+multiplied element-wise with W. R and S are also called fast weights.
+
+Reference - https://arxiv.org/abs/2002.06715
+
+During both training and testing, we repeat the samples along the batch dimension
+N times, where N is the ensemble_size. For example, if each mini batch has 10 samples
+and our ensemble size is 4, then the actual input to the layer has 40 samples.
+The output of the layer has 40 samples as well, and each group of 10 samples can be
+considered the output of one ensemble member.
+
+# Fields
+- `layer`: The conv layer which transforms the perturbed input to output
+- `alpha`: The first fast weight of size (in_dim, ensemble_size)
+- `gamma`: The second fast weight of size (out_dim, ensemble_size)
+- `ensemble_bias`: Bias added to the ensemble output, separate from the conv layer bias
+- `ensemble_act`: The activation function applied to the ensemble output
+- `rank`: Rank of the fast weights (rank > 1 doesn't work on GPU for now)
+
+# Arguments
+- `filter::NTuple{N,Integer}`: Kernel dimensions, e.g. (5, 5)
+- `ch::Pair{<:Integer,<:Integer}`: Input channels => output channels
+- `rank::Integer`: Rank of the fast weights
+- `ensemble_size::Integer`: Number of models in the ensemble
+- `σ::F=identity`: Activation of the conv layer, defaults to identity
+- `init=glorot_normal`: Initialization function, defaults to glorot_normal
+- `alpha_init=glorot_normal`: Initialization function for the alpha fast weight,
+    defaults to glorot_normal
+- `gamma_init=glorot_normal`: Initialization function for the gamma fast weight,
+    defaults to glorot_normal
+- `bias::Bool=true`: Toggle the usage of bias in the conv layer
+- `ensemble_bias::Bool=true`: Toggle the usage of ensemble bias
+- `ensemble_act::F=identity`: Activation function for ensemble outputs
+"""
+struct ConvBatchEnsemble{L,F,M,B}
+    layer::L
+    alpha::M
+    gamma::M
+    ensemble_bias::B
+    ensemble_act::F
+    rank::Any
+    function ConvBatchEnsemble(
+        layer::L,
+        alpha::M,
+        gamma::M,
+        ensemble_bias = true,
+        ensemble_act::F = identity,
+        rank = 1,
+    ) where {M,F,L}
+        ensemble_bias = create_bias(gamma, ensemble_bias, size(gamma)[1], size(gamma)[2])
+        new{typeof(layer),F,M,typeof(ensemble_bias)}(
+            layer,
+            alpha,
+            gamma,
+            ensemble_bias,
+            ensemble_act,
+            rank,
+        )
+    end
+end
+
+function ConvBatchEnsemble(
+    k::NTuple{N,Integer},
+    ch::Pair{<:Integer,<:Integer},
+    rank::Integer,
+    ensemble_size::Integer,
+    σ = identity;
+    init = glorot_normal,
+    alpha_init = glorot_normal,
+    gamma_init = glorot_normal,
+    stride = 1,
+    pad = 0,
+    dilation = 1,
+    groups = 1,
+    bias = true,
+    ensemble_bias = true,
+    ensemble_act = identity,
+) where {N}
+    layer = Flux.Conv(
+        k,
+        ch,
+        σ;
+        stride = stride,
+        pad = pad,
+        dilation = dilation,
+        init = init,
+        groups = groups,
+        bias = bias,
+    )
+    in_dim = ch[1]
+    out_dim = ch[2]
+    if rank >= 1
+        alpha_shape = (in_dim, ensemble_size)
+        gamma_shape = (out_dim, ensemble_size)
+    else
+        error("Rank must be >= 1.")
+    end
+    alpha = alpha_init(alpha_shape)
+    gamma = gamma_init(gamma_shape)
+
+    return ConvBatchEnsemble(layer, alpha, gamma, ensemble_bias, ensemble_act, rank)
+end
+
+@functor ConvBatchEnsemble
+
+function (be::ConvBatchEnsemble)(x)
+    # Conv BatchEnsemble params
+    layer = be.layer
+    alpha = be.alpha
+    gamma = be.gamma
+    e_b = be.ensemble_bias
+    e_σ = be.ensemble_act
+
+    batch_size = size(x)[end]
+    in_size = size(alpha)[1]
+    out_size = size(gamma)[1]
+    ensemble_size = size(alpha)[2]
+    samples_per_model = batch_size ÷ ensemble_size
+
+    # Alpha, gamma shapes - [channels, ensemble_size]
+    e_b = repeat(e_b, samples_per_model)
+    alpha = repeat(alpha, samples_per_model)
+    gamma = repeat(gamma, samples_per_model)
+    # Reshape to [1, 1, channels, batch_size] for broadcasting over feature maps
+    e_b = reshape(e_b, (1, 1, out_size, batch_size))
+    alpha = reshape(alpha, (1, 1, in_size, batch_size))
+    gamma = reshape(gamma, (1, 1, out_size, batch_size))
+
+    perturbed_x = x .* alpha
+    output = layer(perturbed_x) .* gamma
+    output = e_σ.(output .+ e_b)
+
+    return output
+end
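The rank-1 construction described in the docstrings above is small enough to verify by hand. The sketch below is illustrative only and not part of this patch; it assumes a plain dense-style weight `W` with no bias and made-up dimensions, and checks that perturbing the input by `alpha` and the output by `gamma` matches using the explicit per-member weight `W .* (gamma * alpha')`.

```julia
# Illustrative sketch, not part of the patch: BatchEnsemble's per-member weight
# is the shared weight W scaled element-wise by a rank-1 outer product of the
# fast weights; scaling the input by alpha and the output by gamma gives the
# same result without materialising a separate weight per member.
using LinearAlgebra

in_dim, out_dim, ensemble_size = 5, 3, 4
W = randn(Float32, out_dim, in_dim)             # shared (slow) weight
alpha = randn(Float32, in_dim, ensemble_size)   # fast weight R
gamma = randn(Float32, out_dim, ensemble_size)  # fast weight S

x = randn(Float32, in_dim)
i = 2  # any ensemble member

W_i = W .* (gamma[:, i] * alpha[:, i]')         # explicit member weight, W ∘ (S_i R_i')
y_explicit = W_i * x
y_fast = (W * (x .* alpha[:, i])) .* gamma[:, i]

@assert y_explicit ≈ y_fast
```

This equivalence is why the layers below only store `alpha` and `gamma` per member and keep a single shared `Dense`/`Conv` layer.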
diff --git a/src/layers/BatchEnsemble/dense.jl b/src/layers/BatchEnsemble/dense.jl
new file mode 100644
index 00000000..9ef92921
--- /dev/null
+++ b/src/layers/BatchEnsemble/dense.jl
@@ -0,0 +1,151 @@
+"""
+    DenseBatchEnsemble(in, out, rank,
+                       ensemble_size,
+                       σ=identity;
+                       bias=true,
+                       init=glorot_normal,
+                       alpha_init=glorot_normal,
+                       gamma_init=glorot_normal)
+    DenseBatchEnsemble(layer, alpha, gamma, ensemble_bias, ensemble_act, rank)
+
+Creates a dense BatchEnsemble layer. BatchEnsemble is a memory-efficient
+alternative to deep ensembles: a deep ensemble of size N trains N separate models,
+making the time and memory complexity O(N * complexity of one network).
+BatchEnsemble instead derives each member's weights from a shared weight matrix W
+and a pair of rank-1 vectors R (alpha) and S (gamma); the outer product RS' is
+multiplied element-wise with W. R and S are also called fast weights.
+
+Reference - https://arxiv.org/abs/2002.06715
+
+During both training and testing, we repeat the samples along the batch dimension
+N times, where N is the ensemble_size. For example, if each mini batch has 10 samples
+and our ensemble size is 4, then the actual input to the layer has 40 samples.
+The output of the layer has 40 samples as well, and each group of 10 samples can be
+considered the output of one ensemble member.
+
+# Fields
+- `layer`: The dense layer which transforms the perturbed input to output
+- `alpha`: The first fast weight of size (in_dim, ensemble_size)
+- `gamma`: The second fast weight of size (out_dim, ensemble_size)
+- `ensemble_bias`: Bias added to the ensemble output, separate from the dense layer bias
+- `ensemble_act`: The activation function applied to the ensemble output
+- `rank`: Rank of the fast weights (rank > 1 doesn't work on GPU for now)
+
+# Arguments
+- `in::Integer`: Input dimension of features
+- `out::Integer`: Output dimension of features
+- `rank::Integer`: Rank of the fast weights
+- `ensemble_size::Integer`: Number of models in the ensemble
+- `σ::F=identity`: Activation of the dense layer, defaults to identity
+- `init=glorot_normal`: Initialization function, defaults to glorot_normal
+- `alpha_init=glorot_normal`: Initialization function for the alpha fast weight,
+    defaults to glorot_normal
+- `gamma_init=glorot_normal`: Initialization function for the gamma fast weight,
+    defaults to glorot_normal
+- `bias::Bool=true`: Toggle the usage of bias in the dense layer
+- `ensemble_bias::Bool=true`: Toggle the usage of ensemble bias
+- `ensemble_act::F=identity`: Activation function for ensemble outputs
+"""
+struct DenseBatchEnsemble{L,F,M,B}
+    layer::L
+    alpha::M
+    gamma::M
+    ensemble_bias::B
+    ensemble_act::F
+    rank::Any
+    function DenseBatchEnsemble(
+        layer::L,
+        alpha::M,
+        gamma::M,
+        ensemble_bias = true,
+        ensemble_act::F = identity,
+        rank = 1,
+    ) where {M,F,L}
+        ensemble_bias = create_bias(gamma, ensemble_bias, size(gamma)[1], size(gamma)[2])
+        new{typeof(layer),F,M,typeof(ensemble_bias)}(
+            layer,
+            alpha,
+            gamma,
+            ensemble_bias,
+            ensemble_act,
+            rank,
+        )
+    end
+end
+
+function DenseBatchEnsemble(
+    in::Integer,
+    out::Integer,
+    rank::Integer,
+    ensemble_size::Integer,
+    σ = identity;
+    init = glorot_normal,
+    alpha_init = glorot_normal,
+    gamma_init = glorot_normal,
+    bias = true,
+    ensemble_bias = true,
+    ensemble_act = identity,
+)
+    layer = Flux.Dense(in, out, σ; init = init, bias = bias)
+    if rank >= 1
+        alpha_shape = (in, ensemble_size, rank)
+        gamma_shape = (out, ensemble_size, rank)
+    else
+        error("Rank must be >= 1.")
+    end
+    alpha = alpha_init(alpha_shape)
+    gamma = gamma_init(gamma_shape)
+
+    return DenseBatchEnsemble(layer, alpha, gamma, ensemble_bias, ensemble_act, rank)
+end
+
+@functor DenseBatchEnsemble
+
+"""
+The forward pass for a DenseBatchEnsemble layer. The input is initially perturbed
+using the first fast weight, then passed through the dense layer, and finally
+multiplied by the second fast weight.
+
+# Arguments
+- `x::AbstractVecOrMat`: Input tensors
+"""
+function (be::DenseBatchEnsemble)(x)
+    layer = be.layer
+    alpha = be.alpha
+    gamma = be.gamma
+    e_b = be.ensemble_bias
+    e_σ = be.ensemble_act
+    rank = be.rank
+
+    batch_size = size(x)[end]
+    in_size = size(alpha)[1]
+    out_size = size(gamma)[1]
+    ensemble_size = size(alpha)[2]
+    samples_per_model = batch_size ÷ ensemble_size
+
+    # Alpha, gamma shapes - [units, ensembles, rank]
+    alpha = reshape(alpha, (in_size, ensemble_size * rank))
+    gamma = reshape(gamma, (out_size, ensemble_size * rank))
+    # Repeat breaks on GPU when input dims > 2
+    alpha = repeat(alpha, samples_per_model)
+    gamma = repeat(gamma, samples_per_model)
+    # Reshape alpha, gamma to [units, batch_size, rank]
+    alpha = reshape(alpha, (in_size, batch_size, rank))
+    gamma = reshape(gamma, (out_size, batch_size, rank))
+    # Reshape inputs to [units, batch_size, 1] for broadcasting
+    x = Flux.unsqueeze(x, (ndims(x) + 1))
+    # Perturb the inputs
+    perturbed_x = x .* alpha
+    # Dense layer forward pass
+    outputs = layer(perturbed_x) .* gamma
+    # Reduce the rank dimension by summing over it
+    outputs = sum(outputs, dims = 3)
+    outputs = reshape(outputs, (out_size, samples_per_model, ensemble_size))
+    # Reshape the ensemble bias for broadcasting
+    e_b = Flux.unsqueeze(e_b, ndims(e_b))
+
+    outputs = e_σ.(outputs .+ e_b)
+    outputs = reshape(outputs, (out_size, batch_size))
+    return outputs
+end
diff --git a/test/cuda/layers/batchensemble_gpu.jl b/test/cuda/layers/batchensemble_gpu.jl
new file mode 100644
index 00000000..2d6ed695
--- /dev/null
+++ b/test/cuda/layers/batchensemble_gpu.jl
@@ -0,0 +1,77 @@
+@testset "Dense batchensemble" begin
+    ensemble_size = 4
+    samples_per_model = 4
+    input_dim = 5
+    output_dim = 5
+    rank = 1
+    inputs = rand(Float32, input_dim, samples_per_model)
+    layer = DenseBatchEnsemble(
+        input_dim,
+        output_dim,
+        rank,
+        ensemble_size;
+        alpha_init = ones,
+        gamma_init = ones,
+    )
+    layer = layer |> gpu
+    batch_inputs = gpu(repeat(inputs, 1, ensemble_size))
+    batch_outputs = layer(batch_inputs)
+    # Do the computation in a for loop to compare outputs
+    layer = layer |> cpu
+    loop_outputs = []
+    for i = 1:ensemble_size
+        perturbed_inputs = inputs .* layer.alpha[i]
+        outputs = layer.layer(perturbed_inputs) .* layer.gamma[i]
+        outputs = layer.ensemble_act.(outputs .+ layer.ensemble_bias[i])
+        push!(loop_outputs, outputs)
+    end
+    loop_outputs = Flux.batch(loop_outputs)
+    loop_outputs = reshape(loop_outputs, (output_dim, samples_per_model * ensemble_size))
+    @test batch_outputs isa CuArray
+    @test size(batch_outputs) == size(loop_outputs)
+    @test isapprox(cpu(batch_outputs), loop_outputs, atol = 0.05)
+end
+
+@testset "ConvBatchEnsemble" begin
+    ensemble_size = 4
+    samples_per_model = 4
+    input_dim = 5
+    output_dim = 10
+    rank = 1
+    inputs = rand(Float32, 10, 10, input_dim, samples_per_model)
+    beconv = ConvBatchEnsemble(
+        (5, 5),
+        5 => 10,
+        rank,
+        ensemble_size,
+        relu;
+        alpha_init = ones,
+        gamma_init = ones,
+    )
+    beconv = beconv |> gpu
+    batch_inputs = gpu(repeat(inputs, 1, 1, 1, ensemble_size))
+    batch_outputs = beconv(batch_inputs)
+    # Do the computation in a for loop to compare outputs
+    beconv = beconv |> cpu
+    loop_outputs = []
+    for i = 1:ensemble_size
+        perturbed_inputs = inputs .* beconv.alpha[i]
+        outputs = beconv.layer(perturbed_inputs) .* beconv.gamma[i]
+        outputs = beconv.ensemble_act.(outputs .+ beconv.ensemble_bias[i])
+        push!(loop_outputs, outputs)
+    end
+    loop_outputs = Flux.batch(loop_outputs)
+    loop_outputs_size = size(batch_outputs)
+    loop_outputs = reshape(
+        loop_outputs,
+        (
+            loop_outputs_size[1],
+            loop_outputs_size[2],
+            output_dim,
+            samples_per_model * ensemble_size,
+        ),
+    )
+    @test batch_outputs isa CuArray
+    @test size(batch_outputs) == size(loop_outputs)
+    @test isapprox(cpu(batch_outputs), loop_outputs, atol = 0.05)
+end
diff --git a/test/cuda/layers/mclayers_gpu.jl b/test/cuda/layers/mclayers_gpu.jl
new file mode 100644
index 00000000..f176cfbf
--- /dev/null
+++ b/test/cuda/layers/mclayers_gpu.jl
@@ -0,0 +1,37 @@
+function test_sparsity(x, target_sparsity; atol = 0.05)
+    number_of_zeros = count(ele -> (ele == 0.0), x)
+    sparsity = number_of_zeros / sum(length, x)
+    @test isapprox(target_sparsity, sparsity; atol)
+end
+
+@testset "MC Dense GPU" begin
+    dropout_rate = 0.35
+    # Test the MC Dense layer
+    a = gpu(rand(Float32, 8, 32))
+    layer = gpu(MCDense(8, 16, dropout_rate))
+    output = layer(a)
+    # Test that the output is a CuArray
+    @test output isa CuArray
+    @test isequal(size(output), (16, 32))
+    test_sparsity(output, dropout_rate)
+    # Test the MC dense dropout toggle
+    output = layer(a, dropout = false)
+    test_sparsity(output, 0)
+end
+
+@testset "MC Conv GPU" begin
+    dropout_rate = 0.4
+    # Test the MC conv layer
+    a = gpu(rand(Float32, 32, 32, 3, 32))
+    layer = MCConv((5, 5), 3 => 6, dropout_rate) |> gpu
+    output = layer(a)
+    # Test that the output is a CuArray
+    @test output isa CuArray
+    # Test the output shape
+    @test isequal(size(output), (28, 28, 6, 32))
+    # Test the sparsity percentage in the array
+    test_sparsity(output, dropout_rate)
+    # Test the MC conv dropout toggle
+    output = layer(a, dropout = false)
+    test_sparsity(output, 0)
+end
diff --git a/test/cuda/runtests.jl b/test/cuda/runtests.jl
new file mode 100644
index 00000000..8e9a4141
--- /dev/null
+++ b/test/cuda/runtests.jl
@@ -0,0 +1,7 @@
+using Flux, Test, CUDA
+
+@info "Testing GPU Support"
+CUDA.allowscalar(false)
+
+include("layers/mclayers_gpu.jl")
+include("layers/batchensemble_gpu.jl")
diff --git a/test/layers/batchensemble.jl b/test/layers/batchensemble.jl
new file mode 100644
index 00000000..4107178e
--- /dev/null
+++ b/test/layers/batchensemble.jl
@@ -0,0 +1,72 @@
+@testset "Dense batchensemble" begin
+    ensemble_size = 4
+    samples_per_model = 4
+    input_dim = 5
+    output_dim = 5
+    rank = 1
+    inputs = rand(Float32, input_dim, samples_per_model)
+    layer = DenseBatchEnsemble(
+        input_dim,
+        output_dim,
+        rank,
+        ensemble_size;
+        alpha_init = ones,
+        gamma_init = ones,
+    )
+    batch_inputs = repeat(inputs, 1, ensemble_size)
+    batch_outputs = layer(batch_inputs)
+    # Do the computation in a for loop to compare outputs
+    loop_outputs = []
+    for i = 1:ensemble_size
+        perturbed_inputs = inputs .* layer.alpha[i]
+        outputs = layer.layer(perturbed_inputs) .* layer.gamma[i]
+        outputs = layer.ensemble_act.(outputs .+ layer.ensemble_bias[i])
+        push!(loop_outputs, outputs)
+    end
+    loop_outputs = Flux.batch(loop_outputs)
+    loop_outputs = reshape(loop_outputs, (output_dim, samples_per_model * ensemble_size))
+    @test size(batch_outputs) == size(loop_outputs)
+    @test isapprox(batch_outputs, loop_outputs, atol = 0.05)
+end
+
+@testset "ConvBatchEnsemble" begin
+    ensemble_size = 4
+    samples_per_model = 4
+    input_dim = 5
+    output_dim = 10
+    rank = 1
+    inputs = rand(Float32, 10, 10, input_dim, samples_per_model)
+    beconv = ConvBatchEnsemble(
+        (5, 5),
+        5 => 10,
+        rank,
+        ensemble_size,
+        relu;
+        alpha_init = ones,
+        gamma_init = ones,
+    )
+    batch_inputs = repeat(inputs, 1, 1, 1, ensemble_size)
+    batch_outputs = beconv(batch_inputs)
+
+    # Do the computation in a for loop to compare outputs
+    loop_outputs = []
+    for i = 1:ensemble_size
+        perturbed_inputs = inputs .* beconv.alpha[i]
+        outputs = beconv.layer(perturbed_inputs) .* beconv.gamma[i]
+        outputs = beconv.ensemble_act.(outputs .+ beconv.ensemble_bias[i])
+        push!(loop_outputs, outputs)
+    end
+    loop_outputs = Flux.batch(loop_outputs)
+    loop_outputs_size = size(batch_outputs)
+    loop_outputs = reshape(
+        loop_outputs,
+        (
+            loop_outputs_size[1],
+            loop_outputs_size[2],
+            output_dim,
+            samples_per_model * ensemble_size,
+        ),
+    )
+    @test size(batch_outputs) == size(loop_outputs)
+    @test isapprox(batch_outputs, loop_outputs, atol = 0.05)
+end
diff --git a/test/layers/mclayers_test.jl b/test/layers/mclayers.jl
similarity index 50%
rename from test/layers/mclayers_test.jl
rename to test/layers/mclayers.jl
index 9d963b4a..1e8c2627 100644
--- a/test/layers/mclayers_test.jl
+++ b/test/layers/mclayers.jl
@@ -1,5 +1,8 @@
-using Test
-using DeepUncertainty: MCDense, MCConv
+function test_sparsity(x, target_sparsity; atol = 0.05)
+    number_of_zeros = count(ele -> (ele == 0.0), x)
+    sparsity = number_of_zeros / sum(length, x)
+    @test isapprox(target_sparsity, sparsity; atol)
+end
 
 @testset "MC Dense" begin
     dropout_rate = 0.35
@@ -7,16 +10,11 @@ using DeepUncertainty: MCDense, MCConv
     a = rand(Float32, 8, 32)
     layer = MCDense(8, 16, dropout_rate)
     output = layer(a)
-    number_of_zeros = count(x -> (x == 0.0), output)
-    sparsity = number_of_zeros / sum(length, output)
     @test isequal(size(output), (16, 32))
-    @test isapprox(dropout_rate, sparsity; atol = 0.05)
-
+    test_sparsity(output, dropout_rate)
     # Test MC dense dropout toggle
     output = layer(a, dropout = false)
-    number_of_zeros = count(x -> (x == 0.0), output)
-    sparsity = number_of_zeros / sum(length, output)
-    @test isapprox(0, sparsity; atol = 0.05)
+    test_sparsity(output, 0)
 end
 
 @testset "MC Conv" begin
@@ -25,16 +23,11 @@ end
     a = rand(Float32, 32, 32, 3, 32)
     layer = MCConv((5, 5), 3 => 6, dropout_rate)
     output = layer(a)
-    number_of_zeros = count(x -> (x == 0.0), output)
-    sparsity = number_of_zeros / sum(length, output)
     # Test the output shape
     @test isequal(size(output), (28, 28, 6, 32))
     # Test the sparsity percentage in the array
-    @test isapprox(dropout_rate, sparsity; atol = 0.05)
-
+    test_sparsity(output, dropout_rate)
     # Test MC conv dropout toggle
     output = layer(a, dropout = false)
-    number_of_zeros = count(x -> (x == 0.0), output)
-    sparsity = number_of_zeros / sum(length, output)
-    @test isapprox(0, sparsity; atol = 0.05)
+    test_sparsity(output, 0)
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index 99d1e32a..476d7605 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,7 +1,20 @@
 using DeepUncertainty
 using Test
+using Flux
+using Flux.CUDA
+using Flux: cpu, gpu
 
 @testset "Layers" begin
-    # Test the layers
-    include("./layers/mclayers_test.jl")
+    # MC layers
+    include("./layers/mclayers.jl")
+    # Batch ensemble layers
+    include("./layers/batchensemble.jl")
+end
+
+@testset "CUDA" begin
+    if CUDA.functional()
+        include("cuda/runtests.jl")
+    else
+        @warn "CUDA unavailable, not testing GPU support"
+    end
 end
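For reviewers who want to exercise the new layers outside the MNIST script, the snippet below shows the intended call pattern: repeat the batch `ensemble_size` times, run the forward pass, and reshape to recover per-member predictions. It is a hedged sketch rather than part of the diff; it relies only on the `DenseBatchEnsemble` export added above, uses made-up dimensions, and assumes the batch size is a multiple of `ensemble_size` (as the layers require).

```julia
# Minimal usage sketch (not part of the diff), mirroring examples/batchensemble.jl.
using Flux, Statistics
using DeepUncertainty

ensemble_size, rank = 4, 1
model = Chain(
    DenseBatchEnsemble(10, 32, rank, ensemble_size, relu),
    DenseBatchEnsemble(32, 3, rank, ensemble_size),
)

x = rand(Float32, 10, 8)              # a mini batch of 8 samples
x = repeat(x, 1, ensemble_size)       # 32 samples: one copy per ensemble member
ŷ = softmax(model(x), dims = 1)

# Split the batch back into [classes, samples, members] and average over members
per_member = reshape(ŷ, 3, 8, ensemble_size)
mean_prediction = dropdims(mean(per_member, dims = 3), dims = 3)
```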