Implement fixes for rstar #52

Merged · 24 commits · Dec 14, 2022
Changes from 10 commits
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
 name = "MCMCDiagnosticTools"
 uuid = "be115224-59cd-429b-ad48-344e309966f0"
 authors = ["David Widmann"]
-version = "0.2.0"
+version = "0.2.1"

 [deps]
 AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
1 change: 1 addition & 0 deletions src/MCMCDiagnosticTools.jl
@@ -22,6 +22,7 @@ export mcse
 export rafterydiag
 export rstar

+include("utils.jl")
 include("bfmi.jl")
 include("discretediag.jl")
 include("ess.jl")
26 changes: 14 additions & 12 deletions src/rstar.jl
@@ -4,7 +4,8 @@
     classifier::MLJModelInterface.Supervised,
     samples,
     chain_indices::AbstractVector{Int};
-    subset::Real=0.8,
+    subset::Real=0.7,
+    nsplit::Int=2,
     verbosity::Int=0,
 )
@@ -23,26 +24,25 @@ function rstar(
     classifier::MLJModelInterface.Supervised,
     x,
     y::AbstractVector{Int};
-    subset::Real=0.8,
+    subset::Real=0.7,
+    nsplit::Int=2,
     verbosity::Int=0,
 )
     # checks
     MLJModelInterface.nrows(x) != length(y) && throw(DimensionMismatch())
     0 < subset < 1 || throw(ArgumentError("`subset` must be a number in (0, 1)"))

+    ysplit = split_chain_indices(y, nsplit)
+
     # randomly sub-select training and testing set
-    N = length(y)
-    Ntrain = round(Int, N * subset)
-    0 < Ntrain < N ||
+    train_ids, test_ids = shuffle_split_stratified(rng, ysplit, subset)
+    0 < length(train_ids) < length(y) ||
         throw(ArgumentError("training and test data subsets must not be empty"))
-    ids = Random.randperm(rng, N)
-    train_ids = view(ids, 1:Ntrain)
-    test_ids = view(ids, (Ntrain + 1):N)

     xtable = _astable(x)

     # train classifier on training data
-    ycategorical = MLJModelInterface.categorical(y)
+    ycategorical = MLJModelInterface.categorical(ysplit)
     xtrain = MLJModelInterface.selectrows(xtable, train_ids)
     fitresult, _ = MLJModelInterface.fit(
         classifier, verbosity, xtrain, ycategorical[train_ids]
@@ -79,7 +79,8 @@ end
     rng::Random.AbstractRNG=Random.default_rng(),
     classifier::MLJModelInterface.Supervised,
     samples::AbstractArray{<:Real,3};
-    subset::Real=0.8,
+    subset::Real=0.7,
+    nsplit::Int=2,
     verbosity::Int=0,
 )
@@ -91,8 +92,9 @@ This implementation is an adaption of algorithms 1 and 2 described by Lambert an

 The `classifier` has to be a supervised classifier of the MLJ framework (see the
 [MLJ documentation](https://alan-turing-institute.github.io/MLJ.jl/dev/list_of_supported_models/#model_list)
-for a list of supported models). It is trained with a `subset` of the samples. The training
-of the classifier can be inspected by adjusting the `verbosity` level.
+for a list of supported models). It is trained with a `subset` of the samples. Each chain
+is split into `nsplit` separate chains to additionally check for within-chain convergence.
+The training of the classifier can be inspected by adjusting the `verbosity` level.

 If the classifier is deterministic, i.e., if it predicts a class, the value of the ``R^*``
 statistic is returned (algorithm 1). If the classifier is probabilistic, i.e., if it outputs
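For orientation, a minimal sketch of how the revised keyword arguments fit together. The data are hypothetical, and the classifier import assumes the MLJXGBoostInterface package that the test suite also relies on; any supervised MLJ classifier would work:

using MCMCDiagnosticTools
using Statistics
using MLJXGBoostInterface: XGBoostClassifier  # assumed import; matches the classifier used in the tests

# hypothetical data: 1000 draws of 2 parameters from each of 4 chains, stacked row-wise
samples = randn(4_000, 2)
chain_indices = repeat(1:4; inner=1_000)

# train on 70% of the draws; split each chain in two to also detect
# within-chain convergence failures
dist = rstar(XGBoostClassifier(), samples, chain_indices; subset=0.7, nsplit=2)
mean(dist)  # ≈ 1 for well-mixed chains, approaching the number of split chains otherwise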
80 changes: 80 additions & 0 deletions src/utils.jl
@@ -0,0 +1,80 @@
"""
    indices_of_unique(x) -> Dict

Return a `Dict` whose keys are the unique elements of `x` and whose values are the
corresponding indices in `x`.
"""
function indices_of_unique(x)
    d = Dict{eltype(x), Vector{Int}}()
    for (i, xi) in enumerate(x)
        if haskey(d, xi)
            push!(d[xi], i)
        else
            d[xi] = [i]
        end
Member:
This can be made more efficient by not looking up the key twice. One could e.g. use

Suggested change:
-        if haskey(d, xi)
-            push!(d[xi], i)
-        else
-            d[xi] = [i]
-        end
+        d_xi = get!(d, xi) do
+            return Int[]
+        end
+        push!(d_xi, i)

Apart from that, it seems like a function that could exist e.g. in StatsBase (similar to proportionmap etc.). Did you check that?

Member Author:
I agree this would fit in StatsBase, but there currently is no such method (indexmap is closest). MLUtils has group_indices, which is equivalent, but the dependency is too heavy.

I found a few threads of people looking for this, e.g. https://discourse.julialang.org/t/is-there-a-function-similar-to-numpy-unique-with-inverse/80949, but with no clear answer.

An alternative would be to stick closer to NumPy's very useful return_inverse=True approach and return 2 vectors, basically the sorted keys and corresponding values. Either way, this could later be upstreamed to StatsBase.

Member:
Sounds good, I just want to make sure we use existing functionality. If it doesn't exist yet, that's unfortunate, but then, of course, we should use our own implementation.

Member:
Actually, it seems indicatormat returns the information we are interested in: https://juliastats.org/StatsBase.jl/stable/misc/#StatsBase.indicatormat But maybe it's not the desired output format for our purposes.

Member Author:
It's similar, yes, but a little clunky. E.g., here's how we could get the vector of indices:

using SparseArrays
map(first ∘ findnz ∘ sparse, eachslice(indicatormat(x; sparse=true); dims=1))

But I still think it makes more sense to try to upstream the functionality we want, since often something like what we want will be more convenient for the user.

Member:
Yes, I agree. That seems a bit inconvenient.

    end
    return d
end
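As a quick illustration of the helper's contract (a hypothetical snippet, not part of the diff):

x = [1, 2, 1, 3, 2]
d = MCMCDiagnosticTools.indices_of_unique(x)
# d[1] == [1, 3], d[2] == [2, 5], d[3] == [4]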

"""
split_chain_indices(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't there some existing splitting functionality for ess? Is the plan to merge these eventually?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not quite merge, because there are two different types of splitting we can consider. This approach supports ragged chains and is as a result more complex and doesn't discard any draws (instead dividing the remainder across the earlier splits).

For ess/rhat, we don't support ragged chains so would discard draws if necessary to keep them the same length after splitting. This implementation is much simpler and can be done in a non-allocating way with just reshape and view on a 3d array. This will be part of #22.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The existing splitting functionality copy_split! will go away.

chain_inds::AbstractVector{Int},
nsplit::Int=2,
) -> AbstractVector{Int}

Split each chain in `chain_inds` into `nsplit` chains.

For each chain in `chain_inds`, all entries are assumed to correspond to draws that have
been ordered by iteration number. The result is a vector of the same length as `chain_inds`
where each entry is the new index of the chain that the corresponding draw belongs to.
"""
function split_chain_indices(c::AbstractVector{<:Int}, nsplit::Int=2)
    cnew = similar(c)
    if nsplit == 1
        copyto!(cnew, c)
        return cnew
    end
    chain_indices = indices_of_unique(c)
    chain_ind = 0
    for chain in sort(collect(keys(chain_indices)))
        inds = chain_indices[chain]
        ndraws_per_split, rem = divrem(length(inds), nsplit)
        ilast = 0
        for j in 1:nsplit
            chain_ind += 1
            ndraws_this_split = ndraws_per_split + (j ≤ rem)
            i = ilast + 1
            ilast = i + ndraws_this_split - 1
            @views cnew[inds[i:ilast]] .= chain_ind
        end
    end
    return cnew
end
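To make the remainder handling concrete, a small hypothetical example: chain 1 has 5 draws and chain 2 has 4, so with nsplit=2 the first split of chain 1 receives the extra draw:

c = [1, 1, 1, 1, 1, 2, 2, 2, 2]
MCMCDiagnosticTools.split_chain_indices(c, 2)
# == [1, 1, 1, 2, 2, 3, 3, 4, 4]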

"""
shuffle_split_stratified(
rng::Random.AbstractRNG,
group_ids::AbstractVector,
frac::Real,
) -> (inds1, inds2)

Randomly split the indices of `group_ids` into two groups, where `frac` indices from each
group are in `inds1` and the remainder are in `inds2`.

This is used, for example, to split data into training and test data while preserving the
class balances.
"""
function shuffle_split_stratified(rng::Random.AbstractRNG, groups::AbstractVector, frac::Real)
sethaxen marked this conversation as resolved.
Show resolved Hide resolved
inds1 = Int[]
inds2 = Int[]
group_indices = indices_of_unique(groups)
sethaxen marked this conversation as resolved.
Show resolved Hide resolved
for group in keys(group_indices)
inds = group_indices[group]
N = length(inds)
N1 = round(Int, N * frac)
ids = Random.randperm(rng, N)
@views append!(inds1, inds[ids[1:N1]])
@views append!(inds2, inds[ids[(N1 + 1):N]])
sethaxen marked this conversation as resolved.
Show resolved Hide resolved
end
return inds1, inds2
end
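A hypothetical check of the stratified splitting described in the docstring (seed and group sizes chosen for illustration only):

using Random
rng = Random.MersenneTwister(42)
groups = repeat(1:2; inner=10)  # two groups of 10 draws each
train, test = MCMCDiagnosticTools.shuffle_split_stratified(rng, groups, 0.7)
# each group contributes round(Int, 0.7 * 10) == 7 of its indices to `train`
(count(in(1:10), train), count(in(11:20), train))  # (7, 7)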
17 changes: 14 additions & 3 deletions test/rstar.jl
@@ -30,7 +30,7 @@ const xgboost_deterministic = Pipeline(XGBoostClassifier(); operation=predict_mo
         @test dist isa LocationScale
         @test dist.ρ isa PoissonBinomial
         @test minimum(dist) == 0
-        @test maximum(dist) == 3
+        @test maximum(dist) == 6
     end
     @test mean(dist) ≈ 1 rtol = 0.2
     wrapper === Vector && break
@@ -48,7 +48,7 @@ const xgboost_deterministic = Pipeline(XGBoostClassifier(); operation=predict_mo
         @test dist isa LocationScale
         @test dist.ρ isa PoissonBinomial
         @test minimum(dist) == 0
-        @test maximum(dist) == 4
+        @test maximum(dist) == 8
     end
     @test mean(dist) ≈ 1 rtol = 0.15
@@ -58,7 +58,7 @@ const xgboost_deterministic = Pipeline(XGBoostClassifier(); operation=predict_mo
         100 .* cos.(1:N) 100 .* sin.(1:N)
     ])
     chain_indices = repeat(1:2; inner=N)
-    dist = rstar(classifier, samples, chain_indices)
+    dist = rstar(classifier, samples, chain_indices; nsplit=1)

     # Mean of the statistic should be close to 2, i.e., the classifier should be able to
     # learn an almost perfect decision boundary between chains.
@@ -71,6 +71,17 @@ const xgboost_deterministic = Pipeline(XGBoostClassifier(); operation=predict_mo
         @test maximum(dist) == 2
     end
     @test mean(dist) ≈ 2 rtol = 0.15
+
+    # Compute the R⋆ statistic for identical chains that individually have not mixed.
+    samples = ones(sz)
+    samples[div(N, 2):end, :] .= 2
+    chain_indices = repeat(1:4; outer=div(N, 4))
+    dist = rstar(classifier, samples, chain_indices; nsplit=1)
+    # without split chains cannot distinguish between chains
+    @test mean(dist) ≈ 1 rtol = 0.15
+    dist = rstar(classifier, samples, chain_indices)
+    # with split chains can learn almost perfect decision boundary
+    @test mean(dist) ≈ 2 rtol = 0.15
     end
     wrapper === Vector && continue
4 changes: 4 additions & 0 deletions test/runtests.jl
@@ -10,6 +10,10 @@ using Test
 Random.seed!(1)

 @testset "MCMCDiagnosticTools.jl" begin
+    @testset "utils" begin
+        include("utils.jl")
+    end
+
     @testset "Bayesian fraction of missing information" begin
         include("bfmi.jl")
     end
48 changes: 48 additions & 0 deletions test/utils.jl
@@ -0,0 +1,48 @@
using MCMCDiagnosticTools
using Test
using Random

@testset "indices_of_unique" begin
    inds = [1, 4, 3, 1, 4, 1, 3, 3, 4, 2, 1, 4, 1, 1, 3, 2, 3, 4, 4, 2]
    d = MCMCDiagnosticTools.indices_of_unique(inds)
    @test d isa Dict{Int, Vector{Int}}
    @test issetequal(union(values(d)...), eachindex(inds))
    for k in keys(d)
        @test all(inds[d[k]] .== k)
    end
end

@testset "split_chain_indices" begin
    c = [2, 2, 1, 3, 4, 3, 4, 1, 2, 1, 4, 3, 3, 2, 4, 3, 4, 1, 4, 1]
    @test @inferred(MCMCDiagnosticTools.split_chain_indices(c, 1)) == c

    cnew = @inferred MCMCDiagnosticTools.split_chain_indices(c, 2)
    d = MCMCDiagnosticTools.indices_of_unique(c)
    dnew = MCMCDiagnosticTools.indices_of_unique(cnew)
    for (i, inew) in enumerate(1:2:7)
        @test length(dnew[inew]) ≥ length(dnew[inew + 1])
        @test d[i] == vcat(dnew[inew], dnew[inew + 1])
    end

    cnew = MCMCDiagnosticTools.split_chain_indices(c, 3)
    d = MCMCDiagnosticTools.indices_of_unique(c)
    dnew = MCMCDiagnosticTools.indices_of_unique(cnew)
    for (i, inew) in enumerate(1:3:11)
        @test length(dnew[inew]) ≥ length(dnew[inew + 1]) ≥ length(dnew[inew + 2])
        @test d[i] == vcat(dnew[inew], dnew[inew + 1], dnew[inew + 2])
    end
end

@testset "shuffle_split_stratified" begin
    rng = Random.default_rng()
    c = rand(1:4, 100)
    d = MCMCDiagnosticTools.indices_of_unique(c)
    @testset "frac=$frac" for frac in [0.3, 0.5, 0.7]
        inds1, inds2 = @inferred(MCMCDiagnosticTools.shuffle_split_stratified(rng, c, frac))
        @test issetequal(vcat(inds1, inds2), eachindex(c))
        for i in 1:4
            common_inds = intersect(inds1, d[i])
            @test length(common_inds) == round(frac * length(d[i]))
        end
    end
end