Use EvoTrees instead of XGBoost in documentation #57

Merged · 24 commits · Jan 17, 2023
Changes from 15 commits
2 changes: 1 addition & 1 deletion .github/workflows/CI.yml
@@ -14,7 +14,7 @@ jobs:
      fail-fast: false
      matrix:
        version:
-         - '1.3'
+         - '1.6'
          - '1'
          - 'nightly'
        os:
4 changes: 1 addition & 3 deletions .gitignore
@@ -1,7 +1,5 @@
*.jl.*.cov
*.jl.cov
*.jl.mem
-/Manifest.toml
-/test/Manifest.toml
-/test/rstar/Manifest.toml
+Manifest.toml
/docs/build/
2 changes: 1 addition & 1 deletion Project.toml
@@ -27,7 +27,7 @@ SpecialFunctions = "0.8, 0.9, 0.10, 1, 2"
StatsBase = "0.33"
StatsFuns = "1"
Tables = "1"
-julia = "1.3"
+julia = "1.6"

[extras]
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
8 changes: 5 additions & 3 deletions docs/Project.toml
@@ -1,14 +1,16 @@
[deps]
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+EvoTrees = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5"
MCMCDiagnosticTools = "be115224-59cd-429b-ad48-344e309966f0"
MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
-MLJXGBoostInterface = "54119dfa-1dab-4055-a167-80440f4f7a91"
+MLJIteration = "614be32b-d00c-4edb-bd02-1eb411ab5e55"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

[compat]
Documenter = "0.27"
+EvoTrees = "0.14.6"
MCMCDiagnosticTools = "0.2"
MLJBase = "0.19, 0.20, 0.21"
-MLJXGBoostInterface = "0.1, 0.2, 0.3"
-julia = "1.3"
+MLJIteration = "0.5"
+julia = "1.6"
40 changes: 26 additions & 14 deletions src/rstar.jl
@@ -40,21 +40,20 @@ function rstar(
        throw(ArgumentError("training and test data subsets must not be empty"))

    xtable = _astable(x)
+   ycategorical = MLJModelInterface.categorical(ysplit)
+   xdata, ydata = MLJModelInterface.reformat(classifier, xtable, ycategorical)

    # train classifier on training data
-   ycategorical = MLJModelInterface.categorical(ysplit)
-   xtrain = MLJModelInterface.selectrows(xtable, train_ids)
-   fitresult, _ = MLJModelInterface.fit(
-       classifier, verbosity, xtrain, ycategorical[train_ids]
-   )
+   xtrain, ytrain = MLJModelInterface.selectrows(classifier, train_ids, xdata, ydata)
+   fitresult, _ = MLJModelInterface.fit(classifier, verbosity, xtrain, ytrain)

    # compute predictions on test data
-   xtest = MLJModelInterface.selectrows(xtable, test_ids)
+   xtest, = MLJModelInterface.selectrows(classifier, test_ids, xdata)
+   ytest = ycategorical[test_ids]
    predictions = _predict(classifier, fitresult, xtest)

    # compute statistic
-   ytest = ycategorical[test_ids]
-   result = _rstar(predictions, ytest)
+   result = _rstar(classifier, predictions, ytest)

    return result
end
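The refactor above moves rstar onto MLJ's data front end: MLJModelInterface.reformat converts the user-facing table and labels into the classifier's internal representation once, and MLJModelInterface.selectrows then takes row subsets in that representation, so nothing is re-converted for the train/test split. A minimal sketch of the pattern (assuming MLJBase, EvoTrees, and Tables are installed; data and variable names are made up for illustration):

```julia
using MLJBase, EvoTrees, Tables
using MLJModelInterface
const MMI = MLJModelInterface

classifier = EvoTreeClassifier()
X = Tables.table(randn(100, 3))             # 100 draws, 3 "parameters"
y = MMI.categorical(rand(["1", "2"], 100))  # chain labels

# Convert once into the model-specific representation...
data = MMI.reformat(classifier, X, y)
# ...then subset rows cheaply in that representation.
train = MMI.selectrows(classifier, 1:50, data...)
fitresult, _ = MMI.fit(classifier, 0, train...)
```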
@@ -109,7 +108,7 @@ is returned (algorithm 2).
# Examples

```jldoctest rstar; setup = :(using Random; Random.seed!(101))
-julia> using MLJBase, MLJXGBoostInterface, Statistics
+julia> using MLJBase, MLJIteration, EvoTrees, Statistics

julia> samples = fill(4.0, 100, 3, 2);
```
@@ -118,7 +117,16 @@ One can compute the distribution of the ``R^*`` statistic (algorithm 2) with the
probabilistic classifier.

```jldoctest rstar
-julia> distribution = rstar(XGBoostClassifier(), samples);
+julia> model = IteratedModel(;
+           model=EvoTreeClassifier(; eta=0.005),
+           iteration_parameter=:nrounds,
+           resampling=Holdout(),
+           measures=log_loss,
+           controls=[Step(5), Patience(2), NumberLimit(100)],
+           retrain=true,
+       );
Review thread:

Member: It's too bad that this setup is so much more verbose than just calling XGBoostClassifier().

Member Author: Well, we can just use EvoTreeClassifier(; nrounds=100, eta=x) (I don't remember the default eta in XGBoostClassifier) and would get the same setting, since XGBoostClassifier just uses nrounds = 100 by default without any tuning of this hyperparameter. Based on the comments above, I thought it would be good to highlight how it can be set/estimated in a better way. Maybe we should add a comment, though, and show EvoTreeClassifier(; nrounds=100) as well.

Member: Yeah, for a usage example in the docstring I slightly prefer the simpler approach. But I agree that it is good to document the more robust approach somewhere.

+julia> distribution = rstar(model, samples);

julia> isapprox(mean(distribution), 1; atol=0.1)
true
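As the review thread above notes, XGBoostClassifier defaults to nrounds = 100 with no tuning, so a plain EvoTreeClassifier gives a comparable, much shorter setup. A sketch of that simpler variant (illustrative; not part of the merged docstring):

```julia
julia> simple_model = EvoTreeClassifier(; nrounds=100);

julia> distribution = rstar(simple_model, samples);
```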
@@ -129,9 +137,9 @@ Deterministic classifiers can also be derived from probabilistic classifiers by
predicting the mode. In MLJ this corresponds to a pipeline of models.

```jldoctest rstar
-julia> xgboost_deterministic = Pipeline(XGBoostClassifier(); operation=predict_mode);
+julia> evotree_deterministic = Pipeline(model; operation=predict_mode);

-julia> value = rstar(xgboost_deterministic, samples);
+julia> value = rstar(evotree_deterministic, samples);

julia> isapprox(value, 1; atol=0.2)
true
@@ -161,7 +169,9 @@ function rstar(classif::MLJModelInterface.Supervised, x::AbstractArray{<:Any,3};
end

# R⋆ for deterministic predictions (algorithm 1)
-function _rstar(predictions::AbstractVector{T}, ytest::AbstractVector{T}) where {T}
+function _rstar(
+    ::MLJModelInterface.Deterministic, predictions::AbstractVector, ytest::AbstractVector
+)
Review thread:

Member: Since we only support Deterministic and Probabilistic, perhaps we should constrain the types for rstar to only take Union{MLJModelInterface.Probabilistic,MLJModelInterface.Deterministic}, and update the docstring accordingly.

Member Author: Sure, why not; it's probably more user-friendly to error out when calling rstar. I wonder, though, how useful the information would be in the docstring - do users actually know about Probabilistic/Deterministic, or even MLJModelInterface?

Member: Perhaps not. And I'm not certain how common the other subtypes of Supervised are.

Member Author: We could also use StatisticalTraits, e.g. StatisticalTraits.prediction_type: throw a descriptive error if rstar is called with a model for which it is not :probabilistic or :deterministic (see https://github.com/JuliaAI/MLJModelInterface.jl/blob/d9e9703947fc04b0a5e63680289e41d0ba0d65bd/src/model_traits.jl#L27-L28), and dispatch on it (using Val; it seems all these traits return Symbols, but since they are based on the types of the models, the compiler should be smart enough to handle it).

Member Author: And we would not add anything more to the docstring, since we would not restrict the type at all.

Member Author: prediction_type shows up in the model search and when you print info(model): https://alan-turing-institute.github.io/MLJ.jl/dev/model_search/ So it seems to be quite official?

Member:

> This point would actually be the strongest argument in favour of traits: we would also support models that are not subtypes of Probabilistic or Deterministic (for whatever reason) and that we do not know about, but whose predictions would still be of the desired form (probabilistic or deterministic).

I think this presumes that the results of fit and predict for such models would be of the same form as we need. But it's not clear to me from the docs that they are (e.g. https://alan-turing-institute.github.io/MLJ.jl/dev/adding_models_for_general_use/#Outlier-detection-models). We would be relying on an undocumented trait implementation, where the decisions about which traits apply are not defined anywhere. In particular, for supervised outlier detection, it's not clear to me whether these models support multiple labels as we have. To be convinced, I'd need to see a test case of one of these detectors (see https://github.com/OutlierDetectionJL/OutlierDetection.jl) used to compute rstar.

Member Author:

> So it seems to be quite official?

There's even an example that filters on prediction_type on the same page:

julia> filter(model) = model.is_supervised &&
                       model.input_scitype >: MLJ.Table(Continuous) &&
                       model.target_scitype >: AbstractVector{<:Multiclass{3}} &&
                       model.prediction_type == :deterministic

Member: It seems there are large portions of their traits interface that are undocumented. I wonder if, instead of using prediction_type, it is more useful to use input_scitype to check that the model accepts tables of continuous values, target_scitype to check that the model supports multiclass labels, and then predict_scitype to determine whether the predictions are labels, probabilities, or something else (error). These have the benefit that the scitype interface is documented (mostly; not predict_scitype: https://alan-turing-institute.github.io/MLJ.jl/dev/adding_models_for_general_use/#Trait-declarations) and connects directly to what we need (we assume the model accepts certain inputs and labels and makes certain types of predictions).

Member Author: I think that's a good idea, but unfortunately it seems that Pipelines do not support the traits properly (maybe they can't - even though, when working with instances, all the information should be available?):

julia> using MLJBase, MLJXGBoostInterface, MLJModelInterface

julia> const MMI = MLJModelInterface

julia> classifier = XGBoostClassifier();

julia> MMI.input_scitype(classifier)
Table{<:AbstractVector{<:Continuous}}

julia> MMI.target_scitype(classifier)
AbstractVector{<:Finite} (alias for AbstractArray{<:Finite, 1})

julia> MMI.predict_scitype(classifier)
AbstractVector{Density{<:Finite}} (alias for AbstractArray{ScientificTypesBase.Density{<:Finite}, 1})

julia> classifier = Pipeline(XGBoostClassifier(); operation=predict_mode);

julia> MMI.input_scitype(classifier)
Unknown

julia> MMI.target_scitype(classifier)
AbstractVector{<:Finite} (alias for AbstractArray{<:Finite, 1})

julia> MMI.predict_scitype(classifier)
Unknown

And supporting Unknown as well seems less satisfying, since that is the fallback for models that don't implement the traits... I wonder if this could/should be fixed in MLJBase for Pipeline, and hence in principle could work for models that implement the traits? In any case, we could at least dispatch on the supported prediction types in _rstar instead of restricting it to specific model types.

    length(predictions) == length(ytest) ||
        error("numbers of predictions and targets must be equal")
    mean_accuracy = Statistics.mean(p == y for (p, y) in zip(predictions, ytest))
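A rough sketch of the trait-based dispatch floated in the thread above (hypothetical: the merged code dispatches on the Deterministic/Probabilistic model types instead, and prediction_val is an illustrative name, not part of the package):

```julia
using MLJModelInterface
const MMI = MLJModelInterface

# Map a model to a dispatchable trait value; error for unsupported models.
# prediction_type is determined by the model's type, so the compiler can
# usually resolve the resulting Val at compile time.
function prediction_val(model)
    pt = MMI.prediction_type(model)
    pt === :probabilistic || pt === :deterministic ||
        throw(ArgumentError("unsupported prediction type: $pt"))
    return Val(pt)
end

# _rstar could then define methods like
#     _rstar(::Val{:deterministic}, predictions, ytest)  # algorithm 1
#     _rstar(::Val{:probabilistic}, predictions, ytest)  # algorithm 2
# instead of restricting the first argument to specific model types.
```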
@@ -170,7 +180,9 @@ function _rstar(predictions::AbstractVector{T}, ytest::AbstractVector{T}) where
end

# R⋆ for probabilistic predictions (algorithm 2)
-function _rstar(predictions::AbstractVector, ytest::AbstractVector)
+function _rstar(
+    ::MLJModelInterface.Probabilistic, predictions::AbstractVector, ytest::AbstractVector
+)
    length(predictions) == length(ytest) ||
        error("numbers of predictions and targets must be equal")

11 changes: 7 additions & 4 deletions test/Project.toml
@@ -1,14 +1,15 @@
[deps]
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
DynamicHMC = "bbc10e6e-7c05-544b-b16e-64fede858acb"
+EvoTrees = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5"
FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
LogDensityProblems = "6fdf6af0-433a-55f7-b3ed-c6c6e0b8df7c"
LogExpFunctions = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
MCMCDiagnosticTools = "be115224-59cd-429b-ad48-344e309966f0"
MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
+MLJIteration = "614be32b-d00c-4edb-bd02-1eb411ab5e55"
MLJLIBSVMInterface = "61c7150f-6c77-4bb1-949c-13197eac2a52"
MLJXGBoostInterface = "54119dfa-1dab-4055-a167-80440f4f7a91"
-Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
@@ -18,13 +19,15 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[compat]
Distributions = "0.25"
DynamicHMC = "3"
+EvoTrees = "0.14.6"
FFTW = "1.1"
LogDensityProblems = "0.12, 1, 2"
LogExpFunctions = "0.3"
MCMCDiagnosticTools = "0.2"
MLJBase = "0.19, 0.20, 0.21"
-MLJLIBSVMInterface = "0.1, 0.2"
-MLJXGBoostInterface = "0.1, 0.2, 0.3"
+MLJIteration = "0.5"
+MLJLIBSVMInterface = "0.2"
+MLJXGBoostInterface = "0.3"
StatsBase = "0.33"
Tables = "1"
-julia = "1.3"
+julia = "1.6"
31 changes: 28 additions & 3 deletions test/rstar.jl
@@ -1,6 +1,7 @@
using MCMCDiagnosticTools

using Distributions
+using EvoTrees
using MLJBase
using MLJLIBSVMInterface
using MLJXGBoostInterface
@@ -9,13 +10,27 @@ using Tables
using Random
using Test

-const xgboost_deterministic = Pipeline(XGBoostClassifier(); operation=predict_mode)
+# XGBoost errors on 32bit systems: https://github.com/dmlc/XGBoost.jl/issues/92
+const XGBoostClassifiers = if Sys.WORD_SIZE == 64
+    (
+        XGBoostClassifier(),
+        Pipeline(XGBoostClassifier(); operation=predict_mode),
+    )
+else
+    ()
+end

@testset "rstar.jl" begin
-   classifiers = (XGBoostClassifier(), xgboost_deterministic, SVC())
N = 1_000

@testset "samples input type: $wrapper" for wrapper in [Vector, Array, Tables.table]
+    # In practice, you probably want to use EvoTreeClassifier with early stopping
+    classifiers = (
+        EvoTreeClassifier(; nrounds=100, eta=0.3),
+        Pipeline(EvoTreeClassifier(; nrounds=100, eta=0.3); operation=predict_mode),
+        SVC(),
+        XGBoostClassifiers...,
+    )
@testset "examples (classifier = $classifier)" for classifier in classifiers
sz = wrapper === Vector ? N : (N, 2)
# Compute R⋆ statistic for a mixed chain.
@@ -111,8 +126,18 @@ const xgboost_deterministic = Pipeline(XGBoostClassifier(); operation=predict_mode)
i += 1
end

+    # In practice, you probably want to use EvoTreeClassifier with early stopping
+    rng = MersenneTwister(42)
+    classifiers = (
+        EvoTreeClassifier(; rng=rng, nrounds=100, eta=0.3),
+        Pipeline(
+            EvoTreeClassifier(; rng=rng, nrounds=100, eta=0.3); operation=predict_mode
+        ),
+        SVC(),
+        XGBoostClassifiers...,
+    )
@testset "classifier = $classifier" for classifier in classifiers
-        rng = MersenneTwister(42)
+        Random.seed!(rng, 42)
dist1 = rstar(rng, classifier, samples_mat, chain_inds)
Random.seed!(rng, 42)
dist2 = rstar(rng, classifier, samples)
9 changes: 1 addition & 8 deletions test/runtests.jl
@@ -1,5 +1,3 @@
-using Pkg
-
using MCMCDiagnosticTools
using FFTW

@@ -40,11 +38,6 @@ Random.seed!(1)
include("rafterydiag.jl")
end
@testset "R⋆ diagnostic" begin
-        # XGBoost errors on 32bit systems: https://github.com/dmlc/XGBoost.jl/issues/92
-        if Sys.WORD_SIZE == 64
-            include("rstar.jl")
-        else
-            @info "R⋆ not tested: requires 64bit architecture"
-        end
+        include("rstar.jl")
end
end