From dbf0b775cda2905ffa2d9345b3e6cc179b19e765 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Tue, 16 Jan 2024 13:31:10 +1100 Subject: [PATCH 1/5] add StatisticalMeasures as test dep --- Project.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 16cf07b..846ecc4 100644 --- a/Project.toml +++ b/Project.toml @@ -22,8 +22,9 @@ julia = "1.6" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d" StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" +StatisticalMeasures = "a19d573c-0a75-4610-95b3-7071388c7541" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["LinearAlgebra", "MLJBase", "StableRNGs", "Statistics", "Test"] +test = ["LinearAlgebra", "MLJBase", "StableRNGs", "StatisticalMeasures", "Statistics", "Test"] From dae98f8292ddf9ddf756e9533bd51fe51309d6a5 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Tue, 16 Jan 2024 13:32:38 +1100 Subject: [PATCH 2/5] expose raw GLM fitted model in report --- src/MLJGLMInterface.jl | 37 +++++++++++++++++++++++++++++-------- test/runtests.jl | 28 +++++++++++++++++----------- 2 files changed, 46 insertions(+), 19 deletions(-) diff --git a/src/MLJGLMInterface.jl b/src/MLJGLMInterface.jl index 0315bac..d91bba1 100644 --- a/src/MLJGLMInterface.jl +++ b/src/MLJGLMInterface.jl @@ -45,7 +45,15 @@ const LCR_DESCR = "Linear count regressor with specified "* # LinearBinaryClassifier --> Probabilistic w Binary target // logit,cauchit,.. # MulticlassClassifier --> Probabilistic w Multiclass target -const VALID_KEYS = [:deviance, :dof_residual, :stderror, :vcov, :coef_table] +const VALID_KEYS = [ + :deviance, + :dof_residual, + :stderror, + :vcov, + :coef_table, + :raw_glm_model, +] +const VALID_KEYS_LIST = join(map(k-> ":$k", VALID_KEYS), ", ", " and ") const DEFAULT_KEYS = VALID_KEYS # For more understandable warning mssg by `@mlj_model`. 
const KEYS_TYPE = Union{Nothing, AbstractVector{Symbol}} @@ -287,6 +295,10 @@ function glm_report(glm_model, features, reportkeys) end report_dict[:coef_table] = coef_table end + if :raw_glm_model in reportkeys + report_dict[:raw_glm_model] = glm_model + end + return NamedTuple{Tuple(keys(report_dict))}(values(report_dict)) end @@ -590,8 +602,7 @@ Here An offset is a variable which is known to have a coefficient of 1. - `report_keys::Union{Symbol, Nothing}=DEFAULT_KEYS`: vector of keys to be used in - the report. Should be one of: `:deviance`, `:dof_residual`, `:stderror`, `:vcov`, - `:coef_table`. + the report. Possible keys are: $VALID_KEYS_LIST. Train the machine using `fit!(mach, rows=...)`. @@ -619,7 +630,8 @@ The fields of `fitted_params(mach)` are: # Report -When all keys are enabled in `report_keys`, the following fields are available in `report(mach)`: +When all keys are enabled in `report_keys`, the following fields are available in +`report(mach)`: - `deviance`: Measure of deviance of fitted model with respect to a perfectly fitted model. For a linear model, this is the weighted @@ -634,6 +646,9 @@ When all keys are enabled in `report_keys`, the following fields are available i - `coef_table`: Table which displays coefficients and summarizes their significance and confidence intervals. +- `raw_glm_model`: The raw fitted model returned by `GLM.lm`. Note this points to training + data. + # Examples ``` @@ -713,8 +728,8 @@ Train the machine using `fit!(mach, rows=...)`. - `minstepfac::Real=0.001`: Minimum step fraction. Must be between 0 and 1. Lower bound for the factor used to update the linear fit. -- `report_keys::Union{Symbol, Nothing}=DEFAULT_KEYS`: keys to be used in the report. Should - be one of: `:deviance`, `:dof_residual`, `:stderror`, `:vcov`, `:coef_table`. +- `report_keys::Union{Symbol, Nothing}=DEFAULT_KEYS`: keys to be used in the + report. Possible keys are: $VALID_KEYS_LIST. 
# Operations @@ -750,6 +765,9 @@ The fields of `report(mach)` are: - `coef_table`: Table which displays coefficients and summarizes their significance and confidence intervals. +- `raw_glm_model`: The raw fitted model returned by `GLM.glm`. Note this points to training + data. + # Examples ``` @@ -842,8 +860,8 @@ Train the machine using `fit!(mach, rows=...)`. - `minstepfac::Real=0.001`: Minimum step fraction. Must be between 0 and 1. Lower bound for the factor used to update the linear fit. -- `report_keys::Union{Symbol, Nothing}=DEFAULT_KEYS`: keys to be used in the report. Should - be one of: `:deviance`, `:dof_residual`, `:stderror`, `:vcov`, `:coef_table`. +- `report_keys::Union{Symbol, Nothing}=DEFAULT_KEYS`: keys to be used in the + report. Possible keys are: $VALID_KEYS_LIST. # Operations @@ -880,6 +898,9 @@ The fields of `report(mach)` are: - `coef_table`: Table which displays coefficients and summarizes their significance and confidence intervals. +- `raw_glm_model`: The raw fitted model returned by `GLM.glm`. Note this points to training + data. 
+ # Examples diff --git a/test/runtests.jl b/test/runtests.jl index 519cef4..8fff83d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,6 +1,7 @@ using Test using MLJBase +using StatisticalMeasures using LinearAlgebra using Statistics using MLJGLMInterface @@ -66,7 +67,7 @@ expit(X) = 1 ./ (1 .+ exp.(-X)) @test hyp_types[2] == "Bool" @test hyp_types[3] == "Union{Nothing, Symbol}" @test hyp_types[4] == "Union{Nothing, AbstractVector{Symbol}}" - + end ### @@ -86,18 +87,18 @@ end fitresult, _, report = fit(lr, 1, X, y) yhat = predict(lr, fitresult, X) - @test mean(cross_entropy(yhat, y)) < 0.25 + @test cross_entropy(yhat, y) < 0.25 fitresult1, _, report1 = fit(pr, 1, X, y) yhat1 = predict(pr, fitresult1, X) - @test mean(cross_entropy(yhat1, y)) < 0.25 + @test cross_entropy(yhat1, y) < 0.25 fitresultw, _, reportw = fit(lr, 1, X, y, w) yhatw = predict(lr, fitresultw, X) - @test mean(cross_entropy(yhatw, y)) < 0.25 + @test cross_entropy(yhatw, y) < 0.25 @test yhatw ≈ yhat fitresultw1, _, reportw1 = fit(pr, 1, X, y, w) yhatw1 = predict(pr, fitresultw1, X) - @test mean(cross_entropy(yhatw1, y)) < 0.25 + @test cross_entropy(yhatw1, y) < 0.25 @test yhatw1 ≈ yhat1 # check predict on `Xnew` with wrong dims @@ -124,6 +125,7 @@ end @test hyper_params[6] == :rtol @test hyper_params[7] == :minstepfac @test hyper_params[8] == :report_keys + end ### @@ -150,7 +152,7 @@ end fitresultw, _, _ = fit(lcr, 1, XTable, y, w) θ̂w = fitted_params(lcr, fitresultw).coef @test norm(θ̂w .- θ)/norm(θ) ≤ 0.03 - @test θ̂w ≈ θ̂ + @test θ̂w ≈ θ̂ # check predict on `Xnew` with wrong dims Xnew = MLJBase.table( @@ -278,7 +280,7 @@ end N = 1000 rng = StableRNGs.StableRNG(0) X = MLJBase.table(rand(rng, N, 3)) - y = 2*X.x1 + X.x2 - X.x3 + rand(rng, Normal(0,1), N) + y = 2*X.x1 + X.x2 - X.x3 + rand(rng, Normal(0,1), N) lr = LinearRegressor(fit_intercept=false, offsetcol=:x2) fitresult, _, report = fit(lr, 1, X, y) @@ -312,7 +314,7 @@ end @test parameters == ["a", "b", "c", "(Intercept)"] intercept = 
ctable.cols[1][4] yhat = predict(lr, fitresult, X) - @test mean(cross_entropy(yhat, y)) < 0.6 + @test cross_entropy(yhat, y) < 0.6 fp = fitted_params(lr, fitresult) @test fp.features == [:a, :b, :c] @@ -326,18 +328,22 @@ end # check that by default all possible keys are added in the report lr = LinearBinaryClassifier() _, _, report = fit(lr, 1, X, y) - @test :deviance in keys(report) + @test :deviance in keys(report) @test :dof_residual in keys(report) @test :stderror in keys(report) @test :vcov in keys(report) @test :coef_table in keys(report) + @test :raw_glm_model in keys(report) + + @test report.raw_glm_model isa GLM.GeneralizedLinearModel # check that report is valid if only some keys are specified lr = LinearBinaryClassifier(report_keys = [:stderror, :deviance]) _, _, report = fit(lr, 1, X, y) - @test :deviance in keys(report) + @test :deviance in keys(report) @test :stderror in keys(report) - @test :dof_residual ∉ keys(report) + @test :dof_residua ∉ keys(report) + @test :raw_glm_model ∉ keys(report) # check that an empty `NamedTuple` is outputed for # `report_params === nothing` From 85daef81c5447097cbe8b65b72ed828ba22af2c2 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Wed, 17 Jan 2024 07:42:46 +1100 Subject: [PATCH 3/5] exclude raw_glm_model from report by default --- src/MLJGLMInterface.jl | 16 ++++++++-------- test/runtests.jl | 16 ++++++++-------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/MLJGLMInterface.jl b/src/MLJGLMInterface.jl index d91bba1..8a67a49 100644 --- a/src/MLJGLMInterface.jl +++ b/src/MLJGLMInterface.jl @@ -53,8 +53,8 @@ const VALID_KEYS = [ :coef_table, :raw_glm_model, ] -const VALID_KEYS_LIST = join(map(k-> ":$k", VALID_KEYS), ", ", " and ") -const DEFAULT_KEYS = VALID_KEYS # For more understandable warning mssg by `@mlj_model`. 
+const VALID_KEYS_LIST = join(map(k-> "`:$k`", VALID_KEYS), ", ", " and ") +const DEFAULT_KEYS = setdiff(VALID_KEYS, [:raw_glm_model,]) const KEYS_TYPE = Union{Nothing, AbstractVector{Symbol}} @mlj_model mutable struct LinearRegressor <: MMI.Probabilistic @@ -601,8 +601,8 @@ Here - `offsetcol=nothing`: Name of the column to be used as an offset, if any. An offset is a variable which is known to have a coefficient of 1. -- `report_keys::Union{Symbol, Nothing}=DEFAULT_KEYS`: vector of keys to be used in - the report. Possible keys are: $VALID_KEYS_LIST. +- `report_keys`: `Vector` of keys for the report. Possible keys are: $VALID_KEYS_LIST. By + default only `:raw_glm_model` is excluded. Train the machine using `fit!(mach, rows=...)`. @@ -728,8 +728,8 @@ Train the machine using `fit!(mach, rows=...)`. - `minstepfac::Real=0.001`: Minimum step fraction. Must be between 0 and 1. Lower bound for the factor used to update the linear fit. -- `report_keys::Union{Symbol, Nothing}=DEFAULT_KEYS`: keys to be used in the - report. Possible keys are: $VALID_KEYS_LIST. +- `report_keys`: `Vector` of keys for the report. Possible keys are: $VALID_KEYS_LIST. By + default only `:raw_glm_model` is excluded. # Operations @@ -860,8 +860,8 @@ Train the machine using `fit!(mach, rows=...)`. - `minstepfac::Real=0.001`: Minimum step fraction. Must be between 0 and 1. Lower bound for the factor used to update the linear fit. -- `report_keys::Union{Symbol, Nothing}=DEFAULT_KEYS`: keys to be used in the - report. Possible keys are: $VALID_KEYS_LIST. +- `report_keys`: `Vector` of keys for the report. Possible keys are: $VALID_KEYS_LIST. By + default only `:raw_glm_model` is excluded. 
# Operations diff --git a/test/runtests.jl b/test/runtests.jl index 8fff83d..b63b363 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -325,7 +325,8 @@ end @testset "Param names in report" begin X = (a=[1, 4, 3, 1], b=[2, 0, 1, 4], c=[7, 1, 7, 3]) y = categorical([true, false, true, false]) - # check that by default all possible keys are added in the report + # check that by default all possible keys are added in the report, + # except raw_glm_model: lr = LinearBinaryClassifier() _, _, report = fit(lr, 1, X, y) @test :deviance in keys(report) @@ -333,17 +334,16 @@ end @test :stderror in keys(report) @test :vcov in keys(report) @test :coef_table in keys(report) - @test :raw_glm_model in keys(report) - - @test report.raw_glm_model isa GLM.GeneralizedLinearModel + @test :raw_glm_model ∉ keys(report) # check that report is valid if only some keys are specified - lr = LinearBinaryClassifier(report_keys = [:stderror, :deviance]) + lr = LinearBinaryClassifier(report_keys = [:stderror, :raw_glm_model]) _, _, report = fit(lr, 1, X, y) - @test :deviance in keys(report) + @test :deviance ∉ keys(report) @test :stderror in keys(report) - @test :dof_residua ∉ keys(report) - @test :raw_glm_model ∉ keys(report) + @test :dof_residual ∉ keys(report) + @test :raw_glm_model in keys(report) + @test report.raw_glm_model isa GLM.GeneralizedLinearModel # check that an empty `NamedTuple` is outputed for # `report_params === nothing` From 12c9ff5be848bcab5088b910e6d3be883019500d Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Wed, 17 Jan 2024 07:43:26 +1100 Subject: [PATCH 4/5] bump 0.3.6 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 846ecc4..1cf59e7 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "MLJGLMInterface" uuid = "caf8df21-4939-456d-ac9c-5fefbfb04c0c" authors = ["Anthony D. 
Blaom "] -version = "0.3.5" +version = "0.3.6" [deps] Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" From 6bba7aa3bac619dd99ef9ec973c0fde9f2fb3b96 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Fri, 19 Jan 2024 08:57:57 +1300 Subject: [PATCH 5/5] raw_glm_model -> glm_model --- src/MLJGLMInterface.jl | 26 +++++++++++++------------- test/runtests.jl | 10 +++++----- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/MLJGLMInterface.jl b/src/MLJGLMInterface.jl index 8a67a49..7a682e3 100644 --- a/src/MLJGLMInterface.jl +++ b/src/MLJGLMInterface.jl @@ -51,10 +51,10 @@ const VALID_KEYS = [ :stderror, :vcov, :coef_table, - :raw_glm_model, + :glm_model, ] const VALID_KEYS_LIST = join(map(k-> "`:$k`", VALID_KEYS), ", ", " and ") -const DEFAULT_KEYS = setdiff(VALID_KEYS, [:raw_glm_model,]) +const DEFAULT_KEYS = setdiff(VALID_KEYS, [:glm_model,]) const KEYS_TYPE = Union{Nothing, AbstractVector{Symbol}} @mlj_model mutable struct LinearRegressor <: MMI.Probabilistic @@ -295,8 +295,8 @@ function glm_report(glm_model, features, reportkeys) end report_dict[:coef_table] = coef_table end - if :raw_glm_model in reportkeys - report_dict[:raw_glm_model] = glm_model + if :glm_model in reportkeys + report_dict[:glm_model] = glm_model end return NamedTuple{Tuple(keys(report_dict))}(values(report_dict)) @@ -602,7 +602,7 @@ Here An offset is a variable which is known to have a coefficient of 1. - `report_keys`: `Vector` of keys for the report. Possible keys are: $VALID_KEYS_LIST. By - default only `:raw_glm_model` is excluded. + default only `:glm_model` is excluded. Train the machine using `fit!(mach, rows=...)`. @@ -646,8 +646,8 @@ When all keys are enabled in `report_keys`, the following fields are available i - `coef_table`: Table which displays coefficients and summarizes their significance and confidence intervals. -- `raw_glm_model`: The raw fitted model returned by `GLM.lm`. Note this points to training - data. 
+- `glm_model`: The raw fitted model returned by `GLM.lm`. Note this points to training
+  data. Refer to the GLM.jl documentation for usage.
 
 # Examples
 
@@ -729,7 +729,7 @@ Train the machine using `fit!(mach, rows=...)`.
   the factor used to update the linear fit.
 
 - `report_keys`: `Vector` of keys for the report. Possible keys are: $VALID_KEYS_LIST. By
-  default only `:raw_glm_model` is excluded.
+  default only `:glm_model` is excluded.
 
 # Operations
 
@@ -765,8 +765,8 @@ The fields of `report(mach)` are:
 - `coef_table`: Table which displays coefficients and summarizes their significance and
   confidence intervals.
 
-- `raw_glm_model`: The raw fitted model returned by `GLM.glm`. Note this points to training
-  data.
+- `glm_model`: The raw fitted model returned by `GLM.glm`. Note this points to training
+  data. Refer to the GLM.jl documentation for usage.
 
 # Examples
 
@@ -861,7 +861,7 @@ Train the machine using `fit!(mach, rows=...)`.
   the factor used to update the linear fit.
 
 - `report_keys`: `Vector` of keys for the report. Possible keys are: $VALID_KEYS_LIST. By
-  default only `:raw_glm_model` is excluded.
+  default only `:glm_model` is excluded.
 
 # Operations
 
@@ -898,8 +898,8 @@ The fields of `report(mach)` are:
 - `coef_table`: Table which displays coefficients and summarizes their significance and
   confidence intervals.
 
-- `raw_glm_model`: The raw fitted model returned by `GLM.glm`. Note this points to training
-  data.
+- `glm_model`: The raw fitted model returned by `GLM.glm`. Note this points to training
+  data. Refer to the GLM.jl documentation for usage.
# Examples diff --git a/test/runtests.jl b/test/runtests.jl index b63b363..a266fc6 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -326,7 +326,7 @@ end X = (a=[1, 4, 3, 1], b=[2, 0, 1, 4], c=[7, 1, 7, 3]) y = categorical([true, false, true, false]) # check that by default all possible keys are added in the report, - # except raw_glm_model: + # except glm_model: lr = LinearBinaryClassifier() _, _, report = fit(lr, 1, X, y) @test :deviance in keys(report) @@ -334,16 +334,16 @@ end @test :stderror in keys(report) @test :vcov in keys(report) @test :coef_table in keys(report) - @test :raw_glm_model ∉ keys(report) + @test :glm_model ∉ keys(report) # check that report is valid if only some keys are specified - lr = LinearBinaryClassifier(report_keys = [:stderror, :raw_glm_model]) + lr = LinearBinaryClassifier(report_keys = [:stderror, :glm_model]) _, _, report = fit(lr, 1, X, y) @test :deviance ∉ keys(report) @test :stderror in keys(report) @test :dof_residual ∉ keys(report) - @test :raw_glm_model in keys(report) - @test report.raw_glm_model isa GLM.GeneralizedLinearModel + @test :glm_model in keys(report) + @test report.glm_model isa GLM.GeneralizedLinearModel # check that an empty `NamedTuple` is outputed for # `report_params === nothing`