From 668cc433cbde2174c2f9e02150ea06830edc2114 Mon Sep 17 00:00:00 2001
From: tiemvanderdeure
Date: Wed, 14 Feb 2024 17:20:04 +0100
Subject: [PATCH 1/7] remove matrix data, interface with glm more directly

---
 src/MLJGLMInterface.jl | 139 ++++++++++++++---------------------------
 1 file changed, 47 insertions(+), 92 deletions(-)

diff --git a/src/MLJGLMInterface.jl b/src/MLJGLMInterface.jl
index 7a682e3..1869f31 100644
--- a/src/MLJGLMInterface.jl
+++ b/src/MLJGLMInterface.jl
@@ -97,20 +97,6 @@ const GLM_MODELS = Union{
 ## Helper functions
 ###
 
-"""
-    augment_X(X::AbstractMatrix, b::Bool)
-
-Augment the matrix `X` with a column of ones if the intercept should
-be fitted (`b==true`) and return `X` otherwise.
-"""
-function augment_X(X::AbstractMatrix, b::Bool)
-    if b
-        return hcat(X, ones(float(Int), size(X, 1), 1))
-    else
-        return X
-    end
-end
-
 _to_vector(v::Vector) = v
 _to_vector(v) = collect(v)
 _to_array(v::AbstractArray) = v
@@ -203,38 +189,15 @@ function check_sample_size(model, n, p)
     return nothing
 end
 
-function _matrix_and_features(model, Xcols, handle_intercept=false)
-    col_names = Tables.columnnames(Xcols)
-    n, p = Tables.rowcount(Xcols), length(col_names)
-    augment = handle_intercept && model.fit_intercept
-
-    if !handle_intercept # i.e This only runs during `fit`
-        check_sample_size(model, n, p)
-    end
-
-    if p == 0
-        Xmatrix = Matrix{float(Int)}(undef, n, p)
-    else
-        Xmatrix = Tables.matrix(Xcols)
-    end
-
-    Xmatrix = augment_X(Xmatrix, augment)
-
-    return Xmatrix, col_names
-end
-
-_to_columns(t::Tables.AbstractColumns) = t
-_to_columns(t) = Tables.Columns(t)
-
 """
     prepare_inputs(model, X; handle_intercept=false)
 
 Handle `model.offsetcol` and `model.fit_intercept` if `handle_intercept=true`.
 `handle_intercept` is disabled for fitting since the StatsModels.@formula handles the intercept.
 """
-function prepare_inputs(model, X; handle_intercept=false)
-    Xcols = _to_columns(X)
-    table_features = Tables.columnnames(Xcols)
+function prepare_inputs(model, X)
+    Xcols = Tables.columntable(X)
+    table_features = Base.keys(Xcols)
     p = length(table_features)
     p >= 1 || throw(
         ArgumentError("`X` must contain at least one feature column.")
     )
@@ -253,9 +216,12 @@ function prepare_inputs(model, X; handle_intercept=false)
         end
     end
     Xminoffset, offset = split_X_offset(Xcols, model.offsetcol)
-    Xminoffset_cols = _to_columns(Xminoffset)
-    Xmatrix, features = _matrix_and_features(model, Xminoffset_cols , handle_intercept)
-    return Xmatrix, offset, _to_array(features)
+    features = Tables.columnnames(Xminoffset)
+
+    @show first(Xminoffset)
+    check_sample_size(model, length(first(Xminoffset)), p)
+
+    return Xminoffset, offset, _to_array(features)
 end
 
 """
@@ -285,14 +251,6 @@ function glm_report(glm_model, features, reportkeys)
     end
     if :coef_table in reportkeys
         coef_table = GLM.coeftable(glm_model)
-        # Update the variable names in the `coef_table` with the actual variable
-        # names seen during fit.
-        if length(coef_table.rownms) == length(features)
-            # This means `fit_intercept` is false
-            coef_table.rownms = string.(features)
-        else
-            coef_table.rownms = [string.(features); "(Intercept)"]
-        end
         report_dict[:coef_table] = coef_table
     end
     if :glm_model in reportkeys
@@ -376,31 +334,31 @@ end
 dispersion(fr::FitResult) = fr.dispersion
 params(fr::FitResult) = fr.params
 
+
+
 function MMI.fit(model::LinearRegressor, verbosity::Int, X, y, w=nothing)
     # apply the model
-    Xmatrix, offset, features = prepare_inputs(model, X)
+    X_col_table, offset, features = prepare_inputs(model, X)
     y_ = isempty(offset) ? y : y .- offset
     wts = check_weights(w, y_)
-    data = glm_data(model, Xmatrix, y_, features)
+    data = merge(X_col_table, (; y = y_))
     form = glm_formula(model, features)
-    fitted_lm = GLM.lm(form, data; model.dropcollinear, wts).model
-    fitresult = FitResult(
-        GLM.coef(fitted_lm), GLM.dispersion(fitted_lm), (features = features,)
-    )
+    fitted_lm = GLM.lm(form, data; model.dropcollinear, wts)
+
     # form the report
     report = glm_report(fitted_lm, features, model.report_keys)
     cache = nothing
     # return
-    return fitresult, cache, report
+    return fitted_lm, cache, report
 end
 
 function MMI.fit(model::LinearCountRegressor, verbosity::Int, X, y, w=nothing)
     # apply the model
-    Xmatrix, offset, features = prepare_inputs(model, X)
-    data = glm_data(model, Xmatrix, y, features)
+    X_col_table, offset, features = prepare_inputs(model, X)
+    data = merge(X_col_table, (; y))
     wts = check_weights(w, y)
     form = glm_formula(model, features)
-    fitted_glm_frame = GLM.glm(
+    fitted_glm = GLM.glm(
         form, data, model.distribution, model.link;
         offset,
         model.maxiter,
@@ -409,26 +367,23 @@ function MMI.fit(model::LinearCountRegressor, verbosity::Int, X, y, w=nothing)
         model.minstepfac,
         wts
     )
-    fitted_glm = fitted_glm_frame.model
-    fitresult = FitResult(
-        GLM.coef(fitted_glm), GLM.dispersion(fitted_glm), (features = features,)
-    )
+
     # form the report
     report = glm_report(fitted_glm, features, model.report_keys)
     cache = nothing
     # return
-    return fitresult, cache, report
+    return fitted_glm, cache, report
 end
 
 function MMI.fit(model::LinearBinaryClassifier, verbosity::Int, X, y, w=nothing)
     # apply the model
-    decode = y[1]
+    decode = MMI.classes(y)
     y_plain = MMI.int(y) .- 1 # 0, 1 of type Int
     wts = check_weights(w, y_plain)
-    Xmatrix, offset, features = prepare_inputs(model, X)
-    data = glm_data(model, Xmatrix, y_plain, features)
+    X_col_table, offset, features = prepare_inputs(model, X)
+    data = merge(X_col_table, (; y = y_plain))
     form = glm_formula(model, features)
-    fitted_glm_frame = GLM.glm(
+    fitted_glm = GLM.glm(
         form, data, Bernoulli(), model.link;
         offset,
         model.maxiter,
@@ -437,27 +392,22 @@ function MMI.fit(model::LinearBinaryClassifier, verbosity::Int, X, y, w=nothing)
         model.minstepfac,
         wts
     )
-    fitted_glm = fitted_glm_frame.model
-    fitresult = FitResult(
-        GLM.coef(fitted_glm), GLM.dispersion(fitted_glm), (features = features,)
-    )
+
     # form the report
     report = glm_report(fitted_glm, features, model.report_keys)
    cache = nothing
     # return
-    return (fitresult, decode), cache, report
+    return (fitted_glm, decode), cache, report
 end
 
 glm_fitresult(::LinearRegressor, fitresult) = fitresult
 glm_fitresult(::LinearCountRegressor, fitresult) = fitresult
 glm_fitresult(::LinearBinaryClassifier, fitresult) = fitresult[1]
 
-coefs(fr::FitResult) = fr.coefs
-
 function MMI.fitted_params(model::GLM_MODELS, fitresult)
     result = glm_fitresult(model, fitresult)
-    coef = coefs(result)
-    features = copy(params(result).features)
+    coef = GLM.coef(result)
+    features = Symbol.(filter(x -> x!="(Intercept)", GLM.coefnames(result)))
     if model.fit_intercept
         intercept = coef[end]
         coef_ = coef[1:end-1]
@@ -468,11 +418,9 @@ function MMI.fitted_params(model::GLM_MODELS, fitresult)
     return (; features, coef=coef_, intercept)
 end
 
-
 ####
 #### PREDICT FUNCTIONS
 ####
-
 glm_link(model) = model.link
 glm_link(::LinearRegressor) = GLM.IdentityLink()
 
@@ -493,16 +441,22 @@ function MMI.predict_mean(model::GLM_MODELS, fitresult, Xnew)
     return glm_predict(link, coef, Xmatrix, model.offsetcol, offset)
 end
 
-# barrier function to aid performance
-function glm_predict(link, coef, Xmatrix, offsetcol, offset)
-    η = offsetcol === nothing ? (Xmatrix * coef) : (Xmatrix * coef .+ offset)
-    μ = GLM.linkinv.(link, η)
-    return μ
+
+# More efficient fallback. mean is not defined for LinearBinaryClassifier
+function MMI.predict_mean(model::LinearRegressor, fitresult, Xnew)
+    X_col_table, offset, features = prepare_inputs(model, Xnew)
+    p = GLM.predict(fitresult, X_col_table)
+    isempty(offset) ? p : p .+ offset
+end
+
+function MMI.predict_mean(model::LinearCountRegressor, fitresult, Xnew)
+    X_col_table, offset, features = prepare_inputs(model, Xnew)
+    return GLM.predict(fitresult, X_col_table; offset = offset)
 end
 
 function MMI.predict(model::LinearRegressor, fitresult, Xnew)
     μ = MMI.predict_mean(model, fitresult, Xnew)
-    σ̂ = dispersion(fitresult)
+    σ̂ = GLM.dispersion(fitresult.model)
     return [GLM.Normal(μᵢ, σ̂) for μᵢ ∈ μ]
 end
 
@@ -512,8 +466,9 @@ function MMI.predict(model::LinearCountRegressor, fitresult, Xnew)
 end
 
 function MMI.predict(model::LinearBinaryClassifier, (fitresult, decode), Xnew)
-    π = MMI.predict_mean(model, (fitresult, decode), Xnew)
-    return MMI.UnivariateFinite(MMI.classes(decode), π, augment=true)
+    X_col_table, offset, features = prepare_inputs(model, Xnew)
+    π = GLM.predict(fitresult, X_col_table; offset)
+    return MMI.UnivariateFinite(decode, π, augment=true)
 end
 
 # NOTE: predict_mode uses MLJBase's fallback
@@ -539,7 +494,7 @@ metadata_pkg.(
 
 metadata_model(
     LinearRegressor,
-    input = Table(Continuous),
+    input = Table(Continuous, Finite),
     target = AbstractVector{Continuous},
     supports_weights = true,
     path = "$PKG.LinearRegressor"
@@ -547,7 +502,7 @@ metadata_model(
 
 metadata_model(
     LinearBinaryClassifier,
-    input = Table(Continuous),
+    input = Table(Continuous, Finite),
     target = AbstractVector{<:Finite{2}},
     supports_weights = true,
     path = "$PKG.LinearBinaryClassifier"
@@ -555,7 +510,7 @@ metadata_model(
 
 metadata_model(
     LinearCountRegressor,
-    input = Table(Continuous),
+    input = Table(Continuous, Finite),
     target = AbstractVector{Count},
     supports_weights = true,
     path = "$PKG.LinearCountRegressor"
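
[Note on PATCH 1/7] The hand-built design matrix (augment_X, _matrix_and_features)
is replaced by GLM.jl's StatsModels formula interface: fit merges the target into
the feature table and hands GLM a formula, so GLM constructs the model matrix,
the intercept column, and any categorical dummy coding itself. A minimal sketch
of the call pattern, with illustrative names and data not taken from the diff:

    using GLM, StatsModels, Tables

    X    = (x1 = [1.0, 2.0, 3.0],)                  # any Tables.jl table
    data = merge(Tables.columntable(X), (; y = [2.0, 4.0, 6.0]))
    form = FormulaTerm(Term(:y), sum(term.([:x1])) + term(1))  # y ~ x1 + 1
    fitted = GLM.lm(form, data)   # formula-aware wrapper; no `.model` stripping
    GLM.coef(fitted)              # coefficient vector, intercept included

Keeping the wrapper whole (rather than extracting `.model`) preserves the
formula that later prediction relies on.
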
From 068ba59f5f3c60900c18abf358b1e392adc74b73 Mon Sep 17 00:00:00 2001
From: tiemvanderdeure
Date: Wed, 14 Feb 2024 17:20:10 +0100
Subject: [PATCH 2/7] tweak the tests

---
 test/runtests.jl | 52 ++++++++++++++++++++++--------------------------
 1 file changed, 24 insertions(+), 28 deletions(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index a266fc6..6df6da7 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -49,7 +49,7 @@ expit(X) = 1 ./ (1 .+ exp.(-X))
 
     # test `predict` object
     p_distr = predict(atom_ols, fitresult, selectrows(X, test))
-    dispersion = MLJGLMInterface.dispersion(fitresult)
+    dispersion = GLM.dispersion(fitresult.model)
     @test p_distr[1] == Normal(p[1], dispersion)
 
     # test metadata
@@ -103,7 +103,7 @@ end
 
     # check predict on `Xnew` with wrong dims
     Xnew = MLJBase.table(Tables.matrix(X)[:, 1:3], names=Tables.columnnames(X)[1:3])
-    @test_throws DimensionMismatch predict(lr, fitresult, Xnew)
+    @test_throws ErrorException predict(lr, fitresult, Xnew)
 
     fitted_params(pr, fitresult)
 
@@ -158,7 +158,7 @@ end
     Xnew = MLJBase.table(
         Tables.matrix(XTable)[:, 1:3], names=Tables.columnnames(XTable)[1:3]
     )
-    @test_throws DimensionMismatch predict(lcr, fitresult, Xnew)
+    @test_throws ErrorException predict(lcr, fitresult, Xnew)
 
     # Test metadata
     model = lcr
@@ -194,31 +194,14 @@ modeltypes = [LinearRegressor, LinearBinaryClassifier, LinearCountRegressor]
         end
     end
 
-
-    @testset "intercept/offsetcol" for mt in modeltypes
-        X = (x1=[1,2,3], x2=[4,5,6])
-        m = mt(fit_intercept=true, offsetcol=:x2)
-        r = MLJGLMInterface.prepare_inputs(m, X; handle_intercept=true)
-        Xmatrix, offset, features = r
-        @test offset == [4, 5, 6]
-        @test Xmatrix== [1 1;
-                         2 1;
-                         3 1]
-        @test features == [:x1]
-
-        m1 = mt(fit_intercept=true, offsetcol=:x3)
-        @test_throws ArgumentError MLJGLMInterface.prepare_inputs(m1, X)
-    end
-
+    
     @testset "no intercept/no offsetcol" for mt in modeltypes
         X = (x1=[1,2,3], x2=[4,5,6])
         m = mt(fit_intercept=false)
-        r = MLJGLMInterface.prepare_inputs(m, X; handle_intercept=true)
-        Xmatrix, offset, features = r
+        r = MLJGLMInterface.prepare_inputs(m, X)
+        Xcols, offset, features = r
        @test offset == []
-        @test Xmatrix == [1 4;
-                          2 5;
-                          3 6]
+        @test Xcols == (x1 = [1, 2, 3], x2 = [4, 5, 6])
         @test features == [:x1, :x2]
 
         X1 = NamedTuple()
@@ -228,10 +211,10 @@ modeltypes = [LinearRegressor, LinearBinaryClassifier, LinearCountRegressor]
     @testset "offsetcol but no intercept" for mt in modeltypes
         X = (x1=[1,2,3], x2=[4,5,6])
         m = mt(offsetcol=:x1, fit_intercept=false)
-        Xmatrix, offset, features = MLJGLMInterface.prepare_inputs(m, X)
+        Xcols, offset, features = MLJGLMInterface.prepare_inputs(m, X)
 
         @test offset == [1, 2, 3]
-        @test Xmatrix == permutedims([4 5 6])
+        @test Xcols == (x2 = [4, 5, 6],)
         @test features == [:x2]
 
         # throw error for tables with just one column.
@@ -343,7 +326,7 @@ end
     @test :stderror in keys(report)
     @test :dof_residual ∉ keys(report)
     @test :glm_model in keys(report)
-    @test report.glm_model isa GLM.GeneralizedLinearModel
+    @test report.glm_model.model isa GLM.GeneralizedLinearModel
 
     # check that an empty `NamedTuple` is output for
     # `report_params === nothing`
@@ -352,8 +335,21 @@ end
     @test report === NamedTuple()
 end
 
+@testset "Categorical predictors" begin
+    X = (x1=[1.,2.,3.], x2 = categorical([0,1,0]))
+    y = categorical([false, false, true])
+
+    mach = machine(LinearBinaryClassifier(), X, y)
+    fit!(mach)
+
+    fp = fitted_params(mach)
+
+    @test fp.features == Symbol.(["x1", "x2: 0", "x2: 1"])
+    @test_throws KeyError predict(mach, (x1 = [2,3,4], x2 = categorical([0,1,2])))
+    @test all(isapprox.(pdf.(predict(mach, X), true), [0,0,1], atol = 1e-3))
+end
+
 @testset "Issue 27" begin
-    @inferred MLJGLMInterface.augment_X(rand(2, 2), false)
     n, p = (100, 2)
     Xmat = rand(p, n)' # note using adjoint
     X = MLJBase.table(Xmat)
From 4e05569cc401805dbf5b5f796ff8cf1d2704a5b2 Mon Sep 17 00:00:00 2001
From: tiemvanderdeure
Date: Thu, 15 Feb 2024 10:23:02 +0100
Subject: [PATCH 3/7] add FitResult again

---
 src/MLJGLMInterface.jl | 75 +++++++++++++++++++++---------------------
 test/runtests.jl       |  4 +--
 2 files changed, 39 insertions(+), 40 deletions(-)

diff --git a/src/MLJGLMInterface.jl b/src/MLJGLMInterface.jl
index 1869f31..cad5b59 100644
--- a/src/MLJGLMInterface.jl
+++ b/src/MLJGLMInterface.jl
@@ -18,7 +18,7 @@ import MLJModelInterface
 import MLJModelInterface: metadata_pkg, metadata_model, Table, Continuous,
     Count, Finite, OrderedFactor, Multiclass, @mlj_model
 using Distributions: Bernoulli, Distribution, Poisson
-using StatsModels: ConstantTerm, Term, FormulaTerm, term
+using StatsModels: ConstantTerm, Term, FormulaTerm, term, modelcols
 using Tables
 import GLM
@@ -218,7 +218,6 @@ function prepare_inputs(model, X)
     Xminoffset, offset = split_X_offset(Xcols, model.offsetcol)
     features = Tables.columnnames(Xminoffset)
 
-    @show first(Xminoffset)
     check_sample_size(model, length(first(Xminoffset)), p)
 
     return Xminoffset, offset, _to_array(features)
 end
@@ -322,7 +321,9 @@ end
 #### FIT FUNCTIONS
 ####
 
-struct FitResult{V<:AbstractVector, T, R}
+struct FitResult{F, V<:AbstractVector, T, R}
+    "Formula containing all coefficients and their types"
+    formula::F
     "Vector containing coefficients of the predictors and intercept"
     coefs::V
     "An estimate of the dispersion parameter of the glm model. "
     dispersion::T
@@ -331,10 +332,11 @@ struct FitResult{V<:AbstractVector, T, R}
     params::R
 end
 
+FitResult(fitted_glm, features) = FitResult(GLM.formula(fitted_glm), GLM.coef(fitted_glm), GLM.dispersion(fitted_glm.model), (features = features,))
+
 dispersion(fr::FitResult) = fr.dispersion
 params(fr::FitResult) = fr.params
+coefs(fr::FitResult) = fr.coefs
 
 function MMI.fit(model::LinearRegressor, verbosity::Int, X, y, w=nothing)
     # apply the model
@@ -345,11 +347,13 @@ function MMI.fit(model::LinearRegressor, verbosity::Int, X, y, w=nothing)
     form = glm_formula(model, features)
     fitted_lm = GLM.lm(form, data; model.dropcollinear, wts)
 
+    fitresult = FitResult(fitted_lm, features)
+
     # form the report
     report = glm_report(fitted_lm, features, model.report_keys)
     cache = nothing
     # return
-    return fitted_lm, cache, report
+    return fitresult, cache, report
 end
 
@@ -368,11 +372,13 @@ function MMI.fit(model::LinearCountRegressor, verbosity::Int, X, y, w=nothing)
         model.minstepfac,
         wts
     )
 
+    fitresult = FitResult(fitted_glm, features)
+
     # form the report
     report = glm_report(fitted_glm, features, model.report_keys)
     cache = nothing
     # return
-    return fitted_glm, cache, report
+    return fitresult, cache, report
 end
 
@@ -393,21 +399,24 @@ function MMI.fit(model::LinearBinaryClassifier, verbosity::Int, X, y, w=nothing)
         model.minstepfac,
         wts
     )
 
+    fitresult = FitResult(fitted_glm, features)
+
     # form the report
     report = glm_report(fitted_glm, features, model.report_keys)
     cache = nothing
     # return
-    return (fitted_glm, decode), cache, report
+    return (fitresult, decode), cache, report
 end
 
 glm_fitresult(::LinearRegressor, fitresult) = fitresult
 glm_fitresult(::LinearCountRegressor, fitresult) = fitresult
 glm_fitresult(::LinearBinaryClassifier, fitresult) = fitresult[1]
 
+
 function MMI.fitted_params(model::GLM_MODELS, fitresult)
     result = glm_fitresult(model, fitresult)
-    coef = GLM.coef(result)
-    features = Symbol.(filter(x -> x!="(Intercept)", GLM.coefnames(result)))
+    coef = coefs(result)
+    features = copy(params(result).features)
     if model.fit_intercept
         intercept = coef[end]
         coef_ = coef[1:end-1]
@@ -424,39 +433,30 @@ end
 glm_link(model) = model.link
 glm_link(::LinearRegressor) = GLM.IdentityLink()
 
-# more efficient than MLJBase fallback
-function MMI.predict_mean(model::GLM_MODELS, fitresult, Xnew)
-    Xmatrix, offset, _ = prepare_inputs(model, Xnew; handle_intercept=true)
-    result = glm_fitresult(model, fitresult) # ::FitResult
-    coef = coefs(result)
-    p = size(Xmatrix, 2)
-    if p != length(coef)
-        throw(
-            DimensionMismatch(
-                "The number of features in training and prediction datasets must be equal"
-            )
-        )
-    end
-    link = glm_link(model)
-    return glm_predict(link, coef, Xmatrix, model.offsetcol, offset)
+function glm_predict(link, terms, coef, offsetcol::Nothing, Xnew)
+    mm = modelcols(terms, Xnew)
+    η = mm * coef
+    μ = GLM.linkinv.(link, η)
+    return μ
 end
 
+function glm_predict(link, terms, coef, offsetcol::Symbol, Xnew)
+    mm = modelcols(terms, Xnew)
+    offset = Tables.getcolumn(Xnew, offsetcol)
+    η = mm * coef .+ offset
+    μ = GLM.linkinv.(link, η)
+    return μ
+end
 
-# More efficient fallback. mean is not defined for LinearBinaryClassifier
-function MMI.predict_mean(model::LinearRegressor, fitresult, Xnew)
-    X_col_table, offset, features = prepare_inputs(model, Xnew)
-    p = GLM.predict(fitresult, X_col_table)
-    isempty(offset) ? p : p .+ offset
-end
-
-function MMI.predict_mean(model::LinearCountRegressor, fitresult, Xnew)
-    X_col_table, offset, features = prepare_inputs(model, Xnew)
-    return GLM.predict(fitresult, X_col_table; offset = offset)
+# More efficient fallback. predict_mean is not defined for LinearBinaryClassifier
+function MMI.predict_mean(model::Union{LinearRegressor, LinearCountRegressor}, fitresult, Xnew)
+    p = glm_predict(glm_link(model), fitresult.formula.rhs, fitresult.coefs, model.offsetcol, Xnew)
+    return p
 end
 
 function MMI.predict(model::LinearRegressor, fitresult, Xnew)
     μ = MMI.predict_mean(model, fitresult, Xnew)
-    σ̂ = GLM.dispersion(fitresult.model)
+    σ̂ = dispersion(fitresult)
     return [GLM.Normal(μᵢ, σ̂) for μᵢ ∈ μ]
 end
 
@@ -466,9 +466,8 @@ function MMI.predict(model::LinearCountRegressor, fitresult, Xnew)
 end
 
 function MMI.predict(model::LinearBinaryClassifier, (fitresult, decode), Xnew)
-    X_col_table, offset, features = prepare_inputs(model, Xnew)
-    π = GLM.predict(fitresult, X_col_table; offset)
-    return MMI.UnivariateFinite(decode, π, augment=true)
+    p = glm_predict(glm_link(model), fitresult.formula.rhs, fitresult.coefs, model.offsetcol, Xnew)
+    return MMI.UnivariateFinite(decode, p, augment=true)
 end
 
 # NOTE: predict_mode uses MLJBase's fallback
diff --git a/test/runtests.jl b/test/runtests.jl
index 6df6da7..bb383f2 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -49,7 +49,7 @@ expit(X) = 1 ./ (1 .+ exp.(-X))
 
     # test `predict` object
     p_distr = predict(atom_ols, fitresult, selectrows(X, test))
-    dispersion = GLM.dispersion(fitresult.model)
+    dispersion = MLJGLMInterface.dispersion(fitresult)
     @test p_distr[1] == Normal(p[1], dispersion)
@@ -344,7 +344,7 @@ end
 
     fp = fitted_params(mach)
 
-    @test fp.features == Symbol.(["x1", "x2: 0", "x2: 1"])
+    @test fp.features == [:x1, :x2]
     @test_throws KeyError predict(mach, (x1 = [2,3,4], x2 = categorical([0,1,2])))
     @test all(isapprox.(pdf.(predict(mach, X), true), [0,0,1], atol = 1e-3))
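
[Note on PATCH 3/7] FitResult returns, now carrying the fitted formula so that
prediction can rebuild the design matrix with StatsModels.modelcols instead of
round-tripping through prepare_inputs. The glm_predict pair above boils down to
the following sketch (names illustrative; `rhs` is the schema-applied right-hand
side stored in fitresult.formula.rhs):

    mm = modelcols(rhs, Tables.columntable(Xnew))  # design matrix incl. dummies
    η  = mm * fitresult.coefs                      # linear predictor
    μ  = GLM.linkinv.(link, η)                     # mean response

Applying the stored schema is also what makes prediction throw a KeyError when
Xnew contains categorical levels unseen during fit, as the tests assert.
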
From 8984846e4ed0123c493d57dead84b2a44f2d590c Mon Sep 17 00:00:00 2001
From: tiemvanderdeure
Date: Fri, 16 Feb 2024 13:49:25 +0100
Subject: [PATCH 4/7] intercept as first term

---
 src/MLJGLMInterface.jl | 18 ++++--------------
 test/runtests.jl       |  9 ++-------
 2 files changed, 6 insertions(+), 21 deletions(-)

diff --git a/src/MLJGLMInterface.jl b/src/MLJGLMInterface.jl
index cad5b59..a30d077 100644
--- a/src/MLJGLMInterface.jl
+++ b/src/MLJGLMInterface.jl
@@ -259,15 +259,6 @@ function glm_report(glm_model, features, reportkeys)
     return NamedTuple{Tuple(keys(report_dict))}(values(report_dict))
 end
 
-function _ensure_intercept_at_end(form::FormulaTerm)
-    fixed_rhs = if first(form.rhs) isa ConstantTerm
-        (form.rhs[2:end]..., form.rhs[1])
-    else
-        form.rhs
-    end
-    return FormulaTerm(form.lhs, fixed_rhs)
-end
-
 """
     glm_formula(model, features::AbstractVector{Symbol}) -> FormulaTerm
 
@@ -278,9 +269,8 @@ function glm_formula(model, features::AbstractVector{Symbol})::FormulaTerm
     # Adding a zero term explicitly disables the intercept.
     # See the StatsModels.jl tests for more information.
     intercept_term = model.fit_intercept ? 1 : 0
-    form = FormulaTerm(Term(:y), sum(term.(features)) + term(intercept_term))
-    fixed_form = _ensure_intercept_at_end(form)
-    return fixed_form
+    form = FormulaTerm(Term(:y), term(intercept_term) + sum(term.(features)))
+    return form
 end
 
@@ -418,8 +408,8 @@ function MMI.fitted_params(model::GLM_MODELS, fitresult)
     coef = coefs(result)
     features = copy(params(result).features)
     if model.fit_intercept
-        intercept = coef[end]
-        coef_ = coef[1:end-1]
+        intercept = coef[1]
+        coef_ = coef[2:end]
     else
         intercept = zero(eltype(coef))
         coef_ = copy(coef)
diff --git a/test/runtests.jl b/test/runtests.jl
index bb383f2..f5ead7e 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -294,8 +294,8 @@ end
     fitresult, _, report = fit(lr, 1, X, y)
     ctable = last(report)
     parameters = ctable.rownms # Row names.
-    @test parameters == ["a", "b", "c", "(Intercept)"]
-    intercept = ctable.cols[1][4]
+    @test parameters == ["(Intercept)", "a", "b", "c"]
+    intercept = ctable.cols[1][1]
 
     yhat = predict(lr, fitresult, X)
     @test cross_entropy(yhat, y) < 0.6
@@ -359,8 +359,3 @@ end
     fit(lr, 1, X, y)
 end
 
-@testset "Issue 34" begin
-    model = LinearRegressor()
-    form = MLJGLMInterface.glm_formula(model, [:a, :b]) |> string
-    @test form == "y ~ a + b + 1"
-end
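
[Note on PATCH 4/7] Emitting the intercept as the first formula term makes the
reshuffling helper unnecessary and moves the intercept from coef[end] to
coef[1]. Sketch of the difference (illustrative feature names):

    # before: y ~ a + b + 1  ->  coefficients ordered [a, b, intercept]
    # after:  y ~ 1 + a + b  ->  coefficients ordered [intercept, a, b]
    form = FormulaTerm(Term(:y), term(1) + sum(term.([:a, :b])))

which is why fitted_params and the coeftable test now index the intercept at
position 1, and why the old "y ~ a + b + 1" string test is dropped.
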
From 88da66166bdb81df415573bf82c2f1b2bd36f948 Mon Sep 17 00:00:00 2001
From: tiemvanderdeure
Date: Fri, 16 Feb 2024 14:03:38 +0100
Subject: [PATCH 5/7] more tests for categorical variables

---
 test/runtests.jl | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index f5ead7e..22bccbf 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -336,17 +336,27 @@ end
 
 @testset "Categorical predictors" begin
-    X = (x1=[1.,2.,3.], x2 = categorical([0,1,0]))
-    y = categorical([false, false, true])
+    X = (x1=[1.,2.,3.,4.], x2 = categorical([0,1,0,0]), x3 = categorical([1, 0, 1,0], ordered=true))
+    y = categorical([false, false, true, true])
 
     mach = machine(LinearBinaryClassifier(), X, y)
     fit!(mach)
-
     fp = fitted_params(mach)
 
-    @test fp.features == [:x1, :x2]
-    @test_throws KeyError predict(mach, (x1 = [2,3,4], x2 = categorical([0,1,2])))
-    @test all(isapprox.(pdf.(predict(mach, X), true), [0,0,1], atol = 1e-3))
+    @test fp.features == [:x1, :x2, :x3]
+    @test_throws KeyError predict(mach, (x1 = [2,3,4], x2 = categorical([0,1,2]), x3 = categorical([1,0,1], ordered=true)))
+    @test all(isapprox.(pdf.(predict(mach, X), true), [0,0,1,1], atol = 1e-3))
+
+    # only a categorical variable, with and without intercept
+    X2 = (; x = X.x2)
+    y2 = [0., 2., 1., 2.]
+
+    fitresult, _, report = fit(LinearRegressor(), 1, X2, y2)
+    pred = predict_mean(LinearRegressor(), fitresult, X2)
+    fitresult_nointercept, _, report = fit(LinearRegressor(fit_intercept = false), 1, X2, y2)
+
+    @test all(isapprox.(fitresult.coefs, [1.0, 1.0]))
+    @test all(isapprox.(fitresult_nointercept.coefs, [1.0, 2.0]))
+
 end
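
[Note on PATCH 5/7] The expected coefficients in the categorical-only test
follow from group means. For x = categorical([0,1,0,0]) and y2 = [0., 2., 1., 2.],
level 0 covers rows 1, 3, 4 and level 1 covers row 2, so (illustrative check):

    using Statistics
    mean([0., 1., 2.])   # 1.0 -> intercept (level-0 baseline) with fit_intercept=true
    2.0 - 1.0            # 1.0 -> level-1 dummy coefficient, hence coefs ≈ [1.0, 1.0]
                         # without an intercept both level means appear: ≈ [1.0, 2.0]
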
From 84b41d4793cfea4eba02b83ef447fb7085012252 Mon Sep 17 00:00:00 2001
From: tiemvanderdeure
Date: Fri, 16 Feb 2024 14:17:11 +0100
Subject: [PATCH 6/7] remove unused function glm_data

---
 src/MLJGLMInterface.jl | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/src/MLJGLMInterface.jl b/src/MLJGLMInterface.jl
index a30d077..668097c 100644
--- a/src/MLJGLMInterface.jl
+++ b/src/MLJGLMInterface.jl
@@ -273,16 +273,6 @@ function glm_formula(model, features::AbstractVector{Symbol})::FormulaTerm
     return form
 end
 
-"""
-    glm_data(model, Xmatrix, y, features)
-
-Return data which is ready to be passed to `fit(form, data, ...)`.
-"""
-function glm_data(model, Xmatrix, y, features)
-    data = Tables.table([Xmatrix y]; header=[features...; :y])
-    return data
-end
-
 """
     check_weights(w, y)
From 51d7d52d06bb24f6583c53ced6715914ce8aabe9 Mon Sep 17 00:00:00 2001
From: tiemvanderdeure
Date: Sun, 25 Feb 2024 14:49:50 +0300
Subject: [PATCH 7/7] add a test for predict with offset

---
 test/runtests.jl | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 22bccbf..b4b8631 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -229,12 +229,17 @@ end
 
 @testset "Test offsetting models" begin
     @testset "Test split_X_offset" begin
-        rng = StableRNGs.StableRNG(123)
-        N = 100
         X = (x1=[1,2,3], x2=[4,5,6])
         @test MLJGLMInterface.split_X_offset(X, nothing) == (X, Float64[])
         @test MLJGLMInterface.split_X_offset(X, :x1) == ((x2=[4,5,6],), [1,2,3])
 
+        lr = LinearRegressor(fit_intercept = false, offsetcol = :x1)
+        fitresult, _, report = fit(lr, 1, X, [5, 7, 9])
+        yhat = predict_mean(lr, fitresult, (x1 = [2, 3, 4], x2 = [5, 6, 7]))
+        @test yhat == [7.0, 9.0, 11.0]
+
+        rng = StableRNGs.StableRNG(123)
+        N = 100
         X = MLJBase.table(rand(rng, N, 3))
         Xnew, offset = MLJGLMInterface.split_X_offset(X, :x2)
         @test offset isa Vector
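
[Note on PATCH 7/7] The new offset assertion can be checked by hand: with
fit_intercept=false and offsetcol=:x1, training solves y .- x1 == [4, 5, 6]
against x2 == [4, 5, 6], giving a single coefficient of 1.0. Predicting on
x2 = [5, 6, 7] with offset x1 = [2, 3, 4] then yields (sketch):

    1.0 .* [5, 6, 7] .+ [2, 3, 4]   # == [7.0, 9.0, 11.0], matching the test
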