From 7de046f98d528780226672f8f7e7cabe8b2f9448 Mon Sep 17 00:00:00 2001
From: xynady
Date: Fri, 13 Oct 2023 16:44:57 +0700
Subject: [PATCH 1/4] Float64 replaced by AbstractFloat for regression

---
 src/classification/main.jl | 14 +++++---------
 src/measures.jl            |  6 +++---
 src/regression/main.jl     |  8 ++++----
 src/regression/tree.jl     | 14 +++++++-------
 4 files changed, 19 insertions(+), 23 deletions(-)

diff --git a/src/classification/main.jl b/src/classification/main.jl
index 3c2bdfae..07bf3e1f 100644
--- a/src/classification/main.jl
+++ b/src/classification/main.jl
@@ -93,7 +93,7 @@ function update_pruned_impurity!(
     feature_importance::Vector{Float64},
     ntt::Int,
     loss::Function=mean_squared_error,
-) where {S,T<:Float64}
+) where {S,T<:AbstractFloat}
     μl = mean(tree.left.values)
     nl = length(tree.left.values)
     μr = mean(tree.right.values)
@@ -220,7 +220,7 @@ See also [`build_tree`](@ref).
 function prune_tree(
     tree::Union{Root{S,T},LeafOrNode{S,T}},
     purity_thresh=1.0,
-    loss::Function=T <: Float64 ? mean_squared_error : util.entropy,
+    loss::Function=T <: AbstractFloat ? mean_squared_error : util.entropy,
 ) where {S,T}
     if purity_thresh >= 1.0
         return tree
@@ -293,11 +293,7 @@ function apply_tree(tree::LeafOrNode{S,T}, features::AbstractMatrix{S}) where {S
     for i in 1:N
         predictions[i] = apply_tree(tree, features[i, :])
     end
-    if T <: Float64
-        return Float64.(predictions)
-    else
-        return predictions
-    end
+    return predictions
 end
 
 """
@@ -343,7 +339,7 @@ end
 
 Train a random forest model, built on standard CART decision trees, using the
 specified `labels` (target) and `features` (patterns). Here:
 
-- `labels` is any `AbstractVector`. If the element type is `Float64`, regression is
+- `labels` is any `AbstractVector`. If the element type is a subtype of `AbstractFloat`, regression is
   applied, and otherwise classification is applied.
 - `features` is any `AbstractMatrix{T}` where `T` supports ordering with `<` (unordered
@@ -619,7 +615,7 @@ function apply_forest(forest::Ensemble{S,T}, features::AbstractVector{S}) where
         votes[i] = apply_tree(forest.trees[i], features)
     end
 
-    if T <: Float64
+    if T <: AbstractFloat
         return mean(votes)
     else
         return majority_vote(votes)
diff --git a/src/measures.jl b/src/measures.jl
index f24653cd..6f3b3498 100644
--- a/src/measures.jl
+++ b/src/measures.jl
@@ -269,7 +269,7 @@ function _nfoldCV(
     args...;
     verbose,
     rng,
-) where {T<:Float64}
+) where {T<:AbstractFloat}
     _rng = mk_rng(rng)::Random.AbstractRNG
     nfolds = args[1]
     if nfolds < 2
@@ -361,7 +361,7 @@ function nfoldCV_tree(
     min_purity_increase::Float64=0.0;
     verbose::Bool=true,
     rng=Random.GLOBAL_RNG,
-) where {S,T<:Float64}
+) where {S,T<:AbstractFloat}
     _nfoldCV(
         :tree,
         labels,
@@ -389,7 +389,7 @@ function nfoldCV_forest(
     min_purity_increase::Float64=0.0;
     verbose::Bool=true,
     rng=Random.GLOBAL_RNG,
-) where {S,T<:Float64}
+) where {S,T<:AbstractFloat}
     _nfoldCV(
         :forest,
         labels,
diff --git a/src/regression/main.jl b/src/regression/main.jl
index 5e4e89f7..1eaf7a4f 100644
--- a/src/regression/main.jl
+++ b/src/regression/main.jl
@@ -1,6 +1,6 @@
 include("tree.jl")
 
-function _convert(node::treeregressor.NodeMeta{S}, labels::Array{T}) where {S,T<:Float64}
+function _convert(node::treeregressor.NodeMeta{S}, labels::Array{T}) where {S,T<:AbstractFloat}
     if node.is_leaf
         return Leaf{T}(node.label, labels[node.region])
     else
@@ -27,7 +27,7 @@ function build_stump(
     features::AbstractMatrix{S};
     rng=Random.GLOBAL_RNG,
     impurity_importance::Bool=true,
-) where {S,T<:Float64}
+) where {S,T<:AbstractFloat}
     return build_tree(labels, features, 0, 1; rng, impurity_importance)
 end
 
@@ -41,7 +41,7 @@ function build_tree(
     min_purity_increase=0.0;
     rng=Random.GLOBAL_RNG,
     impurity_importance::Bool=true,
-) where {S,T<:Float64}
+) where {S,T<:AbstractFloat}
     if max_depth == -1
         max_depth = typemax(Int)
     end
@@ -85,7 +85,7 @@ function build_forest(
     min_purity_increase=0.0;
     rng::Union{Integer,AbstractRNG}=Random.GLOBAL_RNG,
     impurity_importance::Bool=true,
-) where {S,T<:Float64}
+) where {S,T<:AbstractFloat}
     if n_trees < 1
         throw("the number of trees must be >= 1")
     end
diff --git a/src/regression/tree.jl b/src/regression/tree.jl
index 5a9ae4c9..9fd39339 100644
--- a/src/regression/tree.jl
+++ b/src/regression/tree.jl
@@ -47,7 +47,7 @@ end
 # (max_depth, min_samples_split, min_purity_increase)
 function _split!(
     X::AbstractMatrix{S}, # the feature array
-    Y::AbstractVector{Float64}, # the label array
+    Y::AbstractVector{T}, # the label array
     W::AbstractVector{U},
     node::NodeMeta{S}, # the node to split
     max_features::Int, # number of features to consider
@@ -59,10 +59,10 @@ function _split!(
     # we split using samples in indX[node.region]
     # the two arrays below are given for optimization purposes
     Xf::AbstractVector{S},
-    Yf::AbstractVector{Float64},
+    Yf::AbstractVector{T},
     Wf::AbstractVector{U},
     rng::Random.AbstractRNG,
-) where {S,U}
+) where {S,T<:AbstractFloat,U}
     region = node.region
     n_samples = length(region)
     r_start = region.start - 1
@@ -245,7 +245,7 @@ end
 
 function _fit(
     X::AbstractMatrix{S},
-    Y::AbstractVector{Float64},
+    Y::AbstractVector{T},
     W::AbstractVector{U},
     max_features::Int,
     max_depth::Int,
@@ -253,10 +253,10 @@ function _fit(
     min_samples_split::Int,
     min_purity_increase::Float64,
     rng=Random.GLOBAL_RNG::Random.AbstractRNG,
-) where {S,U}
+) where {S,T<:AbstractFloat,U}
     n_samples, n_features = size(X)
 
-    Yf = Array{Float64}(undef, n_samples)
+    Yf = Array{T}(undef, n_samples)
     Xf = Array{S}(undef, n_samples)
     Wf = Array{U}(undef, n_samples)
 
@@ -293,7 +293,7 @@ end
 
 function fit(;
     X::AbstractMatrix{S},
-    Y::AbstractVector{Float64},
+    Y::AbstractVector{<:AbstractFloat},
     W::Union{Nothing,AbstractVector{U}},
     max_features::Int,
     max_depth::Int,

From 385a066f05e020cfdb716846d12793661e5f67ad Mon Sep 17 00:00:00 2001
From: xynady
Date: Mon, 16 Oct 2023 15:30:00 +0700
Subject: [PATCH 2/4] Test for forest regression with Float16 added

---
 test/regression/low_precision.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/regression/low_precision.jl b/test/regression/low_precision.jl
index 115558cd..bdd9ab85 100644
--- a/test/regression/low_precision.jl
+++ b/test/regression/low_precision.jl
@@ -113,6 +113,7 @@
 model = build_forest(labels, features)
 preds = apply_forest(model, features)
 @test typeof(preds) == Vector{Float16}
+@test !all(x->(x in labels), preds)
 
 preds_MT = apply_forest(model, features; use_multithreading=true)
 @test typeof(preds_MT) == Vector{Float16}

From 37a190ec0e430aa019d764f006acc9d59f945ee9 Mon Sep 17 00:00:00 2001
From: Rik Huijzer
Date: Mon, 16 Oct 2023 12:02:02 +0200
Subject: [PATCH 3/4] Update test/regression/low_precision.jl

---
 test/regression/low_precision.jl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/regression/low_precision.jl b/test/regression/low_precision.jl
index bdd9ab85..6e72f8aa 100644
--- a/test/regression/low_precision.jl
+++ b/test/regression/low_precision.jl
@@ -113,6 +113,8 @@
 model = build_forest(labels, features)
 preds = apply_forest(model, features)
 @test typeof(preds) == Vector{Float16}
+# Verify that the `preds` were calculated based on `labels` of the same type.
+# If the code at some point converts the numbers to, say, `Float64`, then this test will fail.
 @test !all(x->(x in labels), preds)
 
 preds_MT = apply_forest(model, features; use_multithreading=true)
 @test typeof(preds_MT) == Vector{Float16}

From 00fd6cb6fd30310daa6e0bf5e7ce93b0c82eb7be Mon Sep 17 00:00:00 2001
From: Rik Huijzer
Date: Mon, 16 Oct 2023 12:04:06 +0200
Subject: [PATCH 4/4] Set version to 0.12.4

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 083e562e..d715fb3d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -2,7 +2,7 @@ name = "DecisionTree"
 uuid = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb"
 license = "MIT"
 desc = "Julia implementation of Decision Tree (CART) and Random Forest algorithms"
-version = "0.12.3"
+version = "0.12.4"
 
 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
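
A quick usage sketch of what this series enables: regression now dispatches on any label
element type `T <: AbstractFloat` instead of exactly `Float64`, and predictions keep that
element type rather than being widened to `Float64`. The data below is invented for
illustration (the sizes, random values, and seed are arbitrary); `build_forest` and
`apply_forest` are the package functions exercised by the new Float16 test.

    using DecisionTree
    using Random

    Random.seed!(42)  # arbitrary seed, only to make the sketch reproducible

    # Hypothetical toy data; any AbstractFloat element type now selects regression.
    n = 100
    features = rand(Float16, n, 5)  # feature matrix, one row per sample
    labels = rand(Float16, n)       # low-precision regression targets

    forest = build_forest(labels, features)
    preds = apply_forest(forest, features)

    # Predictions stay Float16 instead of being widened to Float64,
    # which is what the assertions added to low_precision.jl check.
    @assert preds isa Vector{Float16}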