From c56c03e5b79b093c184bf7e74a55e794276c54fc Mon Sep 17 00:00:00 2001 From: rofinn Date: Wed, 21 Oct 2020 13:01:57 -0500 Subject: [PATCH] Separate Threshold and WeightedThreshold closing #74 --- src/functional.jl | 48 ++++++++++++++++++++++++++--- src/validators/threshold.jl | 61 ++++++++++++++++++++++++------------- test/runtests.jl | 2 ++ test/validators.jl | 20 ++++++------ 4 files changed, 95 insertions(+), 36 deletions(-) diff --git a/src/functional.jl b/src/functional.jl index c8d0018..45e8f9a 100644 --- a/src/functional.jl +++ b/src/functional.jl @@ -37,6 +37,7 @@ end const global validation_methods = ( threshold = Threshold, + wthreshold = WeightedThreshold, ) const global imputation_methods = ( @@ -90,9 +91,9 @@ filter(f::Function, data; kwargs...) = apply(data, Filter(f); kwargs...) filter!(f::Function, data; kwargs...) = apply!(data, Filter(f); kwargs...) @doc """ - Impute.threshold(data; ratio=0.1, weights=nothing, kwargs...) + Impute.threshold(data; limit=0.1, kwargs...) -Assert that proportion of missing values in the `data` do not exceed the `ratio`. +Assert that the proportion of missing values in the `data` does not exceed the `limit`. # Examples ```julia-repl @@ -110,11 +111,11 @@ julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, │ 5 │ 5.0 │ 5.5 │ julia> Impute.threshold(df) -ERROR: ThresholdError: Ratio of missing values exceeded 0.1 (0.4) +ERROR: ThresholdError: Missing data limit exceeded 0.1 (0.4) Stacktrace: ... -julia> Impute.threshold(df; ratio=0.8) +julia> Impute.threshold(df; limit=0.8) 5×2 DataFrames.DataFrame │ Row │ a │ b │ │ │ Float64 │ Float64 │ @@ -128,6 +129,45 @@ julia> Impute.threshold(df; ratio=0.8) """ threshold +@doc """ + Impute.wthreshold(data; limit, weights, kwargs...) + +Assert that the weighted proportion of missing values in the `data` does not exceed the `limit`. 
+ +# Examples +```julia-repl +julia> using DataFrames, Impute + +julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5]) +5×2 DataFrames.DataFrame +│ Row │ a │ b │ +│ │ Float64 │ Float64 │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ missing │ 3.3 │ +│ 4 │ missing │ missing │ +│ 5 │ 5.0 │ 5.5 │ + +julia> Impute.wthreshold(df; limit=0.4, weights=0.1:0.1:0.5) +ERROR: ThresholdError: Missing data limit exceeded 0.4 (0.4666666666666666) +Stacktrace: +... + +julia> Impute.wthreshold(df; limit=0.4, weights=0.5:-0.1:0.1) +5×2 DataFrames.DataFrame +│ Row │ a │ b │ +│ │ Float64 │ Float64 │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ missing │ 3.3 │ +│ 4 │ missing │ missing │ +│ 5 │ 5.0 │ 5.5 │ +``` +""" +wthreshold + @doc """ Impute.dropobs(data; dims=1) diff --git a/src/validators/threshold.jl b/src/validators/threshold.jl index 10937dc..d3cd56c 100644 --- a/src/validators/threshold.jl +++ b/src/validators/threshold.jl @@ -15,44 +15,61 @@ end function Base.showerror(io::IO, err::ThresholdError) println( io, - "ThresholdError: Ratio of missing values exceeded $(err.limit) ($(err.value))", + "ThresholdError: Missing data limit exceeded $(err.limit) ($(err.value))", ) end """ - Threshold(; ratio=0.1, weights=nothing) + Threshold(; limit=0.1) -Assert that the ratio of missing values in the provided dataset does not exceed to specified ratio. -If a weights array is provided then the ratio will be calculated as the -`sum(weights[ismissing.(data)]) / sum(weights)` +Assert that the ratio of missing values in the provided dataset does not exceed the +specified limit. # Keyword Arguments -* `ratio::Real`: Allowed proportion of missing values (should be between 0.0 and 1.0). -* `weights::AbstractWeights`: A set of statistical weights to use when evaluating the importance - of each observation. If present a weighted ratio of missing values will be calculated. 
+* `limit::Real`: Allowed proportion of missing values (should be between 0.0 and 1.0). """ struct Threshold <: Validator - ratio::Float64 + limit::Float64 weights::Union{AbstractWeights, Nothing} end -Threshold(; ratio=0.1, weights=nothing) = Threshold(ratio, weights) +Threshold(; limit=0.1, weights=nothing) = Threshold(limit, weights) function _validate(data::AbstractArray{Union{T, Missing}}, t::Threshold) where T - mratio = if t.weights === nothing - count(ismissing, data) / length(data) - else - if size(data) != size(t.weights) - throw(DimensionMismatch(string( - "Input has dimensions $(size(data)), but thresholds weights ", - "has dimensions $(size(t.weights))" - ))) - end - - sum(t.weights[ismissing.(data)]) / sum(t.weights) + mratio = count(ismissing, data) / length(data) + mratio > t.limit && throw(ThresholdError(t.limit, mratio)) + return data +end + +""" + WeightedThreshold(; limit, weights) + +Assert that the weighted proportion of missing values in the provided dataset does not +exceed the specified limit. The weighted proportion is calculated as +`sum(weights[ismissing.(data)]) / sum(weights)` + +# Keyword Arguments +* `limit::Real`: Allowed proportion of missing values (should be between 0.0 and 1.0). +* `weights::AbstractArray{<:Real}`: A set of statistical weights to use when evaluating the + importance of each observation. 
+""" +struct WeightedThreshold{W <: AbstractArray{<:Real}} <: Validator + limit::Float64 + weights::W +end + +WeightedThreshold(; limit, weights) = WeightedThreshold(limit, weights) + +function _validate(data::AbstractArray{Union{T, Missing}}, wt::WeightedThreshold) where T + if size(data) != size(wt.weights) + throw(DimensionMismatch(string( + "Input has dimensions $(size(data)), but thresholds weights ", + "has dimensions $(size(wt.weights))" + ))) end - mratio > t.ratio && throw(ThresholdError(t.ratio, mratio)) + val = sum(wt.weights[ismissing.(data)]) / sum(wt.weights) + val > wt.limit && throw(ThresholdError(wt.limit, val)) return data end diff --git a/test/runtests.jl b/test/runtests.jl index b54b70f..d83b44e 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -34,6 +34,7 @@ using Impute: SVD, Filter, Threshold, + WeightedThreshold, ThresholdError, apply, impute, @@ -41,6 +42,7 @@ using Impute: interp, run, threshold, + wthreshold, validate diff --git a/test/validators.jl b/test/validators.jl index ff25d7f..03ac8af 100644 --- a/test/validators.jl +++ b/test/validators.jl @@ -21,7 +21,7 @@ table.sin[[2, 3, 7, 12, 19]] .= missing @testset "Base" begin - t = Threshold(; ratio=0.1) + t = Threshold(; limit=0.1) @test_throws ThresholdError validate(a, t) @test_throws ThresholdError validate(m, t) @test_throws ThresholdError validate(aa, t) @@ -34,9 +34,9 @@ sprint(showerror, e) end - @test msg == "ThresholdError: Ratio of missing values exceeded 0.1 (0.15)\n" + @test msg == "ThresholdError: Missing data limit exceeded 0.1 (0.15)\n" - t = Threshold(; ratio=0.8) + t = Threshold(; limit=0.8) # Use isequal because we expect the results to contain missings @test isequal(validate(a, t), a) @test isequal(validate(m, t), m) @@ -47,20 +47,20 @@ @testset "Weighted" begin # If we use an exponentially weighted context then we won't pass the limit # because missing earlier observations is less important than later ones. 
- t = Threshold(; ratio=0.8, weights=eweights(20, 0.3)) + t = WeightedThreshold(; limit=0.8, weights=eweights(20, 0.3)) @test isequal(validate(a, t), a) @test isequal(validate(table, t), table) - @test isequal(threshold(m; ratio=0.8, weights=eweights(5, 0.3), dims=:cols), m) - @test isequal(threshold(m; ratio=0.8, weights=eweights(5, 0.3), dims=:cols), aa) + @test isequal(wthreshold(m; limit=0.8, weights=eweights(5, 0.3), dims=:cols), m) + @test isequal(wthreshold(m; limit=0.8, weights=eweights(5, 0.3), dims=:cols), aa) # If we reverse the weights such that earlier observations are more important # then our previous limit of 0.2 won't be enough to succeed. - t = Threshold(; ratio=0.1, weights=reverse!(eweights(20, 0.3))) + t = WeightedThreshold(; limit=0.1, weights=reverse!(eweights(20, 0.3))) @test_throws ThresholdError validate(a, t) @test_throws ThresholdError validate(table, t) - t = Threshold(; ratio=0.1, weights=reverse!(eweights(5, 0.3))) + t = WeightedThreshold(; limit=0.1, weights=reverse!(eweights(5, 0.3))) @test_throws ThresholdError validate(m, t; dims=:cols) @test_throws ThresholdError validate(aa, t; dims=:cols) @@ -69,8 +69,8 @@ end @testset "functional" begin - @test_throws ThresholdError Impute.threshold(a; ratio=0.1) + @test_throws ThresholdError Impute.threshold(a; limit=0.1) # Use isequal because we expect the results to contain missings - @test isequal(Impute.threshold(a; ratio=0.8), a) + @test isequal(Impute.threshold(a; limit=0.8), a) end end