Skip to content

Commit

Permalink
Separate Threshold and WeightedThreshold closing #74
Browse files Browse the repository at this point in the history
  • Loading branch information
rofinn committed Oct 21, 2020
1 parent 2de3f7c commit c56c03e
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 36 deletions.
48 changes: 44 additions & 4 deletions src/functional.jl
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ end

# Named tuple pairing each lowercase validation method name with its `Validator` type.
const global validation_methods = (threshold = Threshold, wthreshold = WeightedThreshold)

const global imputation_methods = (
Expand Down Expand Up @@ -90,9 +91,9 @@ filter(f::Function, data; kwargs...) = apply(data, Filter(f); kwargs...)
# Forward to `apply!` with `f` wrapped in a `Filter`; keyword arguments pass through unchanged.
function filter!(f::Function, data; kwargs...)
    return apply!(data, Filter(f); kwargs...)
end

@doc """
Impute.threshold(data; ratio=0.1, weights=nothing, kwargs...)
Impute.threshold(data; limit=0.1, kwargs...)
Assert that proportion of missing values in the `data` do not exceed the `ratio`.
Assert that the proportion of missing values in the `data` does not exceed the `limit`.
# Examples
```julia-repl
Expand All @@ -110,11 +111,11 @@ julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2,
│ 5 │ 5.0 │ 5.5 │
julia> Impute.threshold(df)
ERROR: ThresholdError: Ratio of missing values exceeded 0.1 (0.4)
ERROR: ThresholdError: Missing data limit exceeded 0.1 (0.4)
Stacktrace:
...
julia> Impute.threshold(df; ratio=0.8)
julia> Impute.threshold(df; limit=0.8)
5×2 DataFrames.DataFrame
│ Row │ a │ b │
│ │ Float64 │ Float64 │
Expand All @@ -128,6 +129,45 @@ julia> Impute.threshold(df; ratio=0.8)
"""
threshold

@doc """
Impute.wthreshold(data; limit, weights, kwargs...)
Assert that the weighted proportion of missing values in the `data` does not exceed the `limit`.
# Examples
```julia-repl
julia> using DataFrames, Impute
julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5])
5×2 DataFrames.DataFrame
│ Row │ a │ b │
│ │ Float64 │ Float64 │
├─────┼──────────┼──────────┤
│ 1 │ 1.0 │ 1.1 │
│ 2 │ 2.0 │ 2.2 │
│ 3 │ missing │ 3.3 │
│ 4 │ missing │ missing │
│ 5 │ 5.0 │ 5.5 │
julia> Impute.wthreshold(df; limit=0.4, weights=0.1:0.1:0.5)
ERROR: ThresholdError: Missing data limit exceeded 0.4 (0.4666666666666666)
Stacktrace:
...
julia> Impute.wthreshold(df; limit=0.4, weights=0.5:-0.1:0.1)
5×2 DataFrames.DataFrame
│ Row │ a │ b │
│ │ Float64 │ Float64 │
├─────┼──────────┼──────────┤
│ 1 │ 1.0 │ 1.1 │
│ 2 │ 2.0 │ 2.2 │
│ 3 │ missing │ 3.3 │
│ 4 │ missing │ missing │
│ 5 │ 5.0 │ 5.5 │
```
"""
wthreshold

@doc """
Impute.dropobs(data; dims=1)
Expand Down
61 changes: 39 additions & 22 deletions src/validators/threshold.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,44 +15,61 @@ end
# Render a `ThresholdError` as one line reporting the configured limit followed by the
# offending value in parentheses. `println` is used, so the message ends with a newline.
# Only the post-rename wording is emitted; the stale "Ratio of missing values" string that
# sat next to it would otherwise have been printed as well.
function Base.showerror(io::IO, err::ThresholdError)
    println(
        io,
        "ThresholdError: Missing data limit exceeded $(err.limit) ($(err.value))",
    )
end

"""
Threshold(; ratio=0.1, weights=nothing)
Threshold(; limit=0.1)
Assert that the ratio of missing values in the provided dataset does not exceed to specified ratio.
If a weights array is provided then the ratio will be calculated as the
`sum(weights[ismissing.(data)]) / sum(weights)`
Assert that the ratio of missing values in the provided dataset does not exceed the
specified limit.
# Keyword Arguments
* `ratio::Real`: Allowed proportion of missing values (should be between 0.0 and 1.0).
* `weights::AbstractWeights`: A set of statistical weights to use when evaluating the importance
of each observation. If present a weighted ratio of missing values will be calculated.
* `limit::Real`: Allowed proportion of missing values (should be between 0.0 and 1.0).
"""
struct Threshold <: Validator
    limit::Float64  # maximum allowed proportion of missing values
    # NOTE(review): `weights` is still accepted and stored here; confirm whether it should
    # remain now that a separate `WeightedThreshold` validator exists.
    weights::Union{AbstractWeights, Nothing}
end

# Keyword constructor. `limit` defaults to 0.1 and `weights` to `nothing`. The field was
# previously named `ratio`; only the `limit` spelling is defined so the two-argument call
# below matches the two-field struct.
Threshold(; limit=0.1, weights=nothing) = Threshold(limit, weights)

# Check the plain (unweighted) proportion of missing entries in `data` against `t.limit`.
# Throws a `ThresholdError` carrying the limit and the observed proportion when the limit
# is exceeded; otherwise returns `data` unchanged. `t.weights` is not consulted by this
# method.
function _validate(data::AbstractArray{Union{T, Missing}}, t::Threshold) where T
    mratio = count(ismissing, data) / length(data)
    mratio > t.limit && throw(ThresholdError(t.limit, mratio))
    return data
end

"""
WeightedThreshold(; limit, weights)
Assert that the weighted proportion of missing values in the provided dataset does not exceed
the specified limit. The weighted proportion is calculated as
`sum(weights[ismissing.(data)]) / sum(weights)`
# Keyword Arguments
* `limit::Real`: Allowed proportion of missing values (should be between 0.0 and 1.0).
* `weights::AbstractArray{<:Real}`: A set of statistical weights (e.g. `AbstractWeights`) to use
when evaluating the importance of each observation. Must have the same size as the data.
"""
struct WeightedThreshold{W <: AbstractArray{<:Real}} <: Validator
    limit::Float64  # maximum allowed weighted share of missing values
    weights::W      # per-observation weights; validation requires size(weights) == size(data)
end

# Keyword constructor; both keywords are required (neither has a default).
function WeightedThreshold(; limit, weights)
    return WeightedThreshold(limit, weights)
end

# Check the weighted proportion of missing entries in `data` against `wt.limit`.
#
# The weights must have exactly the same size as `data`, otherwise a `DimensionMismatch`
# is thrown. The weighted proportion is the sum of the weights at missing positions divided
# by the sum of all weights. A `ThresholdError` carrying the limit and the observed value is
# thrown when the limit is exceeded; otherwise `data` is returned unchanged.
function _validate(data::AbstractArray{Union{T, Missing}}, wt::WeightedThreshold) where T
    if size(data) != size(wt.weights)
        throw(DimensionMismatch(string(
            "Input has dimensions $(size(data)), but thresholds weights ",
            "has dimensions $(size(wt.weights))"
        )))
    end

    # Boolean mask indexing selects the weights that line up with missing observations.
    val = sum(wt.weights[ismissing.(data)]) / sum(wt.weights)
    val > wt.limit && throw(ThresholdError(wt.limit, val))

    return data
end
2 changes: 2 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,15 @@ using Impute:
SVD,
Filter,
Threshold,
WeightedThreshold,
ThresholdError,
apply,
impute,
impute!,
interp,
run,
threshold,
wthreshold,
validate


Expand Down
20 changes: 10 additions & 10 deletions test/validators.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
table.sin[[2, 3, 7, 12, 19]] .= missing

@testset "Base" begin
t = Threshold(; ratio=0.1)
t = Threshold(; limit=0.1)
@test_throws ThresholdError validate(a, t)
@test_throws ThresholdError validate(m, t)
@test_throws ThresholdError validate(aa, t)
Expand All @@ -34,9 +34,9 @@
sprint(showerror, e)
end

@test msg == "ThresholdError: Ratio of missing values exceeded 0.1 (0.15)\n"
@test msg == "ThresholdError: Missing data limit exceeded 0.1 (0.15)\n"

t = Threshold(; ratio=0.8)
t = Threshold(; limit=0.8)
# Use isequal because we expect the results to contain missings
@test isequal(validate(a, t), a)
@test isequal(validate(m, t), m)
Expand All @@ -47,20 +47,20 @@
@testset "Weighted" begin
# If we use an exponentially weighted context then we won't pass the limit
# because missing earlier observations is less important than later ones.
t = Threshold(; ratio=0.8, weights=eweights(20, 0.3))
t = WeightedThreshold(; limit=0.8, weights=eweights(20, 0.3))
@test isequal(validate(a, t), a)
@test isequal(validate(table, t), table)

@test isequal(threshold(m; ratio=0.8, weights=eweights(5, 0.3), dims=:cols), m)
@test isequal(threshold(m; ratio=0.8, weights=eweights(5, 0.3), dims=:cols), aa)
@test isequal(wthreshold(m; limit=0.8, weights=eweights(5, 0.3), dims=:cols), m)
@test isequal(wthreshold(m; limit=0.8, weights=eweights(5, 0.3), dims=:cols), aa)

# If we reverse the weights such that earlier observations are more important
# then our previous limit of 0.2 won't be enough to succeed.
t = Threshold(; ratio=0.1, weights=reverse!(eweights(20, 0.3)))
t = WeightedThreshold(; limit=0.1, weights=reverse!(eweights(20, 0.3)))
@test_throws ThresholdError validate(a, t)
@test_throws ThresholdError validate(table, t)

t = Threshold(; ratio=0.1, weights=reverse!(eweights(5, 0.3)))
t = WeightedThreshold(; limit=0.1, weights=reverse!(eweights(5, 0.3)))
@test_throws ThresholdError validate(m, t; dims=:cols)
@test_throws ThresholdError validate(aa, t; dims=:cols)

Expand All @@ -69,8 +69,8 @@
end

@testset "functional" begin
    # The functional API takes the `limit` keyword; the stale `ratio=` calls are dropped.
    @test_throws ThresholdError Impute.threshold(a; limit=0.1)
    # Use isequal because we expect the results to contain missings
    @test isequal(Impute.threshold(a; limit=0.8), a)
end
end

0 comments on commit c56c03e

Please sign in to comment.