From c56c03e5b79b093c184bf7e74a55e794276c54fc Mon Sep 17 00:00:00 2001
From: rofinn <rory.finnegan@gmail.com>
Date: Wed, 21 Oct 2020 13:01:57 -0500
Subject: [PATCH] Separate Threshold and WeightedThreshold closing #74

---
 src/functional.jl           | 48 ++++++++++++++++++++++++++---
 src/validators/threshold.jl | 61 ++++++++++++++++++++++++-------------
 test/runtests.jl            |  2 ++
 test/validators.jl          | 20 ++++++------
 4 files changed, 95 insertions(+), 36 deletions(-)

diff --git a/src/functional.jl b/src/functional.jl
index c8d0018..45e8f9a 100644
--- a/src/functional.jl
+++ b/src/functional.jl
@@ -37,6 +37,7 @@ end
 
 const global validation_methods = (
     threshold = Threshold,
+    wthreshold = WeightedThreshold,
 )
 
 const global imputation_methods = (
@@ -90,9 +91,9 @@ filter(f::Function, data; kwargs...) = apply(data, Filter(f); kwargs...)
 filter!(f::Function, data; kwargs...) = apply!(data, Filter(f); kwargs...)
 
 @doc """
-    Impute.threshold(data; ratio=0.1, weights=nothing, kwargs...)
+    Impute.threshold(data; limit=0.1, kwargs...)
 
-Assert that proportion of missing values in the `data` do not exceed the `ratio`.
+Assert that the proportion of missing values in the `data` does not exceed the `limit`.
 
 # Examples
 ```julia-repl
@@ -110,11 +111,11 @@ julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2,
 │ 5   │ 5.0      │ 5.5      │
 
 julia> Impute.threshold(df)
-ERROR: ThresholdError: Ratio of missing values exceeded 0.1 (0.4)
+ERROR: ThresholdError: Missing data limit exceeded 0.1 (0.4)
 Stacktrace:
 ...
 
-julia> Impute.threshold(df; ratio=0.8)
+julia> Impute.threshold(df; limit=0.8)
 5×2 DataFrames.DataFrame
 │ Row │ a        │ b        │
 │     │ Float64  │ Float64  │
@@ -128,6 +129,45 @@ julia> Impute.threshold(df; ratio=0.8)
 """
 threshold
 
+@doc """
+    Impute.wthreshold(data; limit, weights, kwargs...)
+
+Assert that the weighted proportion of missing values in the `data` does not exceed the `limit`.
+
+# Examples
+```julia-repl
+julia> using DataFrames, Impute
+
+julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5])
+5×2 DataFrames.DataFrame
+│ Row │ a        │ b        │
+│     │ Float64  │ Float64  │
+├─────┼──────────┼──────────┤
+│ 1   │ 1.0      │ 1.1      │
+│ 2   │ 2.0      │ 2.2      │
+│ 3   │ missing  │ 3.3      │
+│ 4   │ missing  │ missing  │
+│ 5   │ 5.0      │ 5.5      │
+
+julia> Impute.wthreshold(df; limit=0.4, weights=0.1:0.1:0.5)
+ERROR: ThresholdError: Missing data limit exceeded 0.4 (0.4666666666666666)
+Stacktrace:
+...
+
+julia> Impute.wthreshold(df; limit=0.4, weights=0.5:-0.1:0.1)
+5×2 DataFrames.DataFrame
+│ Row │ a        │ b        │
+│     │ Float64  │ Float64  │
+├─────┼──────────┼──────────┤
+│ 1   │ 1.0      │ 1.1      │
+│ 2   │ 2.0      │ 2.2      │
+│ 3   │ missing  │ 3.3      │
+│ 4   │ missing  │ missing  │
+│ 5   │ 5.0      │ 5.5      │
+```
+"""
+wthreshold
+
 @doc """
     Impute.dropobs(data; dims=1)
 
diff --git a/src/validators/threshold.jl b/src/validators/threshold.jl
index 10937dc..d3cd56c 100644
--- a/src/validators/threshold.jl
+++ b/src/validators/threshold.jl
@@ -15,44 +15,61 @@ end
 function Base.showerror(io::IO, err::ThresholdError)
     println(
         io,
-        "ThresholdError: Ratio of missing values exceeded $(err.limit) ($(err.value))",
+        "ThresholdError: Missing data limit exceeded $(err.limit) ($(err.value))",
     )
 end
 
 """
-    Threshold(; ratio=0.1, weights=nothing)
+    Threshold(; limit=0.1)
 
-Assert that the ratio of missing values in the provided dataset does not exceed to specified ratio.
-If a weights array is provided then the ratio will be calculated as the
-`sum(weights[ismissing.(data)]) / sum(weights)`
+Assert that the ratio of missing values in the provided dataset does not exceed the
+specified limit.
 
 # Keyword Arguments
-* `ratio::Real`: Allowed proportion of missing values (should be between 0.0 and 1.0).
-* `weights::AbstractWeights`: A set of statistical weights to use when evaluating the importance
-  of each observation. If present a weighted ratio of missing values will be calculated.
+* `limit::Real`: Allowed proportion of missing values (should be between 0.0 and 1.0).
 """
 struct Threshold <: Validator
-    ratio::Float64
+    limit::Float64
     weights::Union{AbstractWeights, Nothing}
 end
 
-Threshold(; ratio=0.1, weights=nothing) = Threshold(ratio, weights)
+Threshold(; limit=0.1, weights=nothing) = Threshold(limit, weights)
 
 function _validate(data::AbstractArray{Union{T, Missing}}, t::Threshold) where T
-    mratio = if t.weights === nothing
-        count(ismissing, data) / length(data)
-    else
-        if size(data) != size(t.weights)
-            throw(DimensionMismatch(string(
-                "Input has dimensions $(size(data)), but thresholds weights ",
-                "has dimensions $(size(t.weights))"
-            )))
-        end
-
-        sum(t.weights[ismissing.(data)]) / sum(t.weights)
+    mratio = count(ismissing, data) / length(data)
+    mratio > t.limit && throw(ThresholdError(t.limit, mratio))
+    return data
+end
+
+"""
+    WeightedThreshold(; limit, weights)
+
+Assert that the weighted proportion of missing values in the provided dataset does not
+exceed the specified limit. The weighted proportion is calculated as
+`sum(weights[ismissing.(data)]) / sum(weights)`
+
+# Keyword Arguments
+* `limit::Real`: Allowed proportion of missing values (should be between 0.0 and 1.0).
+* `weights::AbstractArray{<:Real}`: A set of statistical weights to use when evaluating the
+  importance of each observation. Must have the same size as the data being validated.
+"""
+struct WeightedThreshold{W <: AbstractArray{<:Real}} <: Validator
+    limit::Float64
+    weights::W
+end
+
+WeightedThreshold(; limit, weights) = WeightedThreshold(limit, weights)
+
+function _validate(data::AbstractArray{Union{T, Missing}}, wt::WeightedThreshold) where T
+    if size(data) != size(wt.weights)
+        throw(DimensionMismatch(string(
+            "Input has dimensions $(size(data)), but thresholds weights ",
+            "has dimensions $(size(wt.weights))"
+        )))
     end
 
-    mratio > t.ratio && throw(ThresholdError(t.ratio, mratio))
+    val = sum(wt.weights[ismissing.(data)]) / sum(wt.weights)
+    val > wt.limit && throw(ThresholdError(wt.limit, val))
 
     return data
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index b54b70f..d83b44e 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -34,6 +34,7 @@ using Impute:
     SVD,
     Filter,
     Threshold,
+    WeightedThreshold,
     ThresholdError,
     apply,
     impute,
@@ -41,6 +42,7 @@ using Impute:
     interp,
     run,
     threshold,
+    wthreshold,
     validate
 
 
diff --git a/test/validators.jl b/test/validators.jl
index ff25d7f..03ac8af 100644
--- a/test/validators.jl
+++ b/test/validators.jl
@@ -21,7 +21,7 @@
     table.sin[[2, 3, 7, 12, 19]] .= missing
 
     @testset "Base" begin
-        t = Threshold(; ratio=0.1)
+        t = Threshold(; limit=0.1)
         @test_throws ThresholdError validate(a, t)
         @test_throws ThresholdError validate(m, t)
         @test_throws ThresholdError validate(aa, t)
@@ -34,9 +34,9 @@
             sprint(showerror, e)
         end
 
-        @test msg == "ThresholdError: Ratio of missing values exceeded 0.1 (0.15)\n"
+        @test msg == "ThresholdError: Missing data limit exceeded 0.1 (0.15)\n"
 
-        t = Threshold(; ratio=0.8)
+        t = Threshold(; limit=0.8)
         # Use isequal because we expect the results to contain missings
         @test isequal(validate(a, t), a)
         @test isequal(validate(m, t), m)
@@ -47,20 +47,20 @@
     @testset "Weighted" begin
         # If we use an exponentially weighted context then we won't pass the limit
         # because missing earlier observations is less important than later ones.
-        t = Threshold(; ratio=0.8, weights=eweights(20, 0.3))
+        t = WeightedThreshold(; limit=0.8, weights=eweights(20, 0.3))
         @test isequal(validate(a, t), a)
         @test isequal(validate(table, t), table)
 
-        @test isequal(threshold(m; ratio=0.8, weights=eweights(5, 0.3), dims=:cols), m)
-        @test isequal(threshold(m; ratio=0.8, weights=eweights(5, 0.3), dims=:cols), aa)
+        @test isequal(wthreshold(m; limit=0.8, weights=eweights(5, 0.3), dims=:cols), m)
+        @test isequal(wthreshold(m; limit=0.8, weights=eweights(5, 0.3), dims=:cols), aa)
 
         # If we reverse the weights such that earlier observations are more important
         # then our previous limit of 0.2 won't be enough to succeed.
-        t = Threshold(; ratio=0.1, weights=reverse!(eweights(20, 0.3)))
+        t = WeightedThreshold(; limit=0.1, weights=reverse!(eweights(20, 0.3)))
         @test_throws ThresholdError validate(a, t)
         @test_throws ThresholdError validate(table, t)
 
-        t = Threshold(; ratio=0.1, weights=reverse!(eweights(5, 0.3)))
+        t = WeightedThreshold(; limit=0.1, weights=reverse!(eweights(5, 0.3)))
         @test_throws ThresholdError validate(m, t; dims=:cols)
         @test_throws ThresholdError validate(aa, t; dims=:cols)
 
@@ -69,8 +69,8 @@
     end
 
     @testset "functional" begin
-        @test_throws ThresholdError Impute.threshold(a; ratio=0.1)
+        @test_throws ThresholdError Impute.threshold(a; limit=0.1)
         # Use isequal because we expect the results to contain missings
-        @test isequal(Impute.threshold(a; ratio=0.8), a)
+        @test isequal(Impute.threshold(a; limit=0.8), a)
     end
 end