JuliaStats · pdeffebach · Apr 27, 2020 · Apr 27, 2020 · Apr 28, 2020 · Apr 28, 2020
diff --git a/Project.toml b/Project.toml
@@ -1,5 +1,4 @@
 name = "Statistics"
-uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 
 [deps]
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"

diff --git a/src/Statistics.jl b/src/Statistics.jl
@@ -479,6 +479,33 @@ end
 _vmean(x::AbstractVector, vardim::Int) = mean(x)
 _vmean(x::AbstractMatrix, vardim::Int) = mean(x, dims=vardim)
 
+_lazycollect(x::Any) = collect(x)
+_lazycollect(x::AbstractVector) = x
+
+function _matrix_error(x, y, fun)
+    if x isa AbstractMatrix
+        s = "$(fun)(x::AbstractMatrix, y::Any) is currently not allowed. " * 
+            "Use $(fun)(x, collect(y)) instead"
+        throw(ArgumentError(s))
+    elseif y isa AbstractMatrix
+        s = "$(fun)(x::Any, y::AbstractMatrix) is currently not allowed. " * 
+            "Use $(fun)(collect(x), y) instead"
+       throw(ArgumentError(s))
+    end
+end
+
+function _matrix_error(x, mx, y, my, fun)
+    if x isa AbstractMatrix || y isa AbstractMatrix
+       s = "$(fun)(x::$(typeof(x)), mx, y::Any, my) is currently not allowed. " * 
+           "Use $(fun)(x, mx, collect(y), my) instead"
+       throw(ArgumentError(s))
+    elseif y isa AbstractMatrix
+       s = "$(fun)(x::Any, mx, y::$(typeof(y)), my) is currently not allowed. " * 
+           "Use $(fun)(collect(x), mx, y, my) inistead."
+       throw(ArgumentError(s))
+    end
+end
+
 # core functions
 
 unscaled_covzm(x::AbstractVector{<:Number})    = sum(abs2, x)
@@ -495,6 +522,7 @@ unscaled_covzm(x::AbstractMatrix, y::AbstractMatrix, vardim::Int) =
 
 # covzm (with centered data)
 
+covzm(itr::Any; corrected::Bool = true) = covzm(collect(itr); corrected = corrected)
 covzm(x::AbstractVector; corrected::Bool=true) = unscaled_covzm(x) / (length(x) - Int(corrected))
 function covzm(x::AbstractMatrix, vardim::Int=1; corrected::Bool=true)
     C = unscaled_covzm(x, vardim)
@@ -504,6 +532,10 @@ function covzm(x::AbstractMatrix, vardim::Int=1; corrected::Bool=true)
     A .= A .* b
     return A
 end
+function covzm(x::Any, y::Any; corrected::Bool = true)
+    _matrix_error(x, y, covzm)
+    covzm(_lazycollect(x), _lazycollect(y); corrected = corrected)
+end
 covzm(x::AbstractVector, y::AbstractVector; corrected::Bool=true) =
     unscaled_covzm(x, y) / (length(x) - Int(corrected))
 function covzm(x::AbstractVecOrMat, y::AbstractVecOrMat, vardim::Int=1; corrected::Bool=true)
@@ -518,22 +550,34 @@ end
 # covm (with provided mean)
 ## Use map(t -> t - xmean, x) instead of x .- xmean to allow for Vector{Vector}
 ## which can't be handled by broadcast
+covm(itr::Any, itrmean; corrected::Bool=true) = 
+    covm(collect(itr), itrmean; corrected=corrected)
 covm(x::AbstractVector, xmean; corrected::Bool=true) =
     covzm(map(t -> t - xmean, x); corrected=corrected)
 covm(x::AbstractMatrix, xmean, vardim::Int=1; corrected::Bool=true) =
     covzm(x .- xmean, vardim; corrected=corrected)
+function covm(x::Any, xmean, y::Any, ymean; corrected::Bool=true)
+    _matrix_error(x, xmean, y, ymean, covm)
+    covzm(map(t -> t - xmean, x), map(t -> t - ymean, y); corrected=corrected)
+end
 covm(x::AbstractVector, xmean, y::AbstractVector, ymean; corrected::Bool=true) =
     covzm(map(t -> t - xmean, x), map(t -> t - ymean, y); corrected=corrected)
 covm(x::AbstractVecOrMat, xmean, y::AbstractVecOrMat, ymean, vardim::Int=1; corrected::Bool=true) =
     covzm(x .- xmean, y .- ymean, vardim; corrected=corrected)
 
 # cov (API)
 """
-    cov(x::AbstractVector; corrected::Bool=true)
+    cov(itr::Any; corrected::Bool=true)
 
-Compute the variance of the vector `x`. If `corrected` is `true` (the default) then the sum
-is scaled with `n-1`, whereas the sum is scaled with `n` if `corrected` is `false` where `n = length(x)`.
+Compute the variance of the iterator `itr`. If `corrected` is `true` (the default) then the sum
+is scaled with `n-1`, whereas the sum is scaled with `n` if `corrected` is `false` where 
+``n`` is the number of elements.
 """
+function cov(itr::Any; corrected::Bool=true)
+    x = collect(itr)
+    meanx = mean(x)
+    covzm(map!(t -> t - meanx, x, x); corrected=corrected)
+end
 cov(x::AbstractVector; corrected::Bool=true) = covm(x, mean(x); corrected=corrected)
 
 """
@@ -547,13 +591,23 @@ cov(X::AbstractMatrix; dims::Int=1, corrected::Bool=true) =
     covm(X, _vmean(X, dims), dims; corrected=corrected)
 
 """
-    cov(x::AbstractVector, y::AbstractVector; corrected::Bool=true)
+    cov(x::Any, y::Any; corrected::Bool=true)
 
-Compute the covariance between the vectors `x` and `y`. If `corrected` is `true` (the
+Compute the covariance between the iterators `x` and `y`. If `corrected` is `true` (the
 default), computes ``\\frac{1}{n-1}\\sum_{i=1}^n (x_i-\\bar x) (y_i-\\bar y)^*`` where
-``*`` denotes the complex conjugate and `n = length(x) = length(y)`. If `corrected` is
+``*`` denotes the complex conjugate and ``n`` the number of elements. If `corrected` is
 `false`, computes ``\\frac{1}{n}\\sum_{i=1}^n (x_i-\\bar x) (y_i-\\bar y)^*``.
 """
+function cov(x::Any, y::Any; corrected::Bool=true)
+    _matrix_error(x, y, cov)
+    cx = collect(x)
+    cy = collect(y)
+    meanx = _vmean(cx, 1)
+    meany = _vmean(cy, 1)
+    dx = map!(t -> t - meanx, cx, cx)
+    dy = map!(t -> t - meany, cy, cy)
+    covzm(dx, dy; corrected=corrected)
+end
 cov(x::AbstractVector, y::AbstractVector; corrected::Bool=true) =
     covm(x, mean(x), y, mean(y); corrected=corrected)
 
@@ -629,8 +683,19 @@ function cov2cor!(C::AbstractMatrix, xsd::AbstractArray, ysd::AbstractArray)
     return C
 end
 
+function _return_one(itr)
+    if Base.IteratorEltype(itr) isa Base.HasEltype && isconcrete(eltype(itr))
+        return one(real(eltype(itr)))
+    else
+        return one(real(eltype(collect(itr))))
+    end
+end
+
 # corzm (non-exported, with centered data)
 
+function corzm(itr::Any) 
+    _return_one(itr)
+end
 corzm(x::AbstractVector{T}) where {T} = one(real(T))
 function corzm(x::AbstractMatrix, vardim::Int=1)
     c = unscaled_covzm(x, vardim)
@@ -645,8 +710,15 @@ corzm(x::AbstractMatrix, y::AbstractMatrix, vardim::Int=1) =
 
 # corm
 
+function corm(itr::Any, itrmean) 
+    _return_one(itr)
+end
 corm(x::AbstractVector{T}, xmean) where {T} = one(real(T))
 corm(x::AbstractMatrix, xmean, vardim::Int=1) = corzm(x .- xmean, vardim)
+function corm(x::Any, mx, y::Any, my) 
+    _matrix_error(x, mx, y, my, corm)
+    corm(_lazycollect(x), mx, _lazycollect(y), my)
+end
 function corm(x::AbstractVector, mx, y::AbstractVector, my)
     require_one_based_indexing(x, y)
     n = length(x)
@@ -675,10 +747,13 @@ corm(x::AbstractVecOrMat, xmean, y::AbstractVecOrMat, ymean, vardim::Int=1) =
 
 # cor
 """
-    cor(x::AbstractVector)
+    cor(itr::Any)
 
 Return the number one.
 """
+function cor(itr::Any)
+    _return_one(itr)
+end
 cor(x::AbstractVector) = one(real(eltype(x)))
 
 """
@@ -688,6 +763,19 @@ Compute the Pearson correlation matrix of the matrix `X` along the dimension `di
 """
 cor(X::AbstractMatrix; dims::Int=1) = corm(X, _vmean(X, dims), dims)
 
+"""
+    cor(x::Any, y::Any)
+
+Compute the Pearson correlation between iterators `x` and `y`.
+"""
+function cor(x::Any, y::Any)
+    _matrix_error(x, y, cor)
+    cx = _lazycollect(x)
+    cy = _lazycollect(y)
+
+    corm(cx, mean(cx), cy, mean(cy))
+end
+
 """
     cor(x::AbstractVector, y::AbstractVector)
 

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -339,12 +339,17 @@ Y = [6.0  2.0;
             x1 = vec(X[1,:])
             y1 = vec(Y[1,:])
         end
+        x1_itr = (x1i for x1i in x1)
+        y1_itr = skipmissing(y1)
 
         c = zm ? Statistics.covm(x1, 0, corrected=cr) :
                  cov(x1, corrected=cr)
+        c_itr = zm ? Statistics.covm(x1_itr, 0, corrected=cr) :
+                     cov(x1_itr, corrected=cr)
         @test isa(c, Float64)
-        @test c ≈ Cxx[1,1]
+        @test c == c_itr == Cxx[1,1]
         @inferred cov(x1, corrected=cr)
+        @inferred cov(x1_itr, corrected=cr)
 
         @test cov(X) == Statistics.covm(X, mean(X, dims=1))
         C = zm ? Statistics.covm(X, 0, vd, corrected=cr) :
@@ -356,9 +361,16 @@ Y = [6.0  2.0;
         @test cov(x1, y1) == Statistics.covm(x1, mean(x1), y1, mean(y1))
         c = zm ? Statistics.covm(x1, 0, y1, 0, corrected=cr) :
                  cov(x1, y1, corrected=cr)
+        c_itr = zm ? Statistics.covm(x1_itr, 0, y1_itr, 0, corrected=cr) :
+                     cov(x1_itr, y1_itr, corrected=cr)
+        c_itrx = zm ? Statistics.covm(x1_itr, 0, y1, 0, corrected=cr) :
+                     cov(x1_itr, y1, corrected=cr)
+        c_itry = zm ? Statistics.covm(x1, 0, y1_itr, 0, corrected=cr) :
+                     cov(x1, y1_itr, corrected=cr)
         @test isa(c, Float64)
-        @test c ≈ Cxy[1,1]
+        @test c == c_itr == c_itrx == c_itry == Cxy[1,1]
         @inferred cov(x1, y1, corrected=cr)
+        @inferred cov(x1_itr, y1_itr, corrected=cr)
 
         if vd == 1
             @test cov(x1, Y) == Statistics.covm(x1, mean(x1), Y, mean(Y, dims=1))
@@ -386,6 +398,17 @@ Y = [6.0  2.0;
         @inferred cov(X, Y, dims=vd, corrected=cr)
     end
 
+    @testset "errors for `cov` with non-array iterators and matrices" begin
+        x1_itr = (xi for xi in X[:, 1])
+        y1_itr = skipmissing(Y[:, 1])
+        @test_throws ArgumentError Statistics.covzm(X, y1_itr)
+        @test_throws ArgumentError Statistics.covzm(x1_itr, Y)
+        @test_throws ArgumentError Statistics.covm(X, mean(X, dims = 1), y1_itr, mean(y1_itr))
+        @test_throws ArgumentError Statistics.covm(x1_itr, mean(x1_itr), Y, mean(Y, dims = 1))        
+        @test_throws ArgumentError cov(X, y1_itr)
+        @test_throws ArgumentError cov(x1_itr, Y)
+    end
+
     @testset "floating point accuracy for `cov` of large numbers" begin
         A = [4.0, 7.0, 13.0, 16.0]
         C = A .+ 1.0e10
@@ -426,11 +449,15 @@ end
             x1 = vec(X[1,:])
             y1 = vec(Y[1,:])
         end
+        x1_itr = (x1i for x1i in x1)
+        y1_itr = skipmissing(y1)
 
         c = zm ? Statistics.corm(x1, 0) : cor(x1)
+        c_itr = zm ? Statistics.corm(x1_itr, 0) : cor(x1_itr)
         @test isa(c, Float64)
-        @test c ≈ Cxx[1,1]
+        @test c ≈ c_itr ≈ Cxx[1,1]
         @inferred cor(x1)
+        @inferred cor(x1_itr)
 
         @test cor(X) == Statistics.corm(X, mean(X, dims=1))
         C = zm ? Statistics.corm(X, 0, vd) : cor(X, dims=vd)
@@ -440,9 +467,14 @@ end
 
         @test cor(x1, y1) == Statistics.corm(x1, mean(x1), y1, mean(y1))
         c = zm ? Statistics.corm(x1, 0, y1, 0) : cor(x1, y1)
+        c_itr = zm ? Statistics.corm(x1_itr, 0, y1_itr, 0) : cor(x1_itr, y1_itr)
+        c_itrx = zm ? Statistics.corm(x1_itr, 0, y1, 0) : cor(x1_itr, y1)
+        c_itry = zm ? Statistics.corm(x1, 0, y1_itr, 0) : cor(x1, y1_itr)
+
         @test isa(c, Float64)
-        @test c ≈ Cxy[1,1]
+        @test c == c_itr == c_itrx == c_itry ≈ Cxy[1,1]
         @inferred cor(x1, y1)
+        @inferred cor(x1_itr, y1_itr)
 
         if vd == 1
             @test cor(x1, Y) == Statistics.corm(x1, mean(x1), Y, mean(Y, dims=1))
@@ -477,6 +509,15 @@ end
         @test cor(tmp, tmp) <= 1.0
         @test cor(tmp, tmp2) <= 1.0
     end
+
+    @testset "errors for `cor` with non-array iterators and matrices" begin
+        x1_itr = (xi for xi in X[:, 1])
+        y1_itr = skipmissing(Y[:, 1])        
+        @test_throws ArgumentError Statistics.corm(X, mean(X, dims = 1), y1_itr, mean(y1_itr))
+        @test_throws ArgumentError Statistics.corm(x1_itr, mean(x1_itr), Y, mean(Y, dims = 1))        
+        @test_throws ArgumentError cor(X, y1_itr)
+        @test_throws ArgumentError cor(x1_itr, Y)
+    end
 end
 
 @testset "quantile" begin