diff --git a/src/Statistics.jl b/src/Statistics.jl index 977ae8a6..e2cb52ba 100644 --- a/src/Statistics.jl +++ b/src/Statistics.jl @@ -479,6 +479,17 @@ end _vmean(x::AbstractVector, vardim::Int) = mean(x) _vmean(x::AbstractMatrix, vardim::Int) = mean(x, dims=vardim) +_collect_if_itr(x::Any) = collect(x) +_collect_if_itr(x::AbstractVector) = x + +function _matrix_error(x, y) + if !(x isa AbstractVector || y isa AbstractVector) && (x isa AbstractArray || y isa AbstractArray) + s = "Covariance and correlation between a non-vector array and a non-vector iterator" * + "is currently disallowed. `collect` one of the arguments." + throw(ArgumentError(s)) + end +end + # core functions unscaled_covzm(x::AbstractVector{<:Number}) = sum(abs2, x) @@ -495,6 +506,7 @@ unscaled_covzm(x::AbstractMatrix, y::AbstractMatrix, vardim::Int) = # covzm (with centered data) +covzm(itr::Any; corrected::Bool = true) = covzm(collect(itr); corrected = corrected) covzm(x::AbstractVector; corrected::Bool=true) = unscaled_covzm(x) / (length(x) - Int(corrected)) function covzm(x::AbstractMatrix, vardim::Int=1; corrected::Bool=true) C = unscaled_covzm(x, vardim) @@ -504,6 +516,10 @@ function covzm(x::AbstractMatrix, vardim::Int=1; corrected::Bool=true) A .= A .* b return A end +function covzm(x::Any, y::Any; corrected::Bool = true) + _matrix_error(x, y) + covzm(_collect_if_itr(x), _collect_if_itr(y); corrected = corrected) +end covzm(x::AbstractVector, y::AbstractVector; corrected::Bool=true) = unscaled_covzm(x, y) / (length(x) - Int(corrected)) function covzm(x::AbstractVecOrMat, y::AbstractVecOrMat, vardim::Int=1; corrected::Bool=true) @@ -518,10 +534,16 @@ end # covm (with provided mean) ## Use map(t -> t - xmean, x) instead of x .- xmean to allow for Vector{Vector} ## which can't be handled by broadcast +covm(itr::Any, itrmean; corrected::Bool=true) = + covm(collect(itr), itrmean; corrected=corrected) covm(x::AbstractVector, xmean; corrected::Bool=true) = covzm(map(t -> t - xmean, x); corrected=corrected) covm(x::AbstractMatrix, xmean, vardim::Int=1; corrected::Bool=true) = covzm(x .- xmean, vardim; corrected=corrected) +function covm(x::Any, xmean, y::Any, ymean; corrected::Bool=true) + _matrix_error(x, y) + covzm(map(t -> t - xmean, x), map(t -> t - ymean, y); corrected=corrected) +end covm(x::AbstractVector, xmean, y::AbstractVector, ymean; corrected::Bool=true) = covzm(map(t -> t - xmean, x), map(t -> t - ymean, y); corrected=corrected) covm(x::AbstractVecOrMat, xmean, y::AbstractVecOrMat, ymean, vardim::Int=1; corrected::Bool=true) = @@ -529,11 +551,17 @@ covm(x::AbstractVecOrMat, xmean, y::AbstractVecOrMat, ymean, vardim::Int=1; corr # cov (API) """ - cov(x::AbstractVector; corrected::Bool=true) + cov(itr::Any; corrected::Bool=true) -Compute the variance of the vector `x`. If `corrected` is `true` (the default) then the sum -is scaled with `n-1`, whereas the sum is scaled with `n` if `corrected` is `false` where `n = length(x)`. +Compute the variance of the iterator `itr`. If `corrected` is `true` (the default) then the sum +is scaled with `n-1`, whereas the sum is scaled with `n` if `corrected` is `false` where +``n`` is the number of elements. """ +function cov(itr::Any; corrected::Bool=true) + x = collect(itr) + meanx = mean(x) + covzm(map!(t -> t - meanx, x, x); corrected=corrected) +end cov(x::AbstractVector; corrected::Bool=true) = covm(x, mean(x); corrected=corrected) """ @@ -547,13 +575,23 @@ cov(X::AbstractMatrix; dims::Int=1, corrected::Bool=true) = covm(X, _vmean(X, dims), dims; corrected=corrected) """ - cov(x::AbstractVector, y::AbstractVector; corrected::Bool=true) + cov(x::Any, y::Any; corrected::Bool=true) -Compute the covariance between the vectors `x` and `y`. If `corrected` is `true` (the +Compute the covariance between the iterators `x` and `y`. If `corrected` is `true` (the default), computes ``\\frac{1}{n-1}\\sum_{i=1}^n (x_i-\\bar x) (y_i-\\bar y)^*`` where -``*`` denotes the complex conjugate and `n = length(x) = length(y)`. If `corrected` is +``*`` denotes the complex conjugate and ``n`` the number of elements. If `corrected` is `false`, computes ``\\frac{1}{n}\\sum_{i=1}^n (x_i-\\bar x) (y_i-\\bar y)^*``. """ +function cov(x::Any, y::Any; corrected::Bool=true) + _matrix_error(x, y) + cx = collect(x) + cy = collect(y) + meanx = mean(cx) + meany = mean(cy) + dx = map!(t -> t - meanx, cx, cx) + dy = map!(t -> t - meany, cy, cy) + covzm(dx, dy; corrected=corrected) +end cov(x::AbstractVector, y::AbstractVector; corrected::Bool=true) = covm(x, mean(x), y, mean(y); corrected=corrected) @@ -629,8 +667,17 @@ function cov2cor!(C::AbstractMatrix, xsd::AbstractArray, ysd::AbstractArray) return C end +function _return_one(itr) + if Base.IteratorEltype(itr) isa Base.HasEltype && isconcrete(eltype(itr)) + return one(real(eltype(itr))) + else + return one(real(eltype(collect(itr)))) + end +end + # corzm (non-exported, with centered data) +corzm(itr::Any) = _return_one(itr) corzm(x::AbstractVector{T}) where {T} = one(real(T)) function corzm(x::AbstractMatrix, vardim::Int=1) c = unscaled_covzm(x, vardim) @@ -645,8 +692,13 @@ corzm(x::AbstractMatrix, y::AbstractMatrix, vardim::Int=1) = # corm +corm(itr::Any, itrmean) = _return_one(itr) corm(x::AbstractVector{T}, xmean) where {T} = one(real(T)) corm(x::AbstractMatrix, xmean, vardim::Int=1) = corzm(x .- xmean, vardim) +function corm(x::Any, mx, y::Any, my) + _matrix_error(x, y) + corm(_collect_if_itr(x), mx, _collect_if_itr(y), my) +end function corm(x::AbstractVector, mx, y::AbstractVector, my) require_one_based_indexing(x, y) n = length(x) @@ -675,10 +727,11 @@ corm(x::AbstractVecOrMat, xmean, y::AbstractVecOrMat, ymean, vardim::Int=1) = # cor """ - cor(x::AbstractVector) + cor(itr::Any) Return the number one. """ +cor(itr::Any) = _return_one(itr) cor(x::AbstractVector) = one(real(eltype(x))) """ @@ -688,6 +741,19 @@ Compute the Pearson correlation matrix of the matrix `X` along the dimension `di """ cor(X::AbstractMatrix; dims::Int=1) = corm(X, _vmean(X, dims), dims) +""" + cor(x::Any, y::Any) + +Compute the Pearson correlation between iterators `x` and `y`. +""" +function cor(x::Any, y::Any) + _matrix_error(x, y) + cx = _collect_if_itr(x) + cy = _collect_if_itr(y) + + corm(cx, mean(cx), cy, mean(cy)) +end + """ cor(x::AbstractVector, y::AbstractVector) diff --git a/test/runtests.jl b/test/runtests.jl index bc33cf57..19d24288 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -339,12 +339,17 @@ Y = [6.0 2.0; x1 = vec(X[1,:]) y1 = vec(Y[1,:]) end + x1_itr = (x1i for x1i in x1) + y1_itr = skipmissing(y1) c = zm ? Statistics.covm(x1, 0, corrected=cr) : cov(x1, corrected=cr) + c_itr = zm ? Statistics.covm(x1_itr, 0, corrected=cr) : + cov(x1_itr, corrected=cr) @test isa(c, Float64) - @test c ≈ Cxx[1,1] + @test c == c_itr == Cxx[1,1] @inferred cov(x1, corrected=cr) + @inferred cov(x1_itr, corrected=cr) @test cov(X) == Statistics.covm(X, mean(X, dims=1)) C = zm ? Statistics.covm(X, 0, vd, corrected=cr) : @@ -356,9 +361,16 @@ Y = [6.0 2.0; @test cov(x1, y1) == Statistics.covm(x1, mean(x1), y1, mean(y1)) c = zm ? Statistics.covm(x1, 0, y1, 0, corrected=cr) : cov(x1, y1, corrected=cr) + c_itr = zm ? Statistics.covm(x1_itr, 0, y1_itr, 0, corrected=cr) : + cov(x1_itr, y1_itr, corrected=cr) + c_itrx = zm ? Statistics.covm(x1_itr, 0, y1, 0, corrected=cr) : + cov(x1_itr, y1, corrected=cr) + c_itry = zm ? Statistics.covm(x1, 0, y1_itr, 0, corrected=cr) : + cov(x1, y1_itr, corrected=cr) @test isa(c, Float64) - @test c ≈ Cxy[1,1] + @test c == c_itr == c_itrx == c_itry == Cxy[1,1] @inferred cov(x1, y1, corrected=cr) + @inferred cov(x1_itr, y1_itr, corrected=cr) if vd == 1 @test cov(x1, Y) == Statistics.covm(x1, mean(x1), Y, mean(Y, dims=1)) @@ -386,6 +398,17 @@ Y = [6.0 2.0; @inferred cov(X, Y, dims=vd, corrected=cr) end + @testset "errors for `cov` with non-array iterators and matrices" begin + x1_itr = (xi for xi in X[:, 1]) + y1_itr = skipmissing(Y[:, 1]) + @test_throws ArgumentError Statistics.covzm(X, y1_itr) + @test_throws ArgumentError Statistics.covzm(x1_itr, Y) + @test_throws ArgumentError Statistics.covm(X, mean(X, dims = 1), y1_itr, mean(y1_itr)) + @test_throws ArgumentError Statistics.covm(x1_itr, mean(x1_itr), Y, mean(Y, dims = 1)) + @test_throws ArgumentError cov(X, y1_itr) + @test_throws ArgumentError cov(x1_itr, Y) + end + @testset "floating point accuracy for `cov` of large numbers" begin A = [4.0, 7.0, 13.0, 16.0] C = A .+ 1.0e10 @@ -426,11 +449,15 @@ end x1 = vec(X[1,:]) y1 = vec(Y[1,:]) end + x1_itr = (x1i for x1i in x1) + y1_itr = skipmissing(y1) c = zm ? Statistics.corm(x1, 0) : cor(x1) + c_itr = zm ? Statistics.corm(x1_itr, 0) : cor(x1_itr) @test isa(c, Float64) - @test c ≈ Cxx[1,1] + @test c ≈ c_itr ≈ Cxx[1,1] @inferred cor(x1) + @inferred cor(x1_itr) @test cor(X) == Statistics.corm(X, mean(X, dims=1)) C = zm ? Statistics.corm(X, 0, vd) : cor(X, dims=vd) @@ -440,9 +467,14 @@ end @test cor(x1, y1) == Statistics.corm(x1, mean(x1), y1, mean(y1)) c = zm ? Statistics.corm(x1, 0, y1, 0) : cor(x1, y1) + c_itr = zm ? Statistics.corm(x1_itr, 0, y1_itr, 0) : cor(x1_itr, y1_itr) + c_itrx = zm ? Statistics.corm(x1_itr, 0, y1, 0) : cor(x1_itr, y1) + c_itry = zm ? Statistics.corm(x1, 0, y1_itr, 0) : cor(x1, y1_itr) + @test isa(c, Float64) - @test c ≈ Cxy[1,1] + @test c == c_itr == c_itrx == c_itry ≈ Cxy[1,1] @inferred cor(x1, y1) + @inferred cor(x1_itr, y1_itr) if vd == 1 @test cor(x1, Y) == Statistics.corm(x1, mean(x1), Y, mean(Y, dims=1)) @@ -477,6 +509,15 @@ end @test cor(tmp, tmp) <= 1.0 @test cor(tmp, tmp2) <= 1.0 end + + @testset "errors for `cor` with non-array iterators and matrices" begin + x1_itr = (xi for xi in X[:, 1]) + y1_itr = skipmissing(Y[:, 1]) + @test_throws ArgumentError Statistics.corm(X, mean(X, dims = 1), y1_itr, mean(y1_itr)) + @test_throws ArgumentError Statistics.corm(x1_itr, mean(x1_itr), Y, mean(Y, dims = 1)) + @test_throws ArgumentError cor(X, y1_itr) + @test_throws ArgumentError cor(x1_itr, Y) + end end @testset "quantile" begin