Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make covariance and correlation work for iterators, skipmissing in particular. #34

Open
wants to merge 20 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
name = "Statistics"
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Intentional?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. It's a hack to make sure julia knows to load this folder, it's described here for Pkg.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Normally the Travis script does that automatically, so you can revert this: https://github.com/JuliaLang/Statistics.jl/blob/master/.travis.yml#L24

Though you need it to run tests locally.


[deps]
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Expand Down
73 changes: 64 additions & 9 deletions src/Statistics.jl
Original file line number Diff line number Diff line change
Expand Up @@ -495,6 +495,7 @@ unscaled_covzm(x::AbstractMatrix, y::AbstractMatrix, vardim::Int) =

# covzm (with centered data)

nalimilan marked this conversation as resolved.
Show resolved Hide resolved
covzm(itr::Any; corrected::Bool = true) = covzm(collect(itr); corrected = corrected)
covzm(x::AbstractVector; corrected::Bool=true) = unscaled_covzm(x) / (length(x) - Int(corrected))
function covzm(x::AbstractMatrix, vardim::Int=1; corrected::Bool=true)
C = unscaled_covzm(x, vardim)
Expand All @@ -504,6 +505,8 @@ function covzm(x::AbstractMatrix, vardim::Int=1; corrected::Bool=true)
A .= A .* b
return A
end
covzm(x::Any, y::Any; corrected::Bool = true) =
covzm(collect(x), collect(y); corrected = corrected)
covzm(x::AbstractVector, y::AbstractVector; corrected::Bool=true) =
unscaled_covzm(x, y) / (length(x) - Int(corrected))
function covzm(x::AbstractVecOrMat, y::AbstractVecOrMat, vardim::Int=1; corrected::Bool=true)
Expand All @@ -518,22 +521,32 @@ end
# covm (with provided mean)
## Use map(t -> t - xmean, x) instead of x .- xmean to allow for Vector{Vector}
## which can't be handled by broadcast
covm(itr::Any, itrmean; corrected::Bool=true) =
covm(collect(itr), itrmean; corrected=corrected)
covm(x::AbstractVector, xmean; corrected::Bool=true) =
covzm(map(t -> t - xmean, x); corrected=corrected)
covm(x::AbstractMatrix, xmean, vardim::Int=1; corrected::Bool=true) =
covzm(x .- xmean, vardim; corrected=corrected)
covm(x::Any, xmean, y::Any, ymean; corrected::Bool=true) =
covzm(x .- xmean, y .- ymean; corrected=corrected)
covm(x::AbstractVector, xmean, y::AbstractVector, ymean; corrected::Bool=true) =
covzm(map(t -> t - xmean, x), map(t -> t - ymean, y); corrected=corrected)
covm(x::AbstractVecOrMat, xmean, y::AbstractVecOrMat, ymean, vardim::Int=1; corrected::Bool=true) =
covzm(x .- xmean, y .- ymean, vardim; corrected=corrected)

# cov (API)
"""
cov(x::AbstractVector; corrected::Bool=true)
cov(itr::Any; corrected::Bool=true)
pdeffebach marked this conversation as resolved.
Show resolved Hide resolved

Compute the variance of the vector `x`. If `corrected` is `true` (the default) then the sum
is scaled with `n-1`, whereas the sum is scaled with `n` if `corrected` is `false` where `n = length(x)`.
Compute the variance of the iterator `itr`. If `corrected` is `true` (the default) then the sum
is scaled with `n-1`, whereas the sum is scaled with `n` if `corrected` is `false` where
``n`` is the number of elements.
"""
function cov(itr::Any; corrected::Bool=true)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we want to allow 0 or more than 2 dimensional arrays here?

x = collect(itr)
meanx = mean(x)
covzm(map!(t -> t - meanx, x, x); corrected=corrected)
end
cov(x::AbstractVector; corrected::Bool=true) = covm(x, mean(x); corrected=corrected)

"""
Expand All @@ -546,14 +559,24 @@ if `corrected` is `false` where `n = size(X, dims)`.
cov(X::AbstractMatrix; dims::Int=1, corrected::Bool=true) =
covm(X, _vmean(X, dims), dims; corrected=corrected)


pdeffebach marked this conversation as resolved.
Show resolved Hide resolved
"""
cov(x::AbstractVector, y::AbstractVector; corrected::Bool=true)
cov(x::Any, y::Any; corrected::Bool=true)

Compute the covariance between the vectors `x` and `y`. If `corrected` is `true` (the
Compute the covariance between the iterators `x` and `y`. If `corrected` is `true` (the
default), computes ``\\frac{1}{n-1}\\sum_{i=1}^n (x_i-\\bar x) (y_i-\\bar y)^*`` where
``*`` denotes the complex conjugate and `n = length(x) = length(y)`. If `corrected` is
``*`` denotes the complex conjugate and ``n`` the number of elements. If `corrected` is
`false`, computes ``\\frac{1}{n}\\sum_{i=1}^n (x_i-\\bar x) (y_i-\\bar y)^*``.
"""
function cov(x::Any, y::Any; corrected::Bool=true)
cx = collect(x)
cy = collect(y)
meanx = _vmean(cx, 1)
pdeffebach marked this conversation as resolved.
Show resolved Hide resolved
meany = _vmean(cy, 1)
dx = x .- meanx
dy = y .- meany
covzm(dx, dy; corrected=corrected)
end
cov(x::AbstractVector, y::AbstractVector; corrected::Bool=true) =
covm(x, mean(x), y, mean(y); corrected=corrected)

Expand Down Expand Up @@ -630,7 +653,13 @@ function cov2cor!(C::AbstractMatrix, xsd::AbstractArray, ysd::AbstractArray)
end

# corzm (non-exported, with centered data)

function corzm(itr::Any)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you put this code in an internal method which will be called by all functions that need it? It's repeated three times.

Also:

Suggested change
function corzm(itr::Any)
function corzm(itr::Any)

if Base.IteratorEltype(itr) isa Base.HasEltype && isconcrete(eltype(itr))
return one(real(eltype(itr)))
else
return one(real(eltype(collect(itr))))
end
end
corzm(x::AbstractVector{T}) where {T} = one(real(T))
function corzm(x::AbstractMatrix, vardim::Int=1)
c = unscaled_covzm(x, vardim)
Expand All @@ -644,9 +673,16 @@ corzm(x::AbstractMatrix, y::AbstractMatrix, vardim::Int=1) =
cov2cor!(unscaled_covzm(x, y, vardim), sqrt!(sum(abs2, x, dims=vardim)), sqrt!(sum(abs2, y, dims=vardim)))

# corm

function corm(itr::Any, itrmean)
pdeffebach marked this conversation as resolved.
Show resolved Hide resolved
if Base.IteratorEltype(itr) isa Base.HasEltype && isconcrete(eltype(itr))
return one(real(eltype(itr)))
else
return one(real(eltype(collect(itr))))
end
end
corm(x::AbstractVector{T}, xmean) where {T} = one(real(T))
corm(x::AbstractMatrix, xmean, vardim::Int=1) = corzm(x .- xmean, vardim)
corm(x::Any, mx, y::Any, my) = corm(collect(x), mx, collect(y), my)
function corm(x::AbstractVector, mx, y::AbstractVector, my)
require_one_based_indexing(x, y)
n = length(x)
Expand Down Expand Up @@ -675,10 +711,17 @@ corm(x::AbstractVecOrMat, xmean, y::AbstractVecOrMat, ymean, vardim::Int=1) =

# cor
"""
cor(x::AbstractVector)
cor(itr::Any)

Return the number one.
"""
function cor(itr::Any)
if Base.IteratorEltype(itr) isa Base.HasEltype && isconcrete(eltype(itr))
return one(real(eltype(itr)))
else
return one(real(eltype(collect(itr))))
end
end
cor(x::AbstractVector) = one(real(eltype(x)))

"""
Expand All @@ -688,6 +731,18 @@ Compute the Pearson correlation matrix of the matrix `X` along the dimension `di
"""
cor(X::AbstractMatrix; dims::Int=1) = corm(X, _vmean(X, dims), dims)

"""
cor(x::AbstractVector, y::AbstractVector)

Compute the Pearson correlation between the vectors `x` and `y`.
pdeffebach marked this conversation as resolved.
Show resolved Hide resolved
"""
function cor(x::Any, y::Any)
cx = collect(x)
cy = collect(y)

corm(cx, _vmean(cx, 1), cy, _vmean(cy, 1))
end

"""
cor(x::AbstractVector, y::AbstractVector)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove this docstring which is a special case of the previous one.


Expand Down
49 changes: 40 additions & 9 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -339,11 +339,15 @@ Y = [6.0 2.0;
x1 = vec(X[1,:])
y1 = vec(Y[1,:])
end
x1_itr = (x1i for x1i in x1)
y1_itr = skipmissing(y1)

c = zm ? Statistics.covm(x1, 0, corrected=cr) :
cov(x1, corrected=cr)
c_itr = zm ? Statistics.covm(x1_itr, 0, corrected=cr) :
cov(x1_itr, corrected=cr)
@test isa(c, Float64)
@test c Cxx[1,1]
@test c == c_itr == Cxx[1,1]
@inferred cov(x1, corrected=cr)

@test cov(X) == Statistics.covm(X, mean(X, dims=1))
Expand All @@ -356,21 +360,31 @@ Y = [6.0 2.0;
@test cov(x1, y1) == Statistics.covm(x1, mean(x1), y1, mean(y1))
c = zm ? Statistics.covm(x1, 0, y1, 0, corrected=cr) :
cov(x1, y1, corrected=cr)
c_itr = zm ? Statistics.covm(x1_itr, 0, y1_itr, 0, corrected=cr) :
cov(x1_itr, y1_itr, corrected=cr)
c_itrx = zm ? Statistics.covm(x1_itr, 0, y1, 0, corrected=cr) :
cov(x1_itr, y1, corrected=cr)
c_itry = zm ? Statistics.covm(x1, 0, y1_itr, 0, corrected=cr) :
cov(x1, y1_itr, corrected=cr)
@test isa(c, Float64)
@test c Cxy[1,1]
@test c == c_itr == c_itrx == c_itry == Cxy[1,1]
@inferred cov(x1, y1, corrected=cr)

if vd == 1
@test cov(x1, Y) == Statistics.covm(x1, mean(x1), Y, mean(Y, dims=1))
C = cov(x1, Y)
C_itr = cov(x1_itr, Y)
@test C == C_itr == Statistics.covm(x1, mean(x1), Y, mean(Y, dims=1))
end
C = zm ? Statistics.covm(x1, 0, Y, 0, vd, corrected=cr) :
cov(x1, Y, dims=vd, corrected=cr)
cov(x1, Y, dims=vd, corrected=cr)
pdeffebach marked this conversation as resolved.
Show resolved Hide resolved
@test size(C) == (1, k)
@test vec(C) ≈ Cxy[1,:]
@inferred cov(x1, Y, dims=vd, corrected=cr)

if vd == 1
@test cov(X, y1) == Statistics.covm(X, mean(X, dims=1), y1, mean(y1))
C = cov(X, y1)
C_itr = cov(X, y1_itr)
@test C == C_itr == Statistics.covm(X, mean(X, dims=1), y1, mean(y1))
end
C = zm ? Statistics.covm(X, 0, y1, 0, vd, corrected=cr) :
cov(X, y1, dims=vd, corrected=cr)
Expand Down Expand Up @@ -426,10 +440,13 @@ end
x1 = vec(X[1,:])
y1 = vec(Y[1,:])
end
x1_itr = (x1i for x1i in x1)
y1_itr = skipmissing(y1)

c = zm ? Statistics.corm(x1, 0) : cor(x1)
c_itr = zm ? Statistics.corm(x1_itr, 0) : cor(x1_itr)
@test isa(c, Float64)
@test c ≈ Cxx[1,1]
@test c ≈ c_itr ≈ Cxx[1,1]
@inferred cor(x1)

@test cor(X) == Statistics.corm(X, mean(X, dims=1))
Expand All @@ -440,24 +457,38 @@ end

@test cor(x1, y1) == Statistics.corm(x1, mean(x1), y1, mean(y1))
c = zm ? Statistics.corm(x1, 0, y1, 0) : cor(x1, y1)
c_itr = zm ? Statistics.corm(x1_itr, 0, y1_itr, 0) : cor(x1_itr, y1_itr)
c_itrx = zm ? Statistics.corm(x1_itr, 0, y1, 0) : cor(x1_itr, y1)
c_itry = zm ? Statistics.corm(x1, 0, y1_itr, 0) : cor(x1, y1_itr)

@test isa(c, Float64)
@test c ≈ Cxy[1,1]
@test c == c_itr == c_itrx == c_itry ≈ Cxy[1,1]
@inferred cor(x1, y1)
nalimilan marked this conversation as resolved.
Show resolved Hide resolved

if vd == 1
@test cor(x1, Y) == Statistics.corm(x1, mean(x1), Y, mean(Y, dims=1))
C = cor(x1, Y)
C_itr = Statistics.corm(x1_itr, mean(x1), Y, mean(Y, dims=1))
@test C == C_itr == Statistics.corm(x1, mean(x1), Y, mean(Y, dims=1))
end
C = zm ? Statistics.corm(x1, 0, Y, 0, vd) : cor(x1, Y, dims=vd)
@test size(C) == (1, k)
@test vec(C) ≈ Cxy[1,:]
@inferred cor(x1, Y, dims=vd)

if vd == 1
@test cor(X, y1) == Statistics.corm(X, mean(X, dims=1), y1, mean(y1))
C = cor(X, y1)
C_itr = cor(X, y1_itr)
@test C == C_itr == Statistics.corm(X, mean(X, dims=1), y1, mean(y1))
end
println("zm = $zm")
pdeffebach marked this conversation as resolved.
Show resolved Hide resolved
C = zm ? Statistics.corm(X, 0, y1, 0, vd) : cor(X, y1, dims=vd)

@test size(C) == (k, 1)
@test vec(C) ≈ Cxy[:,1]
if vd == 1
C_itr = zm ? Statistics.corm(X, 0, y1_itr, 0) : cor(X, y1_itr)
@test C_itr == C
end
@inferred cor(X, y1, dims=vd)

@test cor(X, Y) == Statistics.corm(X, mean(X, dims=1), Y, mean(Y, dims=1))
Expand Down