-
Notifications
You must be signed in to change notification settings - Fork 42
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Make covariance and correlation work for iterators, skipmissing in particular. #34
base: master
Are you sure you want to change the base?
Changes from 14 commits
1cdf046
f3e9641
2f9c4f8
52c18ea
4620247
b86ddba
0221557
e3bc3cc
3493ed2
b940ae1
8b49745
2b28908
e42c0b0
cb3020c
36734bf
2f1c404
4279703
b9f8f96
14c5701
11bd8f5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -479,6 +479,33 @@ end | |
_vmean(x::AbstractVector, vardim::Int) = mean(x) | ||
_vmean(x::AbstractMatrix, vardim::Int) = mean(x, dims=vardim) | ||
|
||
_lazycollect(x::Any) = collect(x) | ||
pdeffebach marked this conversation as resolved.
Show resolved
Hide resolved
|
||
_lazycollect(x::AbstractVector) = x | ||
|
||
function _matrix_error(x, y, fun) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not just throw an error from There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes I would just add a special method to I think that collecting 2 or more dimensional arrays in places where we expect vectors is not useful (but we can discuss this). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It doesn't work because we don't use it all the time, for instance when we I can add a |
||
if x isa AbstractMatrix | ||
s = "$(fun)(x::AbstractMatrix, y::Any) is currently not allowed. " * | ||
"Use $(fun)(x, collect(y)) instead" | ||
throw(ArgumentError(s)) | ||
elseif y isa AbstractMatrix | ||
s = "$(fun)(x::Any, y::AbstractMatrix) is currently not allowed. " * | ||
"Use $(fun)(collect(x), y) instead" | ||
throw(ArgumentError(s)) | ||
end | ||
end | ||
|
||
# Reject matrix input for the four-argument `fun(x, mx, y, my)` iterator methods.
#
# Throws an `ArgumentError` when `x` or `y` is an `AbstractMatrix`; the message
# names the offending argument and suggests collecting the other one so that the
# array methods are used instead. Returns `nothing` when neither is a matrix.
# `mx` and `my` are accepted only to mirror the caller's signature.
function _matrix_error(x, mx, y, my, fun)
    # Each argument is tested separately so that a matrix `y` reaches its own
    # branch (a combined `x || y` test would shadow it).
    if x isa AbstractMatrix
        s = "$(fun)(x::$(typeof(x)), mx, y::Any, my) is currently not allowed. " *
            "Use $(fun)(x, mx, collect(y), my) instead"
        throw(ArgumentError(s))
    elseif y isa AbstractMatrix
        s = "$(fun)(x::Any, mx, y::$(typeof(y)), my) is currently not allowed. " *
            "Use $(fun)(collect(x), mx, y, my) instead."
        throw(ArgumentError(s))
    end
    return nothing
end
|
||
# core functions | ||
|
||
unscaled_covzm(x::AbstractVector{<:Number}) = sum(abs2, x) | ||
|
@@ -495,6 +522,7 @@ unscaled_covzm(x::AbstractMatrix, y::AbstractMatrix, vardim::Int) = | |
|
||
# covzm (with centered data) | ||
|
||
nalimilan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
covzm(itr::Any; corrected::Bool = true) = covzm(collect(itr); corrected = corrected) | ||
covzm(x::AbstractVector; corrected::Bool=true) = unscaled_covzm(x) / (length(x) - Int(corrected)) | ||
function covzm(x::AbstractMatrix, vardim::Int=1; corrected::Bool=true) | ||
C = unscaled_covzm(x, vardim) | ||
|
@@ -504,6 +532,10 @@ function covzm(x::AbstractMatrix, vardim::Int=1; corrected::Bool=true) | |
A .= A .* b | ||
return A | ||
end | ||
function covzm(x::Any, y::Any; corrected::Bool = true) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In what case There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The same question applies to |
||
_matrix_error(x, y, covzm) | ||
covzm(_lazycollect(x), _lazycollect(y); corrected = corrected) | ||
end | ||
covzm(x::AbstractVector, y::AbstractVector; corrected::Bool=true) = | ||
unscaled_covzm(x, y) / (length(x) - Int(corrected)) | ||
function covzm(x::AbstractVecOrMat, y::AbstractVecOrMat, vardim::Int=1; corrected::Bool=true) | ||
|
@@ -518,22 +550,34 @@ end | |
# covm (with provided mean) | ||
## Use map(t -> t - xmean, x) instead of x .- xmean to allow for Vector{Vector} | ||
## which can't be handled by broadcast | ||
covm(itr::Any, itrmean; corrected::Bool=true) = | ||
covm(collect(itr), itrmean; corrected=corrected) | ||
covm(x::AbstractVector, xmean; corrected::Bool=true) = | ||
covzm(map(t -> t - xmean, x); corrected=corrected) | ||
covm(x::AbstractMatrix, xmean, vardim::Int=1; corrected::Bool=true) = | ||
covzm(x .- xmean, vardim; corrected=corrected) | ||
# Covariance of two iterators `x` and `y` given their means `xmean` and `ymean`.
# `_matrix_error` rejects matrix arguments first; both inputs are then centered
# with `map` and handed to `covzm`.
function covm(x::Any, xmean, y::Any, ymean; corrected::Bool=true)
    _matrix_error(x, xmean, y, ymean, covm)
    centered_x = map(t -> t - xmean, x)
    centered_y = map(t -> t - ymean, y)
    return covzm(centered_x, centered_y; corrected=corrected)
end
covm(x::AbstractVector, xmean, y::AbstractVector, ymean; corrected::Bool=true) = | ||
covzm(map(t -> t - xmean, x), map(t -> t - ymean, y); corrected=corrected) | ||
covm(x::AbstractVecOrMat, xmean, y::AbstractVecOrMat, ymean, vardim::Int=1; corrected::Bool=true) = | ||
covzm(x .- xmean, y .- ymean, vardim; corrected=corrected) | ||
|
||
# cov (API) | ||
""" | ||
cov(x::AbstractVector; corrected::Bool=true) | ||
cov(itr::Any; corrected::Bool=true) | ||
pdeffebach marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
Compute the variance of the vector `x`. If `corrected` is `true` (the default) then the sum | ||
is scaled with `n-1`, whereas the sum is scaled with `n` if `corrected` is `false` where `n = length(x)`. | ||
Compute the variance of the iterator `itr`. If `corrected` is `true` (the default) then the sum | ||
is scaled with `n-1`, whereas the sum is scaled with `n` if `corrected` is `false` where | ||
``n`` is the number of elements. | ||
""" | ||
function cov(itr::Any; corrected::Bool=true)
    # NOTE(review): open question from code review — should 0-dimensional or
    # more-than-2-dimensional arrays be allowed here?
    x = collect(itr)
    meanx = mean(x)
    # Center into a fresh array with `map` rather than in place with `map!`:
    # the collected array may have an element type (e.g. `Int`) that cannot
    # store `t - meanx`, which made `cov(skipmissing([1, 2, 4]))` throw an
    # `InexactError`.
    return covzm(map(t -> t - meanx, x); corrected=corrected)
end
cov(x::AbstractVector; corrected::Bool=true) = covm(x, mean(x); corrected=corrected) | ||
|
||
""" | ||
|
@@ -547,13 +591,23 @@ cov(X::AbstractMatrix; dims::Int=1, corrected::Bool=true) = | |
covm(X, _vmean(X, dims), dims; corrected=corrected) | ||
|
||
""" | ||
cov(x::AbstractVector, y::AbstractVector; corrected::Bool=true) | ||
cov(x::Any, y::Any; corrected::Bool=true) | ||
|
||
Compute the covariance between the vectors `x` and `y`. If `corrected` is `true` (the | ||
Compute the covariance between the iterators `x` and `y`. If `corrected` is `true` (the | ||
default), computes ``\\frac{1}{n-1}\\sum_{i=1}^n (x_i-\\bar x) (y_i-\\bar y)^*`` where | ||
``*`` denotes the complex conjugate and `n = length(x) = length(y)`. If `corrected` is | ||
``*`` denotes the complex conjugate and ``n`` the number of elements. If `corrected` is | ||
`false`, computes ``\\frac{1}{n}\\sum_{i=1}^n (x_i-\\bar x) (y_i-\\bar y)^*``. | ||
""" | ||
function cov(x::Any, y::Any; corrected::Bool=true)
    _matrix_error(x, y, cov)
    # Inputs are no longer mutated, so vectors can be used as-is and only
    # non-vector iterators need to be materialized.
    cx = _lazycollect(x)
    cy = _lazycollect(y)
    meanx = _vmean(cx, 1)
    meany = _vmean(cy, 1)
    # Center with `map` instead of in-place `map!`: integer-valued input cannot
    # hold the (generally floating-point) centered values and would raise an
    # `InexactError`.
    dx = map(t -> t - meanx, cx)
    dy = map(t -> t - meany, cy)
    return covzm(dx, dy; corrected=corrected)
end
cov(x::AbstractVector, y::AbstractVector; corrected::Bool=true) = | ||
covm(x, mean(x), y, mean(y); corrected=corrected) | ||
|
||
|
@@ -629,8 +683,19 @@ function cov2cor!(C::AbstractMatrix, xsd::AbstractArray, ysd::AbstractArray) | |
return C | ||
end | ||
|
||
# Return `one` of the real type underlying the elements of `itr`.
#
# When `itr` reports a concrete element type, that type is used directly and the
# iterator is not traversed. Otherwise `itr` is collected and the element type of
# the resulting array is used.
function _return_one(itr)
    # `isconcretetype` is the Julia 1.x predicate; `isconcrete` is not defined
    # there and raised `UndefVarError` on every call.
    if Base.IteratorEltype(itr) isa Base.HasEltype && isconcretetype(eltype(itr))
        return one(real(eltype(itr)))
    else
        return one(real(eltype(collect(itr))))
    end
end
|
||
# corzm (non-exported, with centered data) | ||
|
||
# Correlation of a single iterator with itself: delegates to `_return_one`.
corzm(itr::Any) = _return_one(itr)
pdeffebach marked this conversation as resolved.
Show resolved
Hide resolved
|
||
corzm(x::AbstractVector{T}) where {T} = one(real(T)) | ||
function corzm(x::AbstractMatrix, vardim::Int=1) | ||
c = unscaled_covzm(x, vardim) | ||
|
@@ -645,8 +710,15 @@ corzm(x::AbstractMatrix, y::AbstractMatrix, vardim::Int=1) = | |
|
||
# corm | ||
|
||
# Correlation of a single iterator with itself; `itrmean` is accepted but unused.
corm(itr::Any, itrmean) = _return_one(itr)
pdeffebach marked this conversation as resolved.
Show resolved
Hide resolved
|
||
corm(x::AbstractVector{T}, xmean) where {T} = one(real(T)) | ||
corm(x::AbstractMatrix, xmean, vardim::Int=1) = corzm(x .- xmean, vardim) | ||
# Correlation of two iterators with supplied means `mx` and `my`.
# `_matrix_error` rejects matrix arguments; both inputs then pass through
# `_lazycollect` before the call is forwarded to `corm` again.
function corm(x::Any, mx, y::Any, my)
    _matrix_error(x, mx, y, my, corm)
    xs = _lazycollect(x)
    ys = _lazycollect(y)
    return corm(xs, mx, ys, my)
end
function corm(x::AbstractVector, mx, y::AbstractVector, my) | ||
require_one_based_indexing(x, y) | ||
n = length(x) | ||
|
@@ -675,10 +747,13 @@ corm(x::AbstractVecOrMat, xmean, y::AbstractVecOrMat, ymean, vardim::Int=1) = | |
|
||
# cor | ||
""" | ||
cor(x::AbstractVector) | ||
cor(itr::Any) | ||
|
||
Return the number one. | ||
""" | ||
cor(itr::Any) = _return_one(itr)  # result comes from `_return_one`
pdeffebach marked this conversation as resolved.
Show resolved
Hide resolved
|
||
cor(x::AbstractVector) = one(real(eltype(x))) | ||
|
||
""" | ||
|
@@ -688,6 +763,19 @@ Compute the Pearson correlation matrix of the matrix `X` along the dimension `di | |
""" | ||
cor(X::AbstractMatrix; dims::Int=1) = corm(X, _vmean(X, dims), dims) | ||
|
||
""" | ||
cor(x::Any, y::Any) | ||
|
||
Compute the Pearson correlation between iterators `x` and `y`. | ||
""" | ||
function cor(x::Any, y::Any)
    _matrix_error(x, y, cor)
    # Pass both inputs through `_lazycollect` once so the means and the
    # correlation are computed from the same values.
    xs = _lazycollect(x)
    ys = _lazycollect(y)
    mean_x = mean(xs)
    mean_y = mean(ys)
    return corm(xs, mean_x, ys, mean_y)
end
|
||
""" | ||
cor(x::AbstractVector, y::AbstractVector) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Remove this docstring which is a special case of the previous one. |
||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Intentional?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes. It's a hack to make sure Julia knows to load this folder; it's described here for Pkg.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Normally the Travis script does that automatically, so you can revert this: https://github.com/JuliaLang/Statistics.jl/blob/master/.travis.yml#L24
Though you need it to run tests locally.