diff --git a/docs/make.jl b/docs/make.jl
index 684bd93..fb97142 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -8,7 +8,7 @@ makedocs(
     sitename = "MultivariateStats.jl",
     modules = [MultivariateStats],
     pages = ["Home"=>"index.md",
-             "whiten.md", "lda.md", "pca.md",
+             "whiten.md", "lda.md", "pca.md", "mds.md",
              "Development"=>"api.md"]
 )
diff --git a/docs/src/api.md b/docs/src/api.md
index 56f2c40..e607d68 100644
--- a/docs/src/api.md
+++ b/docs/src/api.md
@@ -8,7 +8,7 @@ Table of the package models and corresponding function names used by these model
 |------------------|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
 |fit | x | x | x | x | x | x | x | x | x |
 |transform | x | x | x | x | x | x | x | x | x |
-|predict | | | | x | | | | | |
+|predict | | | | x | | | | | |
 |indim | | x | x | x | x | x | x | x | x |
 |outdim | x | x | x | x | x | x | x | x | x |
 |mean | x | x | x | x | x | x | x | ? | |
@@ -30,21 +30,21 @@ Note: `?` refers to a possible implementation that is missing or called differen
 | Function \ Model | WHT | CCA | LDA |MC-LDA|SS-LDA| ICA | FA |PPCA | PCA |KPCA | MDS |
 |------------------|:---:|:---:|:---:|:----:|:----:|:---:|:---:|:---:|:---:|:---:|:---:|
 |fit | x | x | x | x | x | x | x | x | x | x | x |
-|transform | x | x | - | - | - | x | x | x | x | x | x |
-|predict | | | x | + | + | | | | | | |
-|indim | - | | | - | - | x | x | x | x | x | x |
-|outdim | - | x | | - | - | x | x | x | x | x | x |
+|transform | x | x | - | - | - | x | x | x | - | x | - |
+|predict | | | x | + | + | | | | + | | + |
+|indim | - | | | - | - | x | x | x | - | x | - |
+|outdim | - | x | | - | - | x | x | x | - | x | - |
 |mean | x | x | | x | x | x | x | x | x | ? | |
 |var | | | | | | | x | x | x | ? | ? |
 |cov | | | | | | | x | x | | | |
 |cor | | x | | | | | | | | | |
 |projection | ? | x | | x | x | | x | x | x | x | x |
 |reconstruct | | | | | | | x | x | x | x | |
-|loadings | | ? | | | | | x | x | x | ? | ? |
+|loadings | | ? | | | | | x | x | x | ? | + |
 |eigvals | | | | | + | | ? | ? | x | ? | x |
-|eigvecs | | | | | | | ? | ? | x | ? | ? |
+|eigvecs | | | | | | | ? | ? | x | ? | + |
 |length | + | | + | + | + | | | | | | |
-|size | + | | | + | + | | | | x | | |
+|size | + | | | + | + | | | | x | | + |
 | | | | | | | | | | | | |
 
 - StatsBase.AbstractDataTransform
@@ -70,7 +70,7 @@ Note: `?` refers to a possible implementation that is missing or called differen
     - Methods: ICA, PCA
   - NonlinearDimensionalityReduction
     - Methods: KPCA, MDS
-    - Functions: modelmatrix (X),
+    - Functions: modelmatrix(X),
   - LatentVariableModel or LatentVariableDimensionalityReduction
     - Methods: FA, PPCA
     - Functions: cov
diff --git a/docs/src/index.md b/docs/src/index.md
index 7bc7063..34acfcf 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -11,7 +11,7 @@ end
 
 [MultivariateStats.jl](https://github.com/JuliaStats/MultivariateStats.jl) is a Julia package for multivariate statistical analysis. It provides a rich set of useful analysis techniques, such as PCA, CCA, LDA, ICA, etc.
 
 ```@contents
-Pages = ["whiten.md", "lda.md", "pca.md", "api.md"]
+Pages = ["whiten.md", "lda.md", "pca.md", "mds.md", "api.md"]
 Depth = 2
 ```
diff --git a/docs/src/mds.md b/docs/src/mds.md
new file mode 100644
index 0000000..ed75171
--- /dev/null
+++ b/docs/src/mds.md
@@ -0,0 +1,85 @@
+# Multidimensional Scaling
+
+In general, [Multidimensional Scaling](http://en.wikipedia.org/wiki/Multidimensional_scaling) (MDS)
+refers to techniques that transform samples into a lower-dimensional space while
+preserving the inter-sample distances as well as possible.
+
+## Example
+
+
+Performing [`MDS`](@ref) on the *Iris* data set:
+
+```@example MDSex
+using MultivariateStats, RDatasets, Plots
+
+# load iris dataset
+iris = dataset("datasets", "iris")
+
+# take half of the dataset
+X = Matrix(iris[1:2:end,1:4])'
+X_labels = Vector(iris[1:2:end,5])
+nothing # hide
+```
+
+Suppose `X` is our data matrix, with each observation in a column.
+We train an MDS model, allowing up to 3 dimensions:
+
+```@example MDSex
+M = fit(MDS, X; maxoutdim=3, distances=false)
+```
+
+Then, we apply the MDS model to get an embedding of our data in 3D space:
+
+```@example MDSex
+Y = predict(M)
+```
+
+Now, we group the results by the species labels for color coding and visualize the first
+3 coordinates of the embedding in a 3D interactive plot:
+
+```@example MDSex
+setosa = Y[:,X_labels.=="setosa"]
+versicolor = Y[:,X_labels.=="versicolor"]
+virginica = Y[:,X_labels.=="virginica"]
+
+p = scatter(setosa[1,:],setosa[2,:],setosa[3,:],marker=:circle,linewidth=0)
+scatter!(versicolor[1,:],versicolor[2,:],versicolor[3,:],marker=:circle,linewidth=0)
+scatter!(virginica[1,:],virginica[2,:],virginica[3,:],marker=:circle,linewidth=0)
+```
+
+## Classical Multidimensional Scaling
+This package defines an `MDS` type to represent a classical MDS model [^1],
+and provides a set of methods to access its properties.
+
+```@docs
+MDS
+```
+
+The `MDS` type comes with several methods, where ``M`` is an instance of [`MDS`](@ref),
+``d`` is the dimension of the observations, ``p`` is the output dimension, i.e.
+the embedding dimension, and ``n`` is the number of observations.
+
+```@docs
+fit(::Type{MDS}, ::AbstractMatrix{T}; kwargs) where {T<:Real}
+predict(::MDS)
+predict(::MDS, ::AbstractVector{<:Real})
+size(::MDS)
+projection(M::MDS)
+loadings(M::MDS)
+eigvals(M::MDS)
+eigvecs(M::MDS)
+stress
+```
+
+This package provides the following functions related to classical MDS.
+```@docs
+gram2dmat
+gram2dmat!
+dmat2gram
+dmat2gram!
+```
+
+## References
+
+[^1]: Ingwer Borg and Patrick J. F. Groenen, "Modern Multidimensional Scaling: Theory and Applications", Springer, pp. 201–268, 2005.
+
diff --git a/src/MultivariateStats.jl b/src/MultivariateStats.jl
index ce8a815..6cd1544 100644
--- a/src/MultivariateStats.jl
+++ b/src/MultivariateStats.jl
@@ -137,6 +137,11 @@ module MultivariateStats
     @deprecate outdim(f::PCA) size(f::PCA)[2]
     @deprecate tvar(f::PCA) var(f::PCA) # total variance
     @deprecate transform(f::PCA, x) predict(f::PCA, x) #ex=false
+    @deprecate classical_mds(D::AbstractMatrix, p::Int) predict(fit(MDS, D, maxoutdim=p, distances=true))
+    @deprecate indim(f::MDS) size(f::MDS)[1]
+    @deprecate outdim(f::MDS) size(f::MDS)[2]
+    @deprecate transform(f::MDS) predict(f::MDS)
+    @deprecate transform(f::MDS, x) predict(f::MDS, x)
     # @deprecate transform(m, x; kwargs...) predict(m, x; kwargs...) #ex=false
     # @deprecate transform(m; kwargs...) predict(m; kwargs...) #ex=false
diff --git a/src/cmds.jl b/src/cmds.jl
index 9895e70..4578484 100644
--- a/src/cmds.jl
+++ b/src/cmds.jl
@@ -2,6 +2,11 @@
 
 ## convert Gram matrix to Distance matrix
 
+"""
+    gram2dmat!(D, G)
+
+Convert a Gram matrix `G` to a distance matrix, and write the results to `D`.
+"""
 function gram2dmat!(D::AbstractMatrix{DT}, G::AbstractMatrix) where {DT<:Real}
     # argument checking
     m = size(G, 1)
@@ -23,10 +28,20 @@ function gram2dmat!(D::AbstractMatrix{DT}, G::AbstractMatrix) where {DT<:Real}
     return D
 end
 
+"""
+    gram2dmat(G)
+
+Convert a Gram matrix `G` to a distance matrix.
+"""
 gram2dmat(G::AbstractMatrix{T}) where {T<:Real} = gram2dmat!(similar(G, T), G)
 
 ## convert Distance matrix to Gram matrix
 
+"""
+    dmat2gram!(G, D)
+
+Convert a distance matrix `D` to a Gram matrix, and write the results to `G`.
+"""
 function dmat2gram!(G::AbstractMatrix{GT}, D::AbstractMatrix) where GT
     # argument checking
     n = LinearAlgebra.checksquare(D)
@@ -53,12 +68,46 @@
 end
 
 momenttype(T) = typeof((zero(T) * zero(T) + zero(T) * zero(T))/ 2)
+
+"""
+    dmat2gram(D)
+
+Convert a distance matrix `D` to a Gram matrix.
+"""
 dmat2gram(D::AbstractMatrix{T}) where {T<:Real} = dmat2gram!(similar(D, momenttype(T)), D)
 
 ## Classical MDS
 
-"""Classical MDS type"""
-struct MDS{T<:Real}
+"""
+*Classical Multidimensional Scaling* (MDS), also known as Principal Coordinates Analysis (PCoA),
+is a specific MDS technique that accomplishes the embedding in two steps:
+
+1. Convert the distance matrix to a Gram matrix. This conversion is based on
+the following relation between a distance matrix ``D`` and a Gram matrix ``G``:
+
+```math
+\\mathrm{sqr}(\\mathbf{D}) = \\mathbf{g} \\mathbf{1}^T + \\mathbf{1} \\mathbf{g}^T - 2 \\mathbf{G}
+```
+
+Here, ``\\mathrm{sqr}(\\mathbf{D})`` indicates the element-wise square of ``\\mathbf{D}``,
+and ``\\mathbf{g}`` is the vector of diagonal elements of ``\\mathbf{G}``. This relation is
+itself based on the following decomposition of squared Euclidean distance:
+
+```math
+\\| \\mathbf{x} - \\mathbf{y} \\|^2 = \\| \\mathbf{x} \\|^2 + \\| \\mathbf{y} \\|^2 - 2 \\mathbf{x}^T \\mathbf{y}
+```
+
+2. Perform eigenvalue decomposition of the Gram matrix to derive the coordinates.
+
+*Note:* The Gramian derived from ``D`` may have non-positive or degenerate
+eigenvalues. The subspace of non-positive eigenvalues is projected out
+of the MDS solution so that the strain function is minimized in a
+least-squares sense. If the smallest remaining eigenvalue that is used
+for the MDS is degenerate, then the solution is not unique, as any
+linear combination of degenerate eigenvectors will also yield an MDS
+solution with the same strain value.
+"""
+struct MDS{T<:Real} <: NonlinearDimensionalityReduction
     d::Real                 # original dimension
     X::AbstractMatrix{T}    # fitted data, X (d x n)
     λ::AbstractVector{T}    # eigenvalues in feature space, (k x 1)
@@ -66,17 +115,53 @@
 end
 
 ## properties
+"""
+    size(M)
 
-indim(M::MDS) = M.d
-outdim(M::MDS) = size(M.U,2)
+Returns a tuple whose first value is the input dimension of the MDS model `M`,
+*i.e.* the dimension of the observation space, and whose second value is the output
+dimension, *i.e.* the dimension of the embedding.
+"""
+size(M::MDS) = (M.d, size(M.U,2))
+
+"""
+    projection(M)
+
+Get the eigenvector matrix (of size ``(n, p)``) of the embedding space of the MDS model `M`.
+The eigenvectors are arranged in descending order of the corresponding eigenvalues.
+"""
 projection(M::MDS) = M.U
+
+"""
+    eigvecs(M::MDS)
+
+Get the eigenvector matrix of the MDS model `M`.
+"""
+eigvecs(M::MDS) = projection(M)
+
+"""
+    eigvals(M::MDS)
+
+Get the eigenvalues of the MDS model `M`.
+"""
 eigvals(M::MDS) = M.λ
 
+"""
+    loadings(M::MDS)
+
+Get the loadings of the MDS model `M`.
+"""
+loadings(M::MDS) = sqrt.(M.λ)' .* M.U
+
 ## use
 
-"""Calculate out-of-sample multidimensional scaling transformation"""
-function transform(M::MDS{T}, x::AbstractVector{<:Real}; distances=false) where {T<:Real}
+"""
+    predict(M, x::AbstractVector)
+
+Calculate the out-of-sample transformation of the observation `x` for the MDS model `M`.
+Here, `x` is a vector of length `d`.
+"""
+function predict(M::MDS{T}, x::AbstractVector{<:Real}; distances=false) where {T<:Real}
     d = if isnan(M.d) # model has only distance matrix
         @assert distances "Cannot transform points if model was fitted with a distance matrix. Use point distances."
         size(x, 1) != size(M.X, 1) && throw(
@@ -107,17 +192,24 @@
     return M.U' * b ./ sqrt.(λ)
 end
 
-"""Calculate multidimensional scaling transformation"""
-function transform(M::MDS{T}) where {T<:Real}
-    # if there are non-positive missing eigval then padd with zeros
-    λ = vcat(M.λ, zeros(T, outdim(M) - length(M.λ)))
+"""
+    predict(M)
+
+Returns the coordinate matrix of size ``(p, n)`` for the MDS model `M`, where each column
+contains the coordinates of an observation in the embedding space.
+"""
+function predict(M::MDS{T}) where {T<:Real}
+    d, p = size(M)
+    # if there are non-positive or missing eigvals then pad with zeros
+    λ = vcat(M.λ, zeros(T, p - length(M.λ)))
     return diagm(0=>sqrt.(λ)) * M.U'
 end
 
 ## show
 
-function Base.show(io::IO, M::MDS)
-    print(io, "Classical MDS(indim = $(indim(M)), outdim = $(outdim(M)))")
+function show(io::IO, M::MDS)
+    d, p = size(M)
+    print(io, "Classical MDS(indim = $d, outdim = $p)")
 end
 
 ## interface functions
@@ -129,12 +221,12 @@ There are two calling options, specified via the required keyword argument `dist
 
     mds = fit(MDS, X; distances=false, maxoutdim=size(X,1)-1)
 
-`X` is the data matrix. Distances between pairs of columns of `X` are computed using the Euclidean norm.
+where `X` is the data matrix. Distances between pairs of columns of `X` are computed using the Euclidean norm.
 This is equivalent to performing PCA on `X`.
 
-    mds = fit(MDS, D; distances=true, maxdim=size(D,1)-1)
+    mds = fit(MDS, D; distances=true, maxoutdim=size(D,1)-1)
 
-`D` is a symmetric matrix `D` of distances between points.
+where `D` is a symmetric matrix of distances between points.
 """
 function fit(::Type{MDS}, X::AbstractMatrix{T};
              maxoutdim::Int = size(X,1)-1,
@@ -199,8 +291,12 @@
     return MDS(d, X, λ, U)
 end
 
-@deprecate classical_mds(D::AbstractMatrix, p::Int) transform(fit(MDS, D, maxoutdim=p, distances=true))
 
+"""
+    stress(M)
+
+Get the stress of the MDS model `M`.
+"""
 function stress(M::MDS)
     # calculate distances if original data was stored
     DX = isnan(M.d) ? M.X : pairwise((x,y)->norm(x-y), eachcol(M.X), symmetric=true)
@@ -208,3 +304,4 @@ function stress(M::MDS)
     n = size(DX,1)
     return sqrt(2*sum((DX - DY).^2)/sum(DX.^2));
 end
+
diff --git a/test/cmds.jl b/test/cmds.jl
index 77531f0..e412989 100644
--- a/test/cmds.jl
+++ b/test/cmds.jl
@@ -33,12 +33,12 @@ using Test
     @test length(eigvals(M)) == 3
     @test stress(M) ≈ 0.0 atol = 1e-10
 
-    X = transform(M)
+    X = predict(M)
     @test size(X) == (3,n)
     @test MultivariateStats.pairwise((x,y)->norm(x-y), eachcol(X0), symmetric=true) ≈ D0
 
-    @test_throws DimensionMismatch transform(M, rand(d+1))
-    y = transform(M, X0[:, 1])
+    @test_throws DimensionMismatch predict(M, rand(d+1))
+    y = predict(M, X0[:, 1])
     @test X[:, 1] ≈ y
 
     # use only distance matrix
@@ -47,24 +47,24 @@
     @test outdim(M) == 3
     @test stress(M) ≈ 0.0 atol = 1e-10
 
-    X = transform(M)
+    X = predict(M)
    @test size(X) == (3,n)
    @test MultivariateStats.pairwise((x,y)->norm(x-y), eachcol(X0), symmetric=true) ≈ D0
 
-    @test_throws AssertionError transform(M, X0[:, 1])
-    @test_throws DimensionMismatch transform(M, rand(d+1); distances = true)
+    @test_throws AssertionError predict(M, X0[:, 1])
+    @test_throws DimensionMismatch predict(M, rand(d+1); distances = true)
 
     d = MultivariateStats.pairwise((x,y)->norm(x-y), X0, X0[:,2]) #|> vec
-    y = transform(M, d, distances=true)
+    y = predict(M, d, distances=true)
     @test X[:, 2] ≈ y
 
     #Test MDS embeddings in dimensions >= number of points
     M = fit(MDS, [0. 1.; 1. 0.], maxoutdim=2, distances=true)
     @test outdim(M) == 2
-    @test transform(M) == [-0.5 0.5; 0 0]
+    @test predict(M) == [-0.5 0.5; 0 0]
     M = fit(MDS, [0. 1.; 1. 0.], maxoutdim=3, distances=true)
     @test outdim(M) == 3
-    @test transform(M) == [-0.5 0.5; 0 0; 0 0]
+    @test predict(M) == [-0.5 0.5; 0 0; 0 0]
 
 
     #10 - dmat2gram produces negative definite matrix
@@ -80,12 +80,12 @@
              0.38095238095238093 0.4 0.21052631578947367 0.5454545454545454 0.3333333333333333 0.3181818181818182 0.23529411764705882 0.5555555555555556 0.4 1.0],
        Xt = [-0.27529104101488666 0.006134513718202863 0.33298809606740326 0.2608994458893664 -0.46185275796909575 -0.23734315039370618 0.29972782027671513 0.03827901455843394 -0.04096713097883363 0.07742518984640051
              -0.08177061420820278 -0.0044504235228030225 -0.3271919093638943 0.28206254638779243 -0.0954706915166714 -0.07137742126520012 -0.30754933764853587 0.18582658369448027 -0.03715307349750036 0.45707434094053534]',
-        M = transform(fit(MDS, D, maxoutdim=2, distances=true))'
+        M = predict(fit(MDS, D, maxoutdim=2, distances=true))'
         @test M ≈ Xt .* cis.(angle.(sum(conj.(Xt) .* M, dims=1)))
     end
 
     #10 - test degenerate problem
-    @test transform(fit(MDS, zeros(10, 10), maxoutdim=3, distances=false)) == zeros(3, 10)
+    @test predict(fit(MDS, zeros(10, 10), maxoutdim=3, distances=false)) == zeros(3, 10)
@@ -94,17 +94,17 @@
     # out-of-sample
     D = [0 1 2 1;
         1 0 1 2;
        2 1 0 1;
         1 2 1 0.0f32]
     M = fit(MDS, sqrt.(D), maxoutdim=2, distances=true)
-    X = transform(M)
+    X = predict(M)
     @test D ≈ MultivariateStats.pairwise((x,y)->sum(abs2, x-y), eachcol(X), symmetric=true)
     @test eltype(X) == Float32
 
     a = Float32[0.5, 0.5, 0.5, 0.5]
     A = vcat(hcat(D, a), hcat(a', zeros(Float32, 1, 1)))
     M⁺ = fit(MDS, sqrt.(A), maxoutdim=2, distances=true)
-    X⁺ = transform(M⁺)
+    X⁺ = predict(M⁺)
     @test A ≈ MultivariateStats.pairwise((x,y)->sum(abs2, x-y), eachcol(X⁺), symmetric=true)
 
-    y = transform(M, a, distances=true)
+    y = predict(M, a, distances=true)
     Y = [X y]
     @test A ≈ MultivariateStats.pairwise((x,y)->sum(abs2, x-y), eachcol(Y), symmetric=true)
     @test eltype(Y) == Float32
@@ -121,8 +121,8 @@ using Test
     MM = fit(MDS, XX, maxoutdim=3, distances=false)
 
     # test that mixing types doesn't error
-    transform(M, yy)
-    transform(MM, y)
+    predict(M, yy)
+    predict(MM, y)
 
     # type stability
     for func in (projection, eigvals, stress)
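
For reference, the two-step classical MDS procedure described in the new `MDS` docstring (convert the distance matrix to a Gram matrix, then eigendecompose it) can be reproduced by hand with the `dmat2gram` helper documented above. The snippet below is a minimal sketch, assuming the `fit`/`predict` API introduced in this change; the names `X0`, `D0`, and `k` are illustrative only.

```julia
using MultivariateStats, LinearAlgebra

X0 = rand(3, 10)                                           # 10 observations in 3 dimensions
D0 = [norm(x - y) for x in eachcol(X0), y in eachcol(X0)]  # pairwise Euclidean distances

# Step 1: convert the distance matrix to a double-centered Gram matrix.
G = dmat2gram(D0)

# Step 2: eigendecompose the Gram matrix and keep the k largest eigenvalues.
k = 2
E = eigen(Symmetric(G))
ord = sortperm(E.values, rev=true)[1:k]
Y_manual = Diagonal(sqrt.(E.values[ord])) * E.vectors[:, ord]'  # k x n coordinate matrix

# The model-based API should give the same embedding, up to sign flips of the axes.
Y_model = predict(fit(MDS, D0; maxoutdim=k, distances=true))
```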