Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

making kmeans take AbstractMatrix instead of Matrix #138

Merged
merged 13 commits into from
Jan 29, 2019
48 changes: 25 additions & 23 deletions src/kmeans.jl
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,8 @@ const _kmeans_default_maxiter = 100
const _kmeans_default_tol = 1.0e-6
const _kmeans_default_display = :none

function kmeans!(X::Matrix{T}, centers::Matrix{T};
weights=nothing,
maxiter::Integer=_kmeans_default_maxiter,
function kmeans!(X::AbstractMatrix{T}, centers::AbstractMatrix{T};
weights=nothing, maxiter::Integer=_kmeans_default_maxiter,
tol::Real=_kmeans_default_tol,
display::Symbol=_kmeans_default_display,
distance::SemiMetric=SqEuclidean()) where T<:AbstractFloat
Expand All @@ -40,7 +39,7 @@ function kmeans!(X::Matrix{T}, centers::Matrix{T};
round(Int, maxiter), tol, display_level(display), distance)
tlienart marked this conversation as resolved.
Show resolved Hide resolved
end

function kmeans(X::Matrix{T}, k::Int;
function kmeans(X::AbstractMatrix{T}, k::Int;
weights=nothing,
init=_kmeans_default_init,
maxiter::Integer=_kmeans_default_maxiter,
Expand All @@ -64,17 +63,18 @@ end

# core k-means skeleton
function _kmeans!(
x::Matrix{T}, # in: sample matrix (d x n)
w::Union{Nothing, Vector{T}}, # in: sample weights (n)
centers::Matrix{T}, # in/out: matrix of centers (d x k)
assignments::Vector{Int}, # out: vector of assignments (n)
costs::Vector{T}, # out: costs of the resultant assignments (n)
counts::Vector{Int}, # out: the number of samples assigned to each cluster (k)
cweights::Vector{Float64}, # out: the weights of each cluster
maxiter::Int, # in: maximum number of iterations
tol::Real, # in: tolerance of change at convergence
displevel::Int, # in: the level of display
distance::SemiMetric) where T<:AbstractFloat # in: function to calculate the distance with
x::AbstractMatrix{T}, # in: sample matrix (d x n)
w::Union{Nothing, Vector{T}}, # in: sample weights (n)
tlienart marked this conversation as resolved.
Show resolved Hide resolved
centers::AbstractMatrix{T}, # in/out: matrix of centers (d x k)
assignments::Vector{Int}, # out: vector of assignments (n)
costs::Vector{T}, # out: costs of the resultant assignments (n)
counts::Vector{Int}, # out: the number of samples assigned to each cluster (k)
cweights::Vector{Float64}, # out: the weights of each cluster
tlienart marked this conversation as resolved.
Show resolved Hide resolved
maxiter::Int, # in: maximum number of iterations
tlienart marked this conversation as resolved.
Show resolved Hide resolved
tol::Real, # in: tolerance of change at convergence
displevel::Int, # in: the level of display
distance::SemiMetric # in: function to calculate the distance with
) where T<:AbstractFloat

# initialize

Expand Down Expand Up @@ -157,6 +157,8 @@ function _kmeans!(
end
end

centers isa Matrix || (centers = copy(centers))
tlienart marked this conversation as resolved.
Show resolved Hide resolved

return KmeansResult(centers, assignments, costs, counts, cweights,
Float64(objv), t, converged)
end
Expand Down Expand Up @@ -238,11 +240,11 @@ end
# (specific to the case where samples are not weighted)
#
function update_centers!(
x::Matrix{T}, # in: sample matrix (d x n)
w::Nothing, # in: sample weights
x::AbstractMatrix{T}, # in: sample matrix (d x n)
w::Nothing, # in: sample weights
assignments::Vector{Int}, # in: assignments (n)
to_update::Vector{Bool}, # in: whether a center needs update (k)
centers::Matrix{T}, # out: updated centers (d x k)
centers::AbstractMatrix{T}, # out: updated centers (d x k)
cweights::Vector) where T<:AbstractFloat # out: updated cluster weights (k)

d::Int = size(x, 1)
Expand Down Expand Up @@ -292,11 +294,11 @@ end
# (specific to the case where samples are weighted)
#
function update_centers!(
x::Matrix{T}, # in: sample matrix (d x n)
x::AbstractMatrix{T}, # in: sample matrix (d x n)
weights::Vector{T}, # in: sample weights (n)
assignments::Vector{Int}, # in: assignments (n)
to_update::Vector{Bool}, # in: whether a center needs update (k)
centers::Matrix{T}, # out: updated centers (d x k)
centers::AbstractMatrix{T}, # out: updated centers (d x k)
cweights::Vector # out: updated cluster weights (k)
) where T<:AbstractFloat

Expand Down Expand Up @@ -342,9 +344,9 @@ end
# Re-picks centers that get no samples assigned to them.
#
function repick_unused_centers(
x::Matrix{T}, # in: the sample set (d x n)
costs::Vector{T}, # in: the current assignment costs (n)
centers::Matrix{T}, # to be updated: the centers (d x k)
x::AbstractMatrix{T}, # in: the sample set (d x n)
costs::Vector{T}, # in: the current assignment costs (n)
centers::AbstractMatrix{T}, # to be updated: the centers (d x k)
unused::Vector{Int}) where T<:AbstractFloat # in: the set of indices of centers to be updated

# pick new centers using a scheme like kmeans++
Expand Down
41 changes: 20 additions & 21 deletions src/seeding.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@

abstract type SeedingAlgorithm end

initseeds(alg::SeedingAlgorithm, X::RealMatrix, k::Integer) =
initseeds(alg::SeedingAlgorithm, X::AbstractMatrix{T}, k::Integer) where T<:AbstractFloat =
tlienart marked this conversation as resolved.
Show resolved Hide resolved
initseeds!(Vector{Int}(undef, k), alg, X)

initseeds_by_costs(alg::SeedingAlgorithm, costs::RealMatrix, k::Integer) =
initseeds_by_costs(alg::SeedingAlgorithm, costs::AbstractMatrix{T}, k::Integer) where T<:AbstractFloat =
initseeds_by_costs!(Vector{Int}(undef, k), alg, costs)

seeding_algorithm(s::Symbol) =
Expand All @@ -31,16 +31,16 @@ seeding_algorithm(s::Symbol) =
s == :kmcen ? KmCentralityAlg() :
error("Unknown seeding algorithm $s")

initseeds(algname::Symbol, X::RealMatrix, k::Integer) =
initseeds(algname::Symbol, X::AbstractMatrix{T}, k::Integer) where T<:AbstractFloat =
initseeds(seeding_algorithm(algname), X, k)::Vector{Int}

initseeds_by_costs(algname::Symbol, costs::RealMatrix, k::Integer) =
initseeds_by_costs(algname::Symbol, costs::AbstractMatrix{T}, k::Integer) where T<:AbstractFloat =
initseeds_by_costs(seeding_algorithm(algname), costs, k)

initseeds(iseeds::Vector{Int}, X::RealMatrix, k::Integer) = iseeds
initseeds_by_costs(iseeds::Vector{Int}, costs::RealMatrix, k::Integer) = iseeds
initseeds(iseeds::Vector{Int}, X::AbstractMatrix{T}, k::Integer) where T<:AbstractFloat = iseeds
initseeds_by_costs(iseeds::Vector{Int}, costs::AbstractMatrix{T}, k::Integer) where T<:AbstractFloat = iseeds

function copyseeds!(S::DenseMatrix, X::DenseMatrix, iseeds::AbstractVector)
function copyseeds!(S::DenseMatrix, X::AbstractMatrix{T}, iseeds::AbstractVector) where T<:AbstractFloat
d = size(X, 1)
n = size(X, 2)
k = length(iseeds)
Expand All @@ -53,7 +53,7 @@ function copyseeds!(S::DenseMatrix, X::DenseMatrix, iseeds::AbstractVector)
return S
end

copyseeds(X::DenseMatrix{T}, iseeds::AbstractVector) where {T} =
copyseeds(X::AbstractMatrix{T}, iseeds::AbstractVector) where {T} =
copyseeds!(Matrix{T}(undef, size(X,1), length(iseeds)), X, iseeds)

function check_seeding_args(n::Integer, k::Integer)
Expand All @@ -69,11 +69,9 @@ end

mutable struct RandSeedAlg <: SeedingAlgorithm end

initseeds!(iseeds::IntegerVector, alg::RandSeedAlg, X::RealMatrix) =
tlienart marked this conversation as resolved.
Show resolved Hide resolved
sample!(1:size(X,2), iseeds; replace=false)
initseeds!(iseeds::IntegerVector, alg::RandSeedAlg, X::AbstractMatrix{T}) where T<:AbstractFloat = sample!(1:size(X,2), iseeds; replace=false)

initseeds_by_costs!(iseeds::IntegerVector, alg::RandSeedAlg, X::RealMatrix) =
sample!(1:size(X,2), iseeds; replace=false)
initseeds_by_costs!(iseeds::IntegerVector, alg::RandSeedAlg, X::AbstractMatrix{T}) where T<:AbstractFloat = sample!(1:size(X,2), iseeds; replace=false)


# Kmeans++ seeding
Expand All @@ -85,7 +83,7 @@ initseeds_by_costs!(iseeds::IntegerVector, alg::RandSeedAlg, X::RealMatrix) =

mutable struct KmppAlg <: SeedingAlgorithm end

function initseeds!(iseeds::IntegerVector, alg::KmppAlg, X::RealMatrix, metric::PreMetric)
function initseeds!(iseeds::IntegerVector, alg::KmppAlg, X::AbstractMatrix{T}, metric::PreMetric) where T <: AbstractFloat
n = size(X, 2)
k = length(iseeds)
check_seeding_args(n, k)
Expand Down Expand Up @@ -115,10 +113,9 @@ function initseeds!(iseeds::IntegerVector, alg::KmppAlg, X::RealMatrix, metric::
return iseeds
end

initseeds!(iseeds::IntegerVector, alg::KmppAlg, X::RealMatrix) =
initseeds!(iseeds, alg, X, SqEuclidean())
initseeds!(iseeds::IntegerVector, alg::KmppAlg, X::AbstractMatrix{T}) where T<:AbstractFloat = initseeds!(iseeds, alg, X, SqEuclidean())

function initseeds_by_costs!(iseeds::IntegerVector, alg::KmppAlg, costs::RealMatrix)
function initseeds_by_costs!(iseeds::IntegerVector, alg::KmppAlg, costs::AbstractMatrix{T}) where T<:AbstractFloat
n = size(costs, 1)
k = length(iseeds)
check_seeding_args(n, k)
Expand All @@ -145,8 +142,10 @@ function initseeds_by_costs!(iseeds::IntegerVector, alg::KmppAlg, costs::RealMat
return iseeds
end

kmpp(X::RealMatrix, k::Int) = initseeds(KmppAlg(), X, k)
kmpp_by_costs(costs::RealMatrix, k::Int) = initseeds(KmppAlg(), costs, k)
kmpp(X::AbstractMatrix{T}, k::Int) where T<:AbstractFloat =
initseeds(KmppAlg(), X, k)
kmpp_by_costs(costs::AbstractMatrix{T}, k::Int) where T<:AbstractFloat =
initseeds(KmppAlg(), costs, k)


# K-medoids initialization based on centrality
Expand All @@ -158,7 +157,7 @@ kmpp_by_costs(costs::RealMatrix, k::Int) = initseeds(KmppAlg(), costs, k)

mutable struct KmCentralityAlg <: SeedingAlgorithm end

function initseeds_by_costs!(iseeds::IntegerVector, alg::KmCentralityAlg, costs::RealMatrix)
function initseeds_by_costs!(iseeds::IntegerVector, alg::KmCentralityAlg, costs::AbstractMatrix{T}) where T<:AbstractFloat
n = size(costs, 1)
k = length(iseeds)
k <= n || error("Attempted to select more seeds than samples.")
Expand All @@ -183,8 +182,8 @@ function initseeds_by_costs!(iseeds::IntegerVector, alg::KmCentralityAlg, costs:
return iseeds
end

initseeds!(iseeds::IntegerVector, alg::KmCentralityAlg, X::RealMatrix, metric::PreMetric) =
initseeds!(iseeds::IntegerVector, alg::KmCentralityAlg, X::AbstractMatrix{T}, metric::PreMetric) where T<:AbstractFloat =
initseeds_by_costs!(iseeds, alg, pairwise(metric, X))

initseeds!(iseeds::IntegerVector, alg::KmCentralityAlg, X::RealMatrix) =
initseeds!(iseeds::IntegerVector, alg::KmCentralityAlg, X::AbstractMatrix{T}) where T<:AbstractFloat =
initseeds!(iseeds, alg, X, SqEuclidean())
50 changes: 50 additions & 0 deletions test/kmeans.jl
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ n = 1000
k = 10

x = rand(m, n)
xt = copy(transpose(x))

@testset "non-weighted" begin
r = kmeans(x, k; maxiter=50)
Expand All @@ -42,6 +43,19 @@ x = rand(m, n)
@test sum(r.costs) ≈ r.totalcost
end

@testset "non-weighted^T" begin
r = kmeans(xt', k; maxiter=50)
@test isa(r, KmeansResult{Float64})
tlienart marked this conversation as resolved.
Show resolved Hide resolved
@test size(r.centers) == (m, k)
@test length(r.assignments) == n
@test all(a -> 1 <= a <= k, r.assignments)
@test length(r.costs) == n
@test length(r.counts) == k
@test sum(r.counts) == n
@test r.cweights == map(Float64, r.counts)
@test sum(r.costs) ≈ r.totalcost
end

@testset "non-weighted (float32)" begin
r = kmeans(map(Float32, x), k; maxiter=50)
@test isa(r, KmeansResult{Float32})
Expand Down Expand Up @@ -74,6 +88,25 @@ end
@test dot(r.costs, w) ≈ r.totalcost
end

@testset "weighted^T" begin
w = rand(n)
r = kmeans(xt', k; maxiter=50, weights=w)
@test isa(r, KmeansResult{Float64})
@test size(r.centers) == (m, k)
@test length(r.assignments) == n
@test all(a -> 1 <= a <= k, r.assignments)
@test length(r.costs) == n
@test length(r.counts) == k
@test sum(r.counts) == n

cw = zeros(k)
for i = 1:n
cw[r.assignments[i]] += w[i]
end
@test r.cweights ≈ cw
@test dot(r.costs, w) ≈ r.totalcost
end

@testset "custom distance" begin
r = kmeans(x, k; maxiter=50, init=:kmcen, distance=MySqEuclidean())
r2 = kmeans(x, k; maxiter=50, init=:kmcen)
Expand All @@ -91,4 +124,21 @@ end
end
end

@testset "custom distance^T" begin
r = kmeans(xt', k; maxiter=50, init=:kmcen, distance=MySqEuclidean())
r2 = kmeans(xt', k; maxiter=50, init=:kmcen)
@test isa(r, KmeansResult{Float64})
@test size(r.centers) == (m, k)
@test length(r.assignments) == n
@test all(a -> 1 <= a <= k, r.assignments)
@test length(r.costs) == n
@test length(r.counts) == k
@test sum(r.counts) == n
@test r.cweights == map(Float64, r.counts)
@test sum(r.costs) ≈ r.totalcost
for fn in fieldnames(typeof(r))
@test getfield(r, fn) == getfield(r2, fn)
end
end

end
51 changes: 50 additions & 1 deletion test/seeding.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ Random.seed!(34568)

alldistinct(x::Vector{Int}) = (length(Set(x)) == length(x))

function min_interdist(X::Matrix)
function min_interdist(X::AbstractMatrix)
dists = pairwise(SqEuclidean(), X)
n = size(X, 2)
r = Inf
Expand All @@ -31,6 +31,9 @@ k = 5
X = rand(d, n)
C = pairwise(SqEuclidean(), X)

Xt = copy(transpose(X))
Ct = copy(transpose(C))

md0 = min_interdist(X)

@testset "RandSeed" begin
Expand All @@ -47,6 +50,20 @@ md0 = min_interdist(X)
@test R == X[:, iseeds]
end

@testset "RandSeed^T" begin
iseeds = initseeds(RandSeedAlg(), Xt', k)
@test length(iseeds) == k
@test alldistinct(iseeds)

iseeds = initseeds_by_costs(RandSeedAlg(), Ct', k)
@test length(iseeds) == k
@test alldistinct(iseeds)

R = copyseeds(Xt', iseeds)
@test isa(R, Matrix{Float64})
@test R == (Xt')[:, iseeds]
end

@testset "Kmpp" begin
iseeds = initseeds(KmppAlg(), X, k)
@test length(iseeds) == k
Expand All @@ -67,6 +84,26 @@ end
@test alldistinct(iseeds)
end

@testset "Kmpp^T" begin
iseeds = initseeds(KmppAlg(), Xt', k)
@test length(iseeds) == k
@test alldistinct(iseeds)

iseeds = initseeds_by_costs(KmppAlg(), Ct', k)
@test length(iseeds) == k
@test alldistinct(iseeds)

@test min_interdist((Xt')[:, iseeds]) > 20 * md0

iseeds = kmpp(Xt', k)
@test length(iseeds) == k
@test alldistinct(iseeds)

iseeds = kmpp_by_costs(Ct', k)
@test length(iseeds) == k
@test alldistinct(iseeds)
end

@testset "Kmcentrality" begin
iseeds = initseeds(KmCentralityAlg(), X, k)
@test length(iseeds) == k
Expand All @@ -79,4 +116,16 @@ end
@test min_interdist(X[:, iseeds]) > 2 * md0
end

@testset "Kmcentrality^T" begin
iseeds = initseeds(KmCentralityAlg(), Xt', k)
@test length(iseeds) == k
@test alldistinct(iseeds)

iseeds = initseeds_by_costs(KmCentralityAlg(), Ct', k)
@test length(iseeds) == k
@test alldistinct(iseeds)

@test min_interdist((Xt')[:, iseeds]) > 2 * md0
end

end