diff --git a/Project.toml b/Project.toml index 9ef8448b..ec189ee3 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "Clustering" uuid = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" -version = "0.15.3" +version = "0.15.4" [deps] Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" @@ -13,7 +13,7 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" [compat] -Distances = "0.8, 0.9, 0.10" +Distances = "0.10.9" NearestNeighbors = "0.4" StatsBase = "0.25, 0.26, 0.27, 0.28, 0.29, 0.30, 0.31, 0.32, 0.33, 0.34" julia = "1" diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 91fbcded..a707beda 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -29,3 +29,22 @@ SUITE["cutree"] = BenchmarkGroup() for (n, k) in ((10, 3), (100, 10), (1000, 100), (10000, 1000)) SUITE["cutree"][(n,k)] = @benchmarkable cutree(hclu, k=$k) setup=(D=random_distance_matrix($n, 5); hclu=hclust(D, linkage=:single)) end + +function silhouette_benchmark(metric, assgns, points, nclusters) + res = BenchmarkGroup() + res[:distances] = @benchmarkable silhouettes($assgns, pairwise($metric, $points, $points, dims=2)) + res[:points] = @benchmarkable silhouettes($assgns, $points; metric=$metric) + return res +end + +SUITE["silhouette"] = BenchmarkGroup() +for metric in [SqEuclidean(), Euclidean()] + SUITE["silhouette"]["metric=$(typeof(metric))"] = metric_bench = BenchmarkGroup() + for n in [100, 1000, 10000, 20000] + nclusters = 10 + dims = 10 + points = randn(dims, n) + assgns = rand(1:nclusters, n) + metric_bench["n=$n"] = silhouette_benchmark(metric, assgns, points, nclusters) + end +end \ No newline at end of file diff --git a/docs/source/fuzzycmeans.md b/docs/source/fuzzycmeans.md index f75ef27e..67515e3b 100644 --- a/docs/source/fuzzycmeans.md +++ b/docs/source/fuzzycmeans.md @@ -19,7 +19,7 @@ and ``m > 1`` is a user-defined fuzziness parameter. ```@docs fuzzy_cmeans FuzzyCMeansResult -wcounts(::FuzzyCMeansResult) +wcounts ``` ## Examples diff --git a/src/Clustering.jl b/src/Clustering.jl index 0a03429a..1c397088 100644 --- a/src/Clustering.jl +++ b/src/Clustering.jl @@ -88,6 +88,8 @@ module Clustering include("counts.jl") + include("cluster_distances.jl") + include("silhouette.jl") include("clustering_quality.jl") diff --git a/src/affprop.jl b/src/affprop.jl index e8ea51d3..113b46a2 100644 --- a/src/affprop.jl +++ b/src/affprop.jl @@ -118,13 +118,17 @@ function _affinityprop(S::AbstractMatrix{T}, # extract exemplars and assignments exemplars = _afp_extract_exemplars(A, R) + if isempty(exemplars) + @show A R + end + @assert !isempty(exemplars) assignments, counts = _afp_get_assignments(S, exemplars) if displevel >= 1 if converged - println("Affinity propagation converged with $t iterations: $(length(exemplars)) exemplars.") + @info "Affinity propagation converged with $t iterations: $(length(exemplars)) exemplars." else - println("Affinity propagation terminated without convergence after $t iterations: $(length(exemplars)) exemplars.") + @warn "Affinity propagation terminated without convergence after $t iterations: $(length(exemplars)) exemplars." end end @@ -250,7 +254,6 @@ function _afp_get_assignments(S::AbstractMatrix, exemplars::Vector{Int}) k = length(exemplars) Se = S[:, exemplars] a = Vector{Int}(undef, n) - cnts = zeros(Int, k) for i = 1:n p = 1 v = Se[i,1] @@ -263,11 +266,10 @@ function _afp_get_assignments(S::AbstractMatrix, exemplars::Vector{Int}) end a[i] = p end - for i = 1:k - a[exemplars[i]] = i - end - for i = 1:n - @inbounds cnts[a[i]] += 1 + a[exemplars] = eachindex(exemplars) + cnts = zeros(Int, k) + for aa in a + @inbounds cnts[aa] += 1 end return (a, cnts) end diff --git a/src/cluster_distances.jl b/src/cluster_distances.jl new file mode 100644 index 00000000..13ad712c --- /dev/null +++ b/src/cluster_distances.jl @@ -0,0 +1,226 @@ +#=== +Base type for efficient computation of average(mean) distances +from the cluster centers to a given point. + +The descendant types should implement the following methods: + * `update!(dists, assignments, points)`: update the internal + state of `dists` with point coordinates and their assignments to the clusters + * `sumdistances(dists, points, indices)`: compute the sum of + distances from all `dists` clusters to `points` +===# +abstract type ClusterDistances{T} end + +# create empty ClusterDistances object for a given metric +# and update it with a given clustering +# if batch_size is specified, the updates are done in point batches of given size +function ClusterDistances(metric::SemiMetric, + assignments::AbstractVector{<:Integer}, + points::AbstractMatrix{<:Real}, + batch_size::Union{Integer, Nothing} = nothing) + update!(ClusterDistances(eltype(points), metric, length(assignments), size(points, 1), + maximum(assignments)), + assignments, points, batch_size) +end + +ClusterDistances(metric, R::ClusteringResult, args...) = + ClusterDistances(metric, assignments(R), args...) + +# fallback implementations of ClusteringDistances methods + +cluster_sizes(dists::ClusterDistances) = dists.cluster_sizes +nclusters(dists::ClusterDistances) = length(cluster_sizes(dists)) + +update!(dists::ClusterDistances, + assignments::AbstractVector, points::AbstractMatrix) = + error("update!(dists::$(typeof(dists))) not implemented") + +sumdistances(dists::ClusterDistances, + points::Union{AbstractMatrix, Nothing}, + indices::Union{AbstractVector{<:Integer}, Nothing}) = + error("sumdistances(dists::$(typeof(dists))) not implemented") + +# average distances from each cluster to each point, nclusters×n matrix +function meandistances(dists::ClusterDistances, + assignments::AbstractVector{<:Integer}, + points::Union{AbstractMatrix, Nothing}, + indices::AbstractVector{<:Integer}) + @assert length(assignments) == length(indices) + (points === nothing) || @assert(size(points, 2) == length(indices)) + clu_to_pt = sumdistances(dists, points, indices) + clu_sizes = cluster_sizes(dists) + @assert length(assignments) == length(indices) + @assert size(clu_to_pt) == (length(clu_sizes), length(assignments)) + + # normalize distances by cluster sizes + @inbounds for j in eachindex(assignments) + for (i, c) in enumerate(clu_sizes) + if i == assignments[j] + c -= 1 + end + if c == 0 + clu_to_pt[i,j] = 0 + else + clu_to_pt[i,j] /= c + end + end + end + return clu_to_pt +end + +# wrapper for ClusteringResult +update!(dists::ClusterDistances, R::ClusteringResult, args...) = + update!(dists, assignments(R), args...) + +# batch-update silhouette dists (splitting the points into chunks of batch_size size) +function update!(dists::ClusterDistances, + assignments::AbstractVector{<:Integer}, points::AbstractMatrix{<:Real}, + batch_size::Union{Integer, Nothing}) + n = size(points, 2) + ((batch_size === nothing) || (n <= batch_size)) && + return update!(dists, assignments, points) + + for batch_start in 1:batch_size:n + batch_ixs = batch_start:min(batch_start + batch_size - 1, n) + update!(dists, view(assignments, batch_ixs), view(points, :, batch_ixs)) + end + return dists +end + +# generic ClusterDistances implementation for an arbitrary metric M +# if M is Nothing, point_dists is an arbitrary matrix of point distances +struct SimpleClusterDistances{M, T} <: ClusterDistances{T} + metric::M + cluster_sizes::Vector{Int} + assignments::Vector{Int} + point_dists::Matrix{T} + + SimpleClusterDistances(::Type{T}, metric::M, + npoints::Integer, nclusters::Integer) where {M<:Union{SemiMetric, Nothing}, T<:Real} = + new{M, T}(metric, zeros(Int, nclusters), Vector{Int}(), + Matrix{T}(undef, npoints, npoints)) + + # reuse given points matrix + function SimpleClusterDistances( + metric::Nothing, + assignments::AbstractVector{<:Integer}, + point_dists::AbstractMatrix{T} + ) where T<:Real + n = length(assignments) + size(point_dists) == (n, n) || throw(DimensionMismatch("assignments length ($n) does not match distances matrix size ($(size(point_dists)))")) + issymmetric(point_dists) || throw(ArgumentError("point distances matrix must be symmetric")) + clu_sizes = zeros(Int, maximum(assignments)) + @inbounds for cluster in assignments + clu_sizes[cluster] += 1 + end + new{Nothing, T}(metric, clu_sizes, assignments, point_dists) + end +end + +# fallback ClusterDistances constructor +ClusterDistances(::Type{T}, metric::Union{SemiMetric, Nothing}, + npoints::Union{Integer, Nothing}, dims::Integer, nclusters::Integer) where T<:Real = + SimpleClusterDistances(T, metric, npoints, nclusters) + +# when metric is nothing, points is the matrix of distances +function ClusterDistances(metric::Nothing, + assignments::AbstractVector{<:Integer}, + points::AbstractMatrix, + batch_size::Union{Integer, Nothing} = nothing) + (batch_size === nothing) || (size(points, 2) > batch_size) || + error("batch-updates of distance matrix-based ClusterDistances not supported") + SimpleClusterDistances(metric, assignments, points) +end + +function update!(dists::SimpleClusterDistances{M}, + assignments::AbstractVector{<:Integer}, + points::AbstractMatrix{<:Real}) where M + @assert length(assignments) == size(points, 2) + check_assignments(assignments, nclusters(dists)) + append!(dists.assignments, assignments) + n = size(dists.point_dists, 1) + length(dists.assignments) == n || + error("$(typeof(dists)) does not support batch updates: $(length(assignments)) points given, $n expected") + @inbounds for cluster in assignments + dists.cluster_sizes[cluster] += 1 + end + + if M === Nothing + size(point_dists) == (n, n) || + throw(DimensionMismatch("points should be a point-to-point distances matrix of ($n, $n) size, $(size(points)) given")) + copy!(dists.point_dists, point_dists) + else + # metric-based SimpleClusterDistances does not support batched updates + size(points, 2) == n || + throw(DimensionMismatch("points should be a point coordinates matrix with $n columns, $(size(points, 2)) found")) + pairwise!(dists.metric, dists.point_dists, points, dims=2) + end + + return dists +end + +# this function returns matrix r nclusters x n, such that +# r[i, j] is the sum of distances from all i-th cluster points to the point indices[j] +function sumdistances(dists::SimpleClusterDistances, + points::Union{AbstractMatrix, Nothing}, # unused as distances are already in point_dists + indices::AbstractVector{<:Integer}) + T = eltype(dists.point_dists) + n = length(dists.assignments) + S = typeof((one(T)+one(T))/2) + r = zeros(S, nclusters(dists), n) + @inbounds for (jj, j) in enumerate(indices) + for i = 1:j-1 + r[dists.assignments[i], jj] += dists.point_dists[i,j] + end + for i = j+1:n + r[dists.assignments[i], jj] += dists.point_dists[i,j] + end + end + return r +end + +# uses the method from "Distributed Silhouette Algorithm: Evaluating Clustering on Big Data" +# https://arxiv.org/abs/2303.14102 +# for SqEuclidean point distances +struct SqEuclideanClusterDistances{T} <: ClusterDistances{T} + cluster_sizes::Vector{Int} # [nclusters] + Y::Matrix{T} # [dims, nclusters], the first moments of each cluster (sum of point coords) + Ψ::Vector{T} # [nclusters], the second moments of each cluster (sum of point coord squares) + + SqEuclideanClusterDistances(::Type{T}, npoints::Union{Integer, Nothing}, dims::Integer, + nclusters::Integer) where T<:Real = + new{T}(zeros(Int, nclusters), zeros(T, dims, nclusters), zeros(T, nclusters)) +end + +ClusterDistances(::Type{T}, metric::SqEuclidean, npoints::Union{Integer, Nothing}, + dims::Integer, nclusters::Integer) where T<:Real = + SqEuclideanClusterDistances(T, npoints, dims, nclusters) + +function update!(dists::SqEuclideanClusterDistances, + assignments::AbstractVector{<:Integer}, + points::AbstractMatrix{<:Real}) + # x dims are [D,N] + d, n = size(points) + k = length(cluster_sizes(dists)) + check_assignments(assignments, k) + n == length(assignments) || throw(DimensionMismatch("points count ($n) does not match assignments length $(length(assignments)))")) + d == size(dists.Y, 1) || throw(DimensionMismatch("points dims ($(size(points, 1))) do no must match ClusterDistances dims ($(size(dists.Y, 1)))")) + # precompute moments and update counts + @inbounds for (i, cluster) in enumerate(assignments) + point = view(points, :, i) # switch to eachslice() once Julia-1.0 support is dropped + dists.cluster_sizes[cluster] += 1 + dists.Y[:, cluster] .+= point + dists.Ψ[cluster] += sum(abs2, point) + end + return dists +end + +# sum distances from each cluster to each point in `points`, [nclusters, n] +function sumdistances(dists::SqEuclideanClusterDistances, + points::AbstractMatrix, + indices::AbstractVector{<:Integer}) + @assert size(points, 2) == length(indices) + point_norms = sum(abs2, points; dims=1) # [1,n] + return dists.cluster_sizes .* point_norms .+ + reshape(dists.Ψ, nclusters(dists), 1) .- + 2 * (transpose(dists.Y) * points) +end diff --git a/src/hclust.jl b/src/hclust.jl index f67d3cce..0da64b7e 100644 --- a/src/hclust.jl +++ b/src/hclust.jl @@ -218,7 +218,7 @@ function hclust_n3(d::AbstractMatrix, linkage::Function) return htre.merges end -""" +#=== ReducibleMetric{T<:Real} Base type for _reducible_ Lance–Williams cluster metrics. @@ -235,7 +235,7 @@ d(A∪B, C) > ρ If the cluster metrics belongs to Lance-Williams family, there is an efficient formula that defines `d(A∪B, C)` using `d(A, C)`, `d(B, C)` and `d(A, B)`. -""" +===# abstract type ReducibleMetric{T <: Real} end # due to reducibility, new_dki=d[k,i∪j] distance should not be less than @@ -243,12 +243,12 @@ abstract type ReducibleMetric{T <: Real} end # arithmetic errors in Lance-Williams formula @inline clamp_reducible_metric(new_dki, dki, dkj) = max(new_dki, min(dki, dkj)) -""" +#=== MinimalDistance <: ReducibleMetric Distance between the clusters is the minimal distance between any pair of their points. -""" +===# struct MinimalDistance{T} <: ReducibleMetric{T} MinimalDistance(d::AbstractMatrix{T}) where T<:Real = new{T}() end @@ -261,13 +261,13 @@ end ) where T = (d[k, i] > d_kj) && (d[k, i] = d_kj) -""" +#=== WardDistance <: ReducibleMetric Ward distance between the two clusters `A` and `B` is the amount by which merging the two clusters into a single larger cluster `A∪B` would increase the average squared distance of a point to its cluster centroid. -""" +===# struct WardDistance{T} <: ReducibleMetric{T} WardDistance(d::AbstractMatrix{T}) where T<:Real = new{typeof(one(T)/2)}() end @@ -283,11 +283,11 @@ end d_ki, d_kj) end -""" +#=== AverageDistance <: ReducibleMetric Average distance between a pair of points from each clusters. -""" +===# struct AverageDistance{T} <: ReducibleMetric{T} AverageDistance(d::AbstractMatrix{T}) where T<:Real = new{typeof(one(T)/2)}() end @@ -303,11 +303,11 @@ end d[k, i] = clamp_reducible_metric((ni * d_ki + nj * d_kj) / nij, d_ki, d_kj) end -""" +#=== MaximumDistance <: ReducibleMetric Maximum distance between a pair of points from each clusters. -""" +===# struct MaximumDistance{T} <: ReducibleMetric{T} MaximumDistance(d::AbstractMatrix{T}) where T<:Real = new{T}() end diff --git a/src/kmeans.jl b/src/kmeans.jl index 3f96403f..e3720e83 100644 --- a/src/kmeans.jl +++ b/src/kmeans.jl @@ -169,11 +169,11 @@ function _kmeans!(X::AbstractMatrix{<:Real}, # in: data matrix (d end if t == 1 || num_affected > 0.75 * k - pairwise!(dmat, distance, centers, X, dims=2) + pairwise!(distance, dmat, centers, X, dims=2) else # if only a small subset is affected, only compute for that subset affected_inds = findall(to_update) - pairwise!(view(dmat, affected_inds, :), distance, + pairwise!(distance, view(dmat, affected_inds, :), view(centers, :, affected_inds), X, dims=2) end @@ -387,7 +387,7 @@ function repick_unused_centers(X::AbstractMatrix{<:Real}, # in: the data matrix tcosts[j] = 0 v = view(X, :, j) centers[:, i] = v - colwise!(ds, distance, v, X) + colwise!(distance, ds, v, X) tcosts = min(tcosts, ds) end end diff --git a/src/seeding.jl b/src/seeding.jl index 8a82794d..4e0cb3f3 100644 --- a/src/seeding.jl +++ b/src/seeding.jl @@ -181,7 +181,7 @@ function initseeds!(iseeds::AbstractVector{<:Integer}, alg::KmppAlg, # update mincosts c = view(X, :, p) - colwise!(tmpcosts, metric, X, view(X, :, p)) + colwise!(metric, tmpcosts, X, view(X, :, p)) updatemin!(mincosts, tmpcosts) mincosts[p] = 0 end diff --git a/src/silhouette.jl b/src/silhouette.jl index fc9622a2..61600e62 100644 --- a/src/silhouette.jl +++ b/src/silhouette.jl @@ -1,90 +1,26 @@ # Silhouette -# this function returns r of size (k, n), such that -# r[i, j] is the sum of distances of all points from cluster i to point j -# -function sil_aggregate_dists(k::Int, a::AbstractVector{Int}, dists::AbstractMatrix{T}) where T<:Real - n = length(a) - S = typeof((one(T)+one(T))/2) - r = zeros(S, k, n) - @inbounds for j = 1:n - for i = 1:j-1 - r[a[i],j] += dists[i,j] - end - for i = j+1:n - r[a[i],j] += dists[i,j] - end - end - return r -end - - -""" - silhouettes(assignments::AbstractVector, [counts,] dists) -> Vector{Float64} - silhouettes(clustering::ClusteringResult, dists) -> Vector{Float64} - -Compute *silhouette* values for individual points w.r.t. given clustering. - -Returns the ``n``-length vector of silhouette values for each individual point. - -# Arguments - - `assignments::AbstractVector{Int}`: the vector of point assignments - (cluster indices) - - `counts::AbstractVector{Int}`: the optional vector of cluster sizes (how many - points assigned to each cluster; should match `assignments`) - - `clustering::ClusteringResult`: the output of some clustering method - - `dists::AbstractMatrix`: ``n×n`` matrix of pairwise distances between - the points - -# References -> Peter J. Rousseeuw (1987). *Silhouettes: a Graphical Aid to the -> Interpretation and Validation of Cluster Analysis*. Computational and -> Applied Mathematics. 20: 53–65. -""" -function silhouettes(assignments::AbstractVector{<:Integer}, - counts::AbstractVector{<:Integer}, - dists::AbstractMatrix{T}) where T<:Real - +# compute silhouette scores for each point given a matrix of cluster-to-point distances +function silhouettes_scores(clu_to_pt::AbstractMatrix{<:Real}, + assignments::AbstractVector{<:Integer}, + clu_sizes::AbstractVector{<:Integer}) n = length(assignments) - k = length(counts) - k >= 2 || throw(ArgumentError("silhouettes() not defined for the degenerated clustering with a single cluster.")) - for j = 1:n - (1 <= assignments[j] <= k) || throw(ArgumentError("Bad assignments[$j]=$(assignments[j]): should be in 1:$k range.")) - end - sum(counts) == n || throw(ArgumentError("Mismatch between assignments ($n) and counts (sum(counts)=$(sum(counts))).")) - size(dists) == (n, n) || throw(DimensionMismatch("The size of a distance matrix ($(size(dists))) doesn't match the length of assignment vector ($n).")) - - # compute average distance from each cluster to each point --> r - r = sil_aggregate_dists(k, assignments, dists) - # from sum to average - @inbounds for j = 1:n - for i = 1:k - c = counts[i] - if i == assignments[j] - c -= 1 - end - if c == 0 - r[i,j] = 0 - else - r[i,j] /= c - end - end - end + @assert size(clu_to_pt) == (length(clu_sizes), n) # compute a and b # a: average distance w.r.t. the assigned cluster # b: the minimum average distance w.r.t. other cluster - a = similar(r, n) - b = similar(r, n) - - for j = 1:n + a = similar(clu_to_pt, n) + b = similar(clu_to_pt, n) + nclusters = length(clu_sizes) + for j in 1:n l = assignments[j] - a[j] = r[l, j] + a[j] = clu_to_pt[l, j] v = typemax(eltype(b)) - @inbounds for i = 1:k - counts[i] == 0 && continue # skip empty clusters - rij = r[i,j] + @inbounds for i = 1:nclusters + clu_sizes[i] == 0 && continue # skip empty clusters + rij = clu_to_pt[i, j] if (i != l) && (rij < v) v = rij end @@ -95,25 +31,80 @@ function silhouettes(assignments::AbstractVector{<:Integer}, # compute silhouette score sil = a # reuse the memory of a for sil for j = 1:n - if counts[assignments[j]] == 1 + if clu_sizes[assignments[j]] == 1 sil[j] = 0 else #If both a[i] and b[i] are equal to 0 or Inf, silhouettes is defined as 0 @inbounds sil[j] = a[j] < b[j] ? 1 - a[j]/b[j] : a[j] > b[j] ? b[j]/a[j] - 1 : - zero(eltype(r)) + zero(eltype(sil)) end end return sil end -silhouettes(R::ClusteringResult, dists::AbstractMatrix) = - silhouettes(assignments(R), counts(R), dists) +# calculate silhouette scores (single batch) +silhouettes_batch(dists::ClusterDistances, + assignments::AbstractVector{<:Integer}, + points::Union{AbstractMatrix, Nothing}, + indices::AbstractVector{<:Integer}) = + silhouettes_scores(meandistances(dists, assignments, points, indices), + assignments, cluster_sizes(dists)) + +# batch-calculate silhouette scores (splitting the points into chunks of batch_size size) +function silhouettes(dists::ClusterDistances, + assignments::AbstractVector{<:Integer}, + points::Union{AbstractMatrix, Nothing}, + batch_size::Union{Integer, Nothing} = nothing) + n = length(assignments) + ((batch_size === nothing) || (n <= batch_size)) && + return silhouettes_batch(dists, assignments, points, eachindex(assignments)) -function silhouettes(assignments::AbstractVector{<:Integer}, dists::AbstractMatrix) - counts = fill(0, maximum(assignments)) - for a in assignments - counts[a] += 1 + return mapreduce(vcat, 1:batch_size:n) do batch_start + batch_ixs = batch_start:min(batch_start + batch_size - 1, n) + # copy points/assignments to speed up matrix and indexing operations + silhouettes_batch(dists, assignments[batch_ixs], + points !== nothing ? points[:, batch_ixs] : nothing, + batch_ixs) end - silhouettes(assignments, counts, dists) end + +""" + silhouettes(assignments::Union{AbstractVector, ClusteringResult}, point_dists::Matrix) -> Vector{Float64} + silhouettes(assignments::Union{AbstractVector, ClusteringResult}, points::Matrix; + metric::SemiMetric, [batch_size::Integer]) -> Vector{Float64} + +Compute *silhouette* values for individual points w.r.t. given clustering. + +Returns the ``n``-length vector of silhouette values for each individual point. + +# Arguments + - `assignments::Union{AbstractVector{Int}, ClusteringResult}`: the vector of point assignments + (cluster indices) + - `points::AbstractMatrix`: if metric is nothing it is an ``n×n`` matrix of pairwise distances between the points, + otherwise it is an ``d×n`` matrix of `d` dimensional clustered data points. + - `metric::Union{SemiMetric, Nothing}`: an instance of Distances Metric object or nothing, + indicating the distance metric used for calculating point distances. + - `batch_size::Union{Integer, Nothing}`: if integer is given, calculate silhouettes in batches + of `batch_size` points each, throws `DimensionMismatch` if batched calculation + is not supported by given `metric`. + +# References +> Peter J. Rousseeuw (1987). *Silhouettes: a Graphical Aid to the +> Interpretation and Validation of Cluster Analysis*. Computational and +> Applied Mathematics. 20: 53–65. +> Marco Gaido (2023). Distributed Silhouette Algorithm: Evaluating Clustering on Big Data +""" +function silhouettes(assignments::AbstractVector{<:Integer}, + points::AbstractMatrix; + metric::Union{SemiMetric, Nothing} = nothing, + batch_size::Union{Integer, Nothing} = nothing) + nclusters = maximum(assignments) + nclusters >= 2 || throw(ArgumentError("silhouettes() not defined for the degenerated clustering with a single cluster.")) + check_assignments(assignments, nclusters) + return silhouettes(ClusterDistances(metric, assignments, points, batch_size), + assignments, points, batch_size) +end + +silhouettes(R::ClusteringResult, points::AbstractMatrix; kwargs...) = + silhouettes(assignments(R), points; kwargs...) diff --git a/src/utils.jl b/src/utils.jl index a01db9dc..f65e4914 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -76,3 +76,10 @@ function updatemin!(r::AbstractArray, x::AbstractArray) end return r end + +function check_assignments(assignments::AbstractVector{<:Integer}, nclusters::Union{Integer, Nothing}) + nclu = nclusters === nothing ? maximum(assignments) : nclusters + for (j, c) in enumerate(assignments) + all(1 <= c <= nclu) || throw(ArgumentError("Bad assignments[$j]=$c: should be in 1:$nclu range.")) + end +end \ No newline at end of file diff --git a/test/affprop.jl b/test/affprop.jl index 7e94027f..d51c3382 100644 --- a/test/affprop.jl +++ b/test/affprop.jl @@ -16,9 +16,11 @@ include("test_helpers.jl") @test_throws ArgumentError affinityprop(randn(2, 2), tol=0.0) @test_throws ArgumentError affinityprop(randn(2, 2), damp=-0.1) @test_throws ArgumentError affinityprop(randn(2, 2), damp=1.0) - @test affinityprop(randn(2, 2), damp=0.5, tol=0.5) isa AffinityPropResult + x = randn(2, 4) + S = -pairwise(Euclidean(), x, x, dims=2) + @test affinityprop(S, damp=0.5, tol=0.5) isa AffinityPropResult for disp in keys(Clustering.DisplayLevels) - @test affinityprop(randn(2, 2), tol=0.1, display=disp) isa AffinityPropResult + @test affinityprop(S, tol=0.1, display=disp) isa AffinityPropResult end end diff --git a/test/kmeans.jl b/test/kmeans.jl index 6c4d25e6..ea69d47e 100644 --- a/test/kmeans.jl +++ b/test/kmeans.jl @@ -11,7 +11,7 @@ rng = StableRNG(42) struct MySqEuclidean <: SemiMetric end # redefinition of Distances.pairwise! for MySqEuclidean type -function Distances.pairwise!(r::AbstractMatrix, dist::MySqEuclidean, +function Distances.pairwise!(dist::MySqEuclidean, r::AbstractMatrix, a::AbstractMatrix, b::AbstractMatrix; dims::Integer=2) dims == 2 || throw(ArgumentError("only dims=2 supported for MySqEuclidean distance")) mul!(r, transpose(a), b) diff --git a/test/silhouette.jl b/test/silhouette.jl index f3724c40..643cf779 100644 --- a/test/silhouette.jl +++ b/test/silhouette.jl @@ -11,29 +11,20 @@ local D = [0 1 2 3 @assert size(D) == (4, 4) -local a = [1, 1, 2, 2] -local c = [2, 2] - @testset "Input checks" begin - @test_skip silhouettes(a, [1, 1, 2], D) # should throw because cluster counts are inconsistent - @test_throws ArgumentError silhouettes(a, [3, 2], D) - @test_throws ArgumentError silhouettes([1, 1, 3, 2], [2, 2], D) - @test_throws DimensionMismatch silhouettes([1, 1, 2, 2, 2], [2, 3], D) + @test_throws DimensionMismatch silhouettes([1, 1, 3, 2], D[1:4, 1:3]) + @test_throws DimensionMismatch silhouettes([1, 1, 2, 2, 2], D) + @test_throws Exception silhouettes([1, 1, 2, 2, 2], D, batch_size=3) + D2 = copy(D) + D2[2, 3] = 4 + @test_throws ArgumentError silhouettes([1, 1, 2, 2], D2) end -@test @inferred(silhouettes(a, c, D)) ≈ [1.5/2.5, 0.5/1.5, 0.5/1.5, 1.5/2.5] -@test @inferred(silhouettes(a, c, convert(Matrix{Float32}, D))) isa AbstractVector{Float32} -@test silhouettes(a, D) == silhouettes(a, c, D) # c is optional - -a = [1, 2, 1, 2] -c = [2, 2] - -@test silhouettes(a, c, D) ≈ [0.0, -0.5, -0.5, 0.0] - -a = [1, 1, 1, 2] -c = [3, 1] +@test @inferred(silhouettes([1, 1, 2, 2], D)) ≈ [1.5/2.5, 0.5/1.5, 0.5/1.5, 1.5/2.5] +@test @inferred(silhouettes([1, 1, 2, 2], convert(Matrix{Float32}, D))) isa AbstractVector{Float32} -@test silhouettes(a, c, D) ≈ [0.5, 0.5, -1/3, 0.0] +@test silhouettes([1, 2, 1, 2], D) ≈ [0.0, -0.5, -0.5, 0.0] +@test silhouettes([1, 1, 1, 2], D) ≈ [0.5, 0.5, -1/3, 0.0] @testset "zero cluster distances correctly" begin a = [fill(1, 5); fill(2, 5)] @@ -42,6 +33,7 @@ c = [3, 1] @test silhouettes(a, d) == fill(0.0, 10) d = fill(1, (10, 10)) + for i in 1:10; d[i, i] = 0; end d[1, 2] = d[2, 1] = 5 @test silhouettes(a, d) == [[-0.5, -0.5]; fill(0.0, 8)] @@ -56,9 +48,36 @@ end end @testset "empty clusters handled correctly (#241)" begin - X = rand(MersenneTwister(123), 10, 5) - pd = pairwise(Euclidean(), X, dims=1) - @test all(>=(-0.5), silhouettes([5, 2, 2, 3, 2, 2, 3, 2, 3, 5], pd)) + X = rand(MersenneTwister(123), 3, 10) + pd = pairwise(Euclidean(), X, dims=2) + asgns = [5, 2, 2, 3, 2, 2, 3, 2, 3, 5] + @test all(>=(-0.5), silhouettes(asgns, pd)) + @test all(>=(-0.5), silhouettes(asgns, X, metric=Euclidean())) +end + +@testset "silhouettes(metric=$metric, batch_size=$(batch_size !== nothing ? batch_size : "nothing"))" for + (metric, batch_size, supported) in [ + (Euclidean(), nothing, true), + (Euclidean(), 1000, true), + (Euclidean(), 10, false), + + (SqEuclidean(), nothing, true), + (SqEuclidean(), 1000, true), + (SqEuclidean(), 10, true), + ] + + Random.seed!(123) + X = rand(3, 100) + pd = pairwise(metric, X, dims=2) + a = rand(1:10, size(X, 2)) + kmeans_clu = kmeans(X, 5) + if supported + @test silhouettes(a, X; metric=metric, batch_size=batch_size) ≈ silhouettes(a, pd) + @test silhouettes(kmeans_clu, X; metric=metric, batch_size=batch_size) ≈ silhouettes(kmeans_clu, pd) + else + @test_throws Exception silhouettes(a, X; metric=metric, batch_size=batch_size) + @test_throws Exception silhouettes(kmeans_clu, X; metric=metric, batch_size=batch_size) + end end end