From 0161f888df2de8c877f906f01c9d9f54955c17d1 Mon Sep 17 00:00:00 2001
From: Or Dinari
Date: Sun, 22 Aug 2021 20:04:40 +0300
Subject: [PATCH] fmeasure

---
 src/Clustering.jl |  6 +++-
 src/fmeasure.jl   | 92 +++++++++++++++++++++++++++++++++++++++++++++++
 test/fmeasure.jl  | 25 +++++++++++++
 test/runtests.jl  |  3 +-
 4 files changed, 124 insertions(+), 2 deletions(-)
 create mode 100644 src/fmeasure.jl
 create mode 100644 test/fmeasure.jl

diff --git a/src/Clustering.jl b/src/Clustering.jl
index 39e42783..a09d538e 100644
--- a/src/Clustering.jl
+++ b/src/Clustering.jl
@@ -64,7 +64,10 @@ module Clustering
     Hclust, hclust, cutree,
 
     # MCL
-    mcl, MCLResult
+    mcl, MCLResult,
+
+    # fmeasure
+    pair_precision, fmeasure, pair_recall
 
     ## source files
 
@@ -84,6 +87,7 @@ module Clustering
     include("varinfo.jl")
     include("vmeasure.jl")
     include("mutualinfo.jl")
+    include("fmeasure.jl")
 
     include("hclust.jl")
 
diff --git a/src/fmeasure.jl b/src/fmeasure.jl
new file mode 100644
index 00000000..9c0d4f0b
--- /dev/null
+++ b/src/fmeasure.jl
@@ -0,0 +1,92 @@
+"""
+    _pair_confusion_matrix(a, b) -> NTuple{4, Int64}
+
+Count how two clusterings agree over all ordered pairs of distinct points
+(every unordered pair is therefore counted twice).
+
+Returns `(tp, fp, fn, tn)`. With `c = counts(a, b)`:
+
+- `tp`: pairs whose two points fall into the same cell of `c`;
+- `fp`: pairs that share a column of `c` but not a row;
+- `fn`: pairs that share a row of `c` but not a column;
+- `tn`: all remaining pairs, which share neither.
+"""
+function _pair_confusion_matrix(a, b)
+    c = counts(a, b)
+    # Take the number of points from the table total, so that no `length`
+    # method is required of `a`.
+    n = sum(c)
+    col_sums = vec(sum(c, dims=1))
+    row_sums = vec(sum(c, dims=2))
+    sum_sq = sum(abs2, c)
+    tp = sum_sq - n
+    fp = sum(c * col_sums) - sum_sq
+    fn = sum(c' * row_sums) - sum_sq
+    tn = n^2 - fp - fn - sum_sq
+    return tp, fp, fn, tn
+end
+
+"""
+    pair_precision(a, b) -> Float64
+
+Compute the pair counting precision between two clusterings of the same data points.
+
+`a` and `b` can be either [`ClusteringResult`](@ref) instances or
+assignments vectors (`AbstractVector{<:Integer}`).
+
+Returns `tp / (tp + fp)` for the pair counts `tp` and `fp`; this is `NaN` when
+`tp + fp == 0`.
+
+# References
+> Pfitzner, Darius, Richard Leibbrandt, and David Powers. (2009).
+> *Characterization and evaluation of similarity measures for pairs of clusterings.*
+> Knowledge and Information Systems: 361-394.
+"""
+function pair_precision(a, b)
+    tp, fp = _pair_confusion_matrix(a, b)
+    return tp / (tp + fp)
+end
+
+"""
+    pair_recall(a, b) -> Float64
+
+Compute the pair counting recall between two clusterings of the same data points.
+
+`a` and `b` can be either [`ClusteringResult`](@ref) instances or
+assignments vectors (`AbstractVector{<:Integer}`).
+
+Returns `tp / (tp + fn)` for the pair counts `tp` and `fn`; this is `NaN` when
+`tp + fn == 0`.
+
+# References
+> Pfitzner, Darius, Richard Leibbrandt, and David Powers. (2009).
+> *Characterization and evaluation of similarity measures for pairs of clusterings.*
+> Knowledge and Information Systems: 361-394.
+"""
+function pair_recall(a, b)
+    tp, _, fn = _pair_confusion_matrix(a, b)
+    return tp / (tp + fn)
+end
+
+"""
+    fmeasure(a, b) -> Float64
+
+Compute the pair counting F-measure between two clusterings of the same data points.
+
+`a` and `b` can be either [`ClusteringResult`](@ref) instances or
+assignments vectors (`AbstractVector{<:Integer}`).
+
+Returns the harmonic mean of the pair counting precision and recall, evaluated as
+`2tp / (2tp + fp + fn)`; this is `NaN` only when `tp`, `fp` and `fn` are all zero.
+
+# References
+> Pfitzner, Darius, Richard Leibbrandt, and David Powers. (2009).
+> *Characterization and evaluation of similarity measures for pairs of clusterings.*
+> Knowledge and Information Systems: 361-394.
+"""
+function fmeasure(a, b)
+    tp, fp, fn = _pair_confusion_matrix(a, b)
+    # Algebraically equal to 2pr / (p + r), but evaluated on the raw counts so that
+    # `tp == 0` with `fp + fn > 0` gives 0.0 instead of 0/0.
+    return 2 * tp / (2 * tp + fp + fn)
+end
diff --git a/test/fmeasure.jl b/test/fmeasure.jl
new file mode 100644
index 00000000..2757261d
--- /dev/null
+++ b/test/fmeasure.jl
@@ -0,0 +1,25 @@
+using Test
+using Clustering
+
+@testset "fmeasure()" begin
+
+    a1 = [1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3]
+    a2 = [1, 1, 1, 1, 1, 2, 3, 3, 1, 2, 2, 2, 2, 2, 3, 3, 3]
+    @test fmeasure(a1, a2) ≈ 0.47 atol=1.0e-2
+    @test pair_precision(a1, a2) ≈ 0.5 atol=1.0e-2
+    @test pair_recall(a1, a2) ≈ 0.45 atol=1.0e-2
+
+    a1 = [1, 1, 1, 1, 1, 3, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 2]
+    a2 = [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4]
+    @test fmeasure(a1, a2) ≈ 0.529 atol=1.0e-2
+    @test pair_precision(a1, a2) ≈ 0.6 atol=1.0e-2
+    @test pair_recall(a1, a2) ≈ 0.47 atol=1.0e-2
+
+    # No pair is grouped together by both clusterings: all three scores are zero.
+    a1 = [1, 1, 2, 2]
+    a2 = [1, 2, 1, 2]
+    @test fmeasure(a1, a2) == 0.0
+    @test pair_precision(a1, a2) == 0.0
+    @test pair_recall(a1, a2) == 0.0
+
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 1f9d483a..ab7e35a6 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -19,7 +19,8 @@ tests = ["seeding",
          "hclust",
          "mcl",
          "vmeasure",
-         "mutualinfo"]
+         "mutualinfo",
+         "fmeasure"]
 
 println("Runing tests:")
 for t in tests