-
Notifications
You must be signed in to change notification settings - Fork 2
/
evalcluster.R
67 lines (60 loc) · 2.09 KB
/
evalcluster.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#' evalcluster
#'
#' Three criteria (NMI, RI, ARI) to evaluate clustering performance.
#' @param truelabel A numeric vector of true labels of each sample.
#' @param predlabel A numeric vector of predicted labels of each sample.
#' @return NMI Value of normalized mutual information.
#' @return RI Value of rand index.
#' @return ARI Value of adjusted rand index.
#' @keywords clustering, validation
#' @export
#' @author Validation methods implemented by Yunpei Xu, [email protected], Central South University.
#' @examples
#' truelabel = sample(1:3, size=10, replace=TRUE)
#' predlabel = sample(1:3, size=10, replace=TRUE)
#' evalcluster(truelabel,predlabel)
evalcluster<-function(truelabel,predlabel){
if(length(truelabel)!=length(predlabel))
stop("truelabel and predlabel must have the same length")
total = length(truelabel)
x_ids = unique(truelabel)
y_ids = unique(predlabel)
#Mutual information
MI = 0.0
for(idx in x_ids){
for(idy in y_ids){
idxOccur = which(truelabel==idx)
idyOccur = which(predlabel==idy)
idxyOccur = intersect(idxOccur,idyOccur)
if(length(idxyOccur)>0){
MI = MI + (length(idxyOccur)/total)*log2((length(idxyOccur)*total)/(length(idxOccur)*length(idyOccur)));
}
}
}
#Normalized Mutual information
Hx = 0; #Entropies
for(idx in x_ids){
idxOccurCount = length(which(truelabel==idx));
Hx = Hx - (idxOccurCount/total) * log2(idxOccurCount/total);
}
Hy = 0;#Entropies
for(idy in y_ids){
idyOccurCount = length(which(predlabel==idy));
Hy = Hy - (idyOccurCount/total) * log2(idyOccurCount/total);
}
nmi = 2 * MI / (Hx+Hy)
#(adjusted) Rand Index
tab = table(truelabel,predlabel)
conv_df = as.data.frame.matrix(tab)
n <- sum(tab)
ni <- apply(tab, 1, sum)
nj <- apply(tab, 2, sum)
n2 <- choose(n, 2)
nis2 <- sum(choose(ni[ni > 1], 2))
njs2 <- sum(choose(nj[nj > 1], 2))
ri = 1 + (sum(tab^2) - (sum(ni^2) + sum(nj^2))/2)/n2
ari=c(sum(choose(tab[tab > 1], 2)) - (nis2 * njs2)/n2)/((nis2 + njs2)/2 - (nis2 * njs2)/n2)
out = c(nmi,ri,ari)
names(out)=c("NMI","RI","ARI")
return(out)
}