-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathanalyse_real_dataset.R
139 lines (100 loc) · 5.62 KB
/
analyse_real_dataset.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#### Classification of real dataset preprocessed by processRealData.R
# We compare results with SS, OUR3, L2 and weight L2
library(reticulate)
library(fda)
library(scatterplot3d)
library(igraph)
library(ROCR)
library(caret)
library(vows)
# this may need to be set at the beginning of the session
# Susan's
# use_python('/usr/bin/python',required=TRUE)
# use_python('/usr/local/bin/python3.7',required=TRUE)
# use_python('/Users/suswei/anaconda3/bin/python',required=TRUE)
# Marie's
# use_python('/anaconda3/envs/r-reticulate/bin/python',required=TRUE)
# use_python('/Users/UQAM/anaconda3/bin/python',required=TRUE)
source('robust_isomap.R')
source('pairwise_geo_estimation.R')
source("weight_L2.R")
source("processRealData.R")
# call processRealData which each element is a list containing "discrete_data" , "reg_grid" "true_group"
dataset = processRealData()
meth <- list("NN" = FALSE,"RD_o" = FALSE,"RD" = FALSE,"SS_o" = FALSE,"SS" = TRUE,"pI" = FALSE,"OUR" = FALSE,"OUR2" = FALSE,"OUR3"=TRUE,"RP" = FALSE , "L2"=TRUE, "w_L2"= TRUE)
AUC_full_res<- list()
err_classification<- list()
for(ind_data in 1:length(dataset)){
# estimation of geo distance with methods TRUE in meth
geo_estim = pairwise_geo_estimation(method=meth,
true_data=NA,
discrete_data=t(dataset[[ind_data]]$data),
true_geo=NA,
plot_true=FALSE,
nb_proj=NA,
grid=matrix(rep(dataset[[ind_data]]$reg_grid,length(dataset[[ind_data]]$reg_grid)),nrow=length(dataset[[ind_data]]$reg_grid),byrow = TRUE),
reg_grid=dataset[[ind_data]]$reg_grid,
K_dense =100,
common_grid_true=TRUE,
Analytic_geo_available=FALSE,
is_data_smoothed=TRUE)
# Assess classificaton performance
nr_train_test_splits = 500
# Create balanced train-test partition taking into account the proportion of binary labels in the data
# train.rows[,i] contains the training indices i-th split
train.rows = createDataPartition(dataset[[ind_data]]$true_group, times = nr_train_test_splits, p = 0.53, list = FALSE)
temp<- lapply(geo_estim,mean_auc_err_class,train.rows=train.rows,true_group=dataset[[ind_data]]$true_group,h=2)
temp2<- matrix(unlist(temp,use.names=FALSE),ncol=length(geo_estim))
AUC_res <- temp2[1:nr_train_test_splits,]
boxplot(AUC_res,names=names(geo_estim),main=dataset[[ind_data]]$name,las=2,ylab="AUC")
AUC_full_res[[dataset[[ind_data]]$name]]<-AUC_res
err_class_res<- temp2[(nr_train_test_splits+1):(2*nr_train_test_splits),]
boxplot(err_class_res,names=names(geo_estim),main=dataset[[ind_data]]$name,las=2,ylab="err_class")
err_classification[[dataset[[ind_data]]$name]]<-err_class_res
}
##### Warning hard coding
pdf("classification_results_grotwh_curve1.pdf",width=12,height=5)
par(mfrow=c(2,3))
for(i in 1:6){
boxplot(AUC_full_res[[i]],las=2,ylab="AUC",names=c(substr(names(geo_estim)[1:4], 11, 17),substr(names(geo_estim)[5:6], 7, 10)),main=names(AUC_full_res)[i])
boxplot(err_classification[[i]],las=2,ylab="err_class",names=c(substr(names(geo_estim)[1:4], 11, 17),substr(names(geo_estim)[5:6], 7, 10)),main=names(err_classification)[i])
}
dev.off()
pdf("classification_results_grotwh_curve2.pdf",width=12,height=5)
par(mfrow=c(2,3))
for(i in 7:12){
boxplot(AUC_full_res[[i]],las=2,ylab="AUC",names=c(substr(names(geo_estim)[1:4], 11, 17),substr(names(geo_estim)[5:6], 7, 10)),main=names(AUC_full_res)[i])
boxplot(err_classification[[i]],las=2,ylab="err_class",names=c(substr(names(geo_estim)[1:4], 11, 17),substr(names(geo_estim)[5:6], 7, 10)),main=names(err_classification)[i])
}
dev.off()
pdf("classification_results_tecator.pdf",width=12,height=5)
par(mfrow=c(2,3))
for(i in 13:16){
boxplot(AUC_full_res[[i]],las=2,ylab="AUC",names=c(substr(names(geo_estim)[1:4], 11, 17),substr(names(geo_estim)[5:6], 7, 10)),main=names(AUC_full_res)[i])
boxplot(err_classification[[i]],las=2,ylab="err_class",names=c(substr(names(geo_estim)[1:4], 11, 17),substr(names(geo_estim)[5:6], 7, 10)),main=names(err_classification)[i])
}
dev.off()
mean_auc_err_class <- function(train.rows,pairwise_distance_estimate,true_group,h){
# train.rows created by createDataPartition
# pairwise_distance_matrix samplesize by samplesize estimates of distance
# true_group vector of length samplesize of binary class label
nr_train_test_splits = dim(train.rows)[2]
indices = 1:length(true_group)
group1_indices = indices[true_group]
aucs<-rep(0,nr_train_test_splits)
class_err<-rep(0,nr_train_test_splits)
for(split in 1:nr_train_test_splits){
train_ind = train.rows[,split]
# We classify the data using functional version of the Nadaraya–Watson kernel estimator for the class membership probabilities
deno <- colSums(dnorm(h^(-1)*pairwise_distance_estimate[train_ind,-train_ind]))
class1_count_hat<- colSums(dnorm(h^(-1)*pairwise_distance_estimate[train_ind[is.element(train_ind,group1_indices)],-train_ind]))
class1_prob_hat<- class1_count_hat / deno
predobj <- prediction(class1_prob_hat, true_group[-train_ind]*1)
perf1 <- performance(predobj,"auc")
aucs[split] = [email protected][[1]]
pred_thresh_05<- rep(0,length(true_group)-length(train_ind))
pred_thresh_05[class1_prob_hat>0.5] <- 1
class_err[split] <- sum(abs(pred_thresh_05-true_group[-train_ind]) )/length(pred_thresh_05)
}
return(list("aucs"=aucs,"class_err"=class_err))
}