-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path2_trait_selection.R
executable file
·41 lines (34 loc) · 2.31 KB
/
2_trait_selection.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# This script performs a Boruta trait selection on cleaned table in intermediary_data/1745AJ_Phenotyping_nafixed.csv
# Results are saved unfixed and roughFixed (with or without Tentative decisions)
# The script then performs a leave-one-out cross-validation to see how well the selected traits can predict the treatment class
library('Boruta')
library('randomForest')
set.seed(17022019)
data.nafixed = read.csv("data/intermediary/1745AJ_Phenotyping_nafixed.csv")
# Transform to wide format
data.nafixed = reshape(data.nafixed, idvar='Plant.ID', timevar='DAS', v.names=setdiff(names(data.nafixed), c("Plant.ID", "DAS", "Treatment")), direction='wide', sep="_")
data.nafixed = data.nafixed[-c(1)] # Drop Plant.ID
data.nafixed$Treatment = as.factor(data.nafixed$Treatment)
boruta_result = Boruta(Treatment ~ ., data=data.nafixed, maxRuns=1500, doTrace=2)
trait_importance = attStats(boruta_result)
trait_importance = cbind(trait = rownames(trait_importance), trait_importance)
# Some results from playing with this:
# - order of most important traits varies a bit each time this is run but median trait importance stays quite stable (< 1% difference between runs)
write.csv(trait_importance, "data/results/trait_selection_result.csv", row.names=F)
trait_importance_roughfixed = attStats(TentativeRoughFix(boruta_result))
trait_importance_roughfixed = cbind(trait = rownames(trait_importance_roughfixed), trait_importance_roughfixed)
write.csv(trait_importance_roughfixed, "data/results/trait_selection_result_roughFixed.csv", row.names=F)
# Perform Leave-one-out cross-validation
selected.features = trait_importance_roughfixed[trait_importance_roughfixed$decision == "Confirmed",]$trait
data.nafixed = data.nafixed[,c("Treatment", selected.features)]
write("i,Treatment,Prediction",file="data/results/cv/loocv_traits.csv")
for(i in 1:nrow(data.nafixed)) {
# Split data into training set and test row
data.train = data.nafixed[-i,]
data.train.x = subset(data.train, select = -c(Treatment) )
data.train.y = data.train$Treatment
data.test = data.nafixed[i,]
# Train the model and predict
model = randomForest(data.train.x, data.train.y)
write(paste(i, as.character(data.test$Treatment), as.character(predict(model, newdata = subset(data.test, select = selected.features))), sep=","),file="data/results/cv/loocv_traits.csv",append = T)
}