-
Notifications
You must be signed in to change notification settings - Fork 0
/
for2.R
77 lines (62 loc) · 2.56 KB
/
for2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
setwd("~/uni/statml/project/")
source("setup.R")
library("randomForest")
dbpredgname = db %>%
select(c(imonth, iday, success, suicide, attacktype1_txt,nperps, targtype1_txt,gname, weaptype1_txt,weapsubtype1_txt,nkill, nwound, nkillter))
dbpredgname=dbpredgname[!apply(is.na(dbpredgname), 1, any),]
top.groups.attacks=names(summary(dbpredgname$gname)[2:7])
dbpredgname = dbpredgname %>%filter( gname %in% top.groups.attacks) %>%
mutate(gname=factor(gname))
### !!
for (col in colnames(dbpredgname)) {
v=dbpredgname[,col]
s=summary(dbpredgname[,col])
#if there are to many NAs then convert the variable to a know/unknown factor
if (!is.factor(v)) {
if (mean(is.na(v))>.2) {
dbpredgname[,col]= factor(ifelse(!is.na(v),"known", "unknown"))
}
}
else{
#select all levels the are less than 200 attacks
owfactors=names(summary(dbpredgname[,col])[summary(dbpredgname[,col])<400])
levels(dbpredgname[,col])<- sapply(levels(v),function(x) ifelse(x %in% owfactors,"Other",x))
rm(owfactors)
}
}
min.attacks=min(summary(dbpredgname$gname))
### We need a training and a testing set
# same size for each group and set to avoid fitting for the dominant group (test sample) and wrong mi
# independent
# Problem:attacks very unevenly distributed over the groups
# smallest group determines training sample size
gnprsample <- function() {
dbgntrain<<-dbpredgname %>% filter(F)
dbgntest<<-dbpredgname %>% filter(F)
for (gn in levels(dbpredgname$gname)) {
dbfil=dbpredgname[dbpredgname$gname==gn,]
n=floor(min.attacks/2)
trainidx=sample(1:nrow(dbfil),size=n)
dbgntrain<<-rbind(dbgntrain, dbfil[trainidx,])
dbfil=dbfil[-trainidx,]
testidx=sample(1:nrow(dbfil),size=n)
dbgntest<<-rbind(dbgntest, dbfil[testidx,])
}
rm(gn,dbfil,n,trainidx,testidx)
}
rungnforest <- function() {
gnprsample()
#model <- randomForest(y=dbgntrain$gname,x=dbgntrain,xtest=dbgntest,ytest=dbgntest$gname ,na.action = na.omit)
forest <- randomForest(gname~., data=dbgntrain, ntree=100, nodesize=10, importance=T, na.action = na.omit)
summary(forest)
#rowSums(r$confusion)
plot(forest)
dbgntest==predict(r, dbgntest)
forest.pred=predict(forest, dbgntest)
s2=summary(gntree)
s1=summary(gntree.pruned)
mcm=missclassmat(dbgntest$gname,gntree.pred)
#savedres<-data.frame(method="glm", mcm=I(list(mcm)), mcerror=mcmerror(mcm), parameters=I(list(gntree$call)) , summary1=I(list(s1)), summary2=I(list(s2)) ) %>% filter(F)
savegnres(data.frame(method="tree", mcm=I(list(mcm)), mcerror=mcmerror(mcm), parameters=I(list(gntree$call)) , summary1=I(list(s1)), summary2=I(list(s2)) ))
}
rungnforest()