forked from yasminlucero/Kaggle
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathd1.R
39 lines (31 loc) · 1.1 KB
/
d1.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
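## d1.R
## Data processing script for the d1 data frame.
## Expects d1 to already exist in the workspace when this script runs.
## Adds a census Division column, recodes smoking status into a logical
## Smoker column, derives a top-coded age column, and writes Data/d1.csv.
## August 25 2012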
library(car)
# Create the state division variate.
#
# state.codes is a lookup table (state abbreviation -> census division) built
# from the parallel vectors state.abb / state.division in the datasets
# package. DC and PR are not in those vectors, so they are appended here and
# assigned to "South Atlantic".
# NOTE(review): placing PR in South Atlantic is this script's own choice --
# confirm it suits the analysis.
library(datasets)
state.codes <- data.frame(
  state.abb = c(state.abb, "DC", "PR"),
  division = c(
    as.character(state.division),
    "South Atlantic",
    "South Atlantic"
  ),
  stringsAsFactors = FALSE
)

# Look up each row's division by its State value. The previous loop indexed
# the lookup table by the row number of d1 rather than by the matching state,
# so divisions were assigned by row position. match() performs the intended
# lookup for all rows at once and also copes with an empty d1.
division_row <- match(as.character(d1$State), state.codes$state.abb)
d1$Division <- state.codes$division[division_row]

# States missing from the lookup table end up with Division = NA; report them
# instead of letting them pass silently.
unmatched_states <- unique(as.character(d1$State)[is.na(division_row)])
unmatched_states <- unmatched_states[!is.na(unmatched_states)]
if (length(unmatched_states) > 0) {
  warning(
    "No division found for State value(s): ",
    paste(unmatched_states, collapse = ", "),
    call. = FALSE
  )
}
# Create the smoker binary variate from the NIST smoking-status code.
# The recode specification maps codes 5 and 9 to 'NA', codes 0, 3 and 4 to
# 'FALSE', and codes 1 and 2 to 'TRUE'. The targets are quoted text, so
# as.logical() is what turns them into TRUE / FALSE / NA values.
smoker_recode_spec <- "c(5,9)='NA'; c(0,3,4)='FALSE'; c(1,2)='TRUE'"
tmp <- recode(d1$SmokingStatus_NISTCode, smoker_recode_spec)
d1$Smoker <- as.logical(tmp)

# Summaries by Gender and Division: the mean of Smoker (proportion of TRUE)
# and the number of values that went into each cell.
aggregate(Smoker ~ Gender + Division, data = d1, FUN = mean)
aggregate(Smoker ~ Gender + Division, data = d1, FUN = length)
# Age in years, taking 2012 as the reference year.
d1$age <- 2012 - d1$YearOfBirth
# Bin the 85+ group to match census: cap every age at 85. pmin() leaves
# missing ages as NA.
d1$age <- pmin(d1$age, 85)

# Open a new graphics device for the plot. dev.new() uses the platform's
# default device, whereas quartz() is only available on macOS.
dev.new()
# Widen the left margin so the division names fit beside the horizontal boxes.
par(mar = c(5, 9, 2, 2) + 0.1)
# One horizontal box per division; varwidth scales box width by group size.
boxplot(
  age ~ Division,
  data = d1,
  col = 2,
  horizontal = TRUE,
  las = 1,
  varwidth = TRUE,
  xlab = "age (yrs)"
)
# Save the processed d1 to disk as CSV, omitting the row-name column.
write.csv(
  x = d1,
  file = "Data/d1.csv",
  row.names = FALSE
)