forked from SarahU3/data-science-project
-
Notifications
You must be signed in to change notification settings - Fork 0
/
SourceFile.R
69 lines (58 loc) · 2.54 KB
/
SourceFile.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
### Source file for the marriage/divorce project
# Relative file names used later in this script (e.g. "GSSdivorce.rda")
# resolve against this project directory.
setwd(dir = "/Users/Sarah/Documents/Github/data-science-project/")
# foreign provides read.spss(), used by the optional SPSS import step below.
library(package = "foreign")
### Optional one-time step: download the GSS data and cache it as GSScomplete.rda.
### Remove the leading "#" from the lines below to run it.
#temp <- tempfile()
#download.file("http://gss.norc.org/documents/spss/GSS_spss.zip",temp, mode="wb")
#unzipped <- unzip(temp, exdir=getwd())
#filename <- grep("sav",unzipped)
#GSS.complete <- read.spss("C:/Users/Sarah/Documents/GSS_spss/GSS7214_R6b.sav", to.data.frame = TRUE)
# clear memory - suggestion courtesy of GitHub user ajdamico
#z <- GSS.complete
#rm(GSS.complete )
#gc()
# repeat
#GSS.complete <- z
#rm( z )
#gc()
#save as rda
#save(GSS.complete , file = "GSScomplete.rda")
## Load the complete GSS data set (restores the object GSS.complete).
# The file is addressed relative to the working directory set at the top of
# this script -- the same place the commented-out import step saves it to --
# instead of through a machine-specific absolute Windows path.
if (!file.exists("GSScomplete.rda")) {
  stop(
    "GSScomplete.rda not found in ", getwd(),
    "; run the commented-out download/import step first.",
    call. = FALSE
  )
}
load("GSScomplete.rda")

# Quick look at the structure and the first rows of the loaded data.
str(GSS.complete)
head(GSS.complete)
## Names of the columns to keep from the full GSS data set.
## The order below is the column order of the resulting subset.
var_list <- c(
  # Survey-related variables
  "oversamp",  # weights for black oversamples
  "formwt",    # weight to deal with experimental randomization
  "wtssall",   # weight variable
  "sampcode",  # sampling error code
  "sample",    # sampling frame and method
  "id",
  "ballot",
  "year",
  # Remaining variables
  "sexeduc", "region", "premarsx", "xmarsex", "pornlaw",
  "agewed", "marital", "partyid", "hapmar", "denom",
  "fund", "childs", "degree", "age", "divorce",
  "widowed", "educ", "natdrug", "wrkstat", "spwrksta",
  "polviews", "happy", "trust", "class", "income",
  "version", "reg16", "family16", "famdif16", "born",
  "parborn", "income91", "income98", "size", "attend",
  "relig16", "bible", "helpful", "fair", "consci",
  "satjob", "satfin", "abnomore", "absingle", "divlaw",
  "xmovie", "fefam"
)
## Build the analysis subset: survey years 1996 onward, only the columns in
## var_list, restricted to married, divorced or separated respondents.
GSS.divorce <- GSS.complete[GSS.complete$year >= 1996, var_list]

# Bin age with cut(); by default the intervals are right-closed,
# i.e. (0,18], (18,25], ..., (85,95].
GSS.divorce$Agecat1 <- cut(
  GSS.divorce$age,
  c(0, 18, 25, 35, 45, 55, 65, 75, 85, 95)
)

# Keep only the three marital statuses of interest. %in% never returns NA,
# so rows with a missing marital status are dropped here. Comparing with
# == yields NA for those rows, and an NA row index inserts a row made up
# entirely of NA values into the result.
GSS.divorce <- GSS.divorce[
  GSS.divorce$marital %in% c("MARRIED", "DIVORCED", "SEPARATED"),
]
# Rebuild the factor from the values that remain (if marital is a factor,
# this drops the levels that were filtered out).
GSS.divorce$marital <- factor(GSS.divorce$marital)

# Two-level outcome: "married" versus "split" (separated or divorced).
GSS.divorce$marital2 <- "married"
GSS.divorce$marital2[
  GSS.divorce$marital %in% c("SEPARATED", "DIVORCED")
] <- "split"
# Defensive: a missing marital status must not be counted as "married".
GSS.divorce$marital2[is.na(GSS.divorce$marital)] <- NA
GSS.divorce$marital2 <- factor(GSS.divorce$marital2)

# GSSnew holds the rows with a known outcome; print the group sizes.
GSSnew <- GSS.divorce[!is.na(GSS.divorce$marital2), ]
summary(GSSnew$marital2)
# The full GSS data frame is no longer needed; drop it to free up RAM.
rm(list = "GSS.complete")
# Run garbage collection so the released memory is reclaimed.
gc()
# Write the GSS.divorce subset to disk (relative to the working directory).
save(list = "GSS.divorce", file = "GSSdivorce.rda")