-
Notifications
You must be signed in to change notification settings - Fork 0
/
ccd_data_prep.R
251 lines (225 loc) · 9.54 KB
/
ccd_data_prep.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
# Set directory.
setwd("...")
# Load packages.
library(dplyr)
library(tidyr)
library(openxlsx)
library(stringr)
# Read the names of the files in the CCD data folder.
master <- list.files("Raw Data\\NCES CCD")
# save folder path for future use.
pth <- "Raw Data\\NCES CCD\\"
#- Membership files (1415 - 1819).
# Create empty data frame to append data.
membership_all <- data.frame()
# Loop for each file.
for (y in master[str_detect(master,"ccd_sch_052")]) {
# Detect file name based on whether file(y) is ZIP or not.
if(str_detect(y, ".zip")){
folder_zip <- as.character(unzip(paste0(pth, y), list = TRUE)$Name)
df_name <- folder_zip[str_detect(folder_zip, "csv|txt")] #select only csv or txt
}else{
df_name <- y
}
# Read data based on df_name has CSV or TXT.
if(str_detect(df_name, "csv")){ #csv files
# if file(y) is ZIP or not.
if(str_detect(y, ".zip")){
data0 <- read.csv(unz(paste0(pth, y), df_name), header = TRUE, sep = ",")
}else{
data0 <- read.csv(paste0(pth, y), header = TRUE, sep = ",")
}
}else{ #txt files
# if file(y) is ZIP or not.
if(str_detect(y, ".zip")){
data0 <- read.delim(unz(paste0(pth, y), df_name))
}else{
data0 <- read.delim(paste0(pth, y))
}
}
# -Export data based on file type WIDE or LONG.
# export data based on whether it is a wide or long file.
if(str_detect(df_name, "_l_")){ #long files
# Filter MA, Grade 12.
df_ccd <- data0 %>%
filter(ST=="MA") %>%
filter(GRADE=="Grade 12")
# Clean state school codes (then, these are the same as DESE codes).
df_ccd$School.Code <- substr(df_ccd$ST_SCHID, nchar(df_ccd$ST_SCHID)-7, nchar(df_ccd$ST_SCHID))
# Create new data frame with std count breakdown.
df_ccd_final <- df_ccd %>%
# select variables
select("SCHOOL_YEAR","School.Code","SCH_NAME","SEX","RACE_ETHNICITY","STUDENT_COUNT") %>%
# make data wider
pivot_wider(names_from = c(RACE_ETHNICITY,SEX), values_from = STUDENT_COUNT) %>%
# extract student counts by group
mutate(Female = rowSums(select(.,ends_with("_Female")), na.rm = T),
Male = rowSums(select(.,ends_with("_Male")), na.rm = T),
`American Indian or Alaska Native` = rowSums(select(.,starts_with("American Indian or Alaska Native")), na.rm = T),
Asian = rowSums(select(.,starts_with("Asian")), na.rm = T),
`Black or African American` = rowSums(select(.,starts_with("Black or African American")), na.rm = T),
`Hispanic/Latino` = rowSums(select(.,starts_with("Hispanic/Latino")), na.rm = T),
`Native Hawaiian or Other Pacific Islander` = rowSums(select(.,starts_with("Native Hawaiian or Other Pacific Islander")), na.rm = T),
`Two or more races` = rowSums(select(.,starts_with("Two or more races")), na.rm = T),
White = rowSums(select(.,starts_with("White")), na.rm = T)) %>%
# rename variables for consistency
rename(Year=SCHOOL_YEAR, G12=`No Category Codes_No Category Codes`) %>%
# select variables to simplify data
select(-(`American Indian or Alaska Native_Female`:White_Male)) %>%
arrange(School.Code)
}else{ #wide files
# define missing values
data0[data0==-1] <- NA
data0[data0==-2] <- NA
data0[data0==-9] <- NA
# Filter MA.
df_ccd <- data0 %>%
filter(STABR=="MA")
# Clean state school codes (then, these are the same as DESE codes).
df_ccd$School.Code <- substr(df_ccd$ST_SCHID, nchar(df_ccd$ST_SCHID)-7, nchar(df_ccd$ST_SCHID))
# Create new data frame with std count breakdown.
df_ccd_final <- df_ccd %>%
# filter data for schools with 12th grade students only.
filter(is.na(G12)==F) %>%
# select variables
select("SURVYEAR","School.Code","SCH_NAME", contains("12")) %>%
# extract student counts by group
mutate(Female = rowSums(select(.,ends_with("F")), na.rm = T),
Male = rowSums(select(.,ends_with("M")), na.rm = T),
`American Indian or Alaska Native` = rowSums(select(.,starts_with("AM")), na.rm = T),
Asian = rowSums(select(.,starts_with("AS")), na.rm = T),
`Black or African American` = rowSums(select(.,starts_with("BL")), na.rm = T),
`Hispanic/Latino` = rowSums(select(.,starts_with("HI")), na.rm = T),
`Native Hawaiian or Other Pacific Islander` = rowSums(select(.,starts_with("HP")), na.rm = T),
`Two or more races` = rowSums(select(.,starts_with("TR")), na.rm = T),
White = rowSums(select(.,starts_with("WH")), na.rm = T)) %>%
# select variables to simplify data
select(-contains("12"),"G12") %>%
# rename variables for consistency
rename(Year=SURVYEAR) %>%
# sort and relocate variables for consistency
arrange(School.Code) %>%
relocate(G12, .after = SCH_NAME)
}
# Append data
membership_all <- rbind(membership_all, df_ccd_final)
}
# Write data.
write.xlsx(membership_all, file="Output Data\\CCD cleaned\\CCD Membership 2015-2019.xlsx", sheetName="CCD Membership", row.names=FALSE)
#- Directory files (1415 - 1819).
# Create empty data frame to append data.
directory_all <- data.frame()
# Loop for each file.
for (y in master[str_detect(master,"ccd_sch_029")]) {
# Detect file name based on whether file(y) is ZIP or not.
if(str_detect(y, ".zip")){
folder_zip <- as.character(unzip(paste0(pth, y), list = TRUE)$Name)
df_name <- folder_zip[str_detect(folder_zip, "csv|txt")] #select only csv or txt
}else{
df_name <- y
}
# Read data based on df_name has CSV or TXT.
if(str_detect(df_name, "csv")){ #csv files
# if file(y) is ZIP or not.
if(str_detect(y, ".zip")){
data0 <- read.csv(unz(paste0(pth, y), df_name), header = TRUE, sep = ",")
}else{
data0 <- read.csv(paste0(pth, y), header = TRUE, sep = ",")
}
}else{ #txt files
# if file(y) is ZIP or not.
if(str_detect(y, ".zip")){
data0 <- read.delim(unz(paste0(pth, y), df_name))
}else{
data0 <- read.delim(paste0(pth, y))
}
}
# rename variables that are different across the years.
if("STABR" %in% names(data0)){
df_ccd <- data0 %>%
rename(ST=STABR, Year=SURVYEAR)
}else{
df_ccd <- data0 %>%
rename(Year=SCHOOL_YEAR)
}
# Filter MA.
df_ccd <- df_ccd %>%
filter(ST=="MA")
# Filter G12OFFERED (detect variable name first, changing across the years).
tt <- df_ccd %>%
select(.,contains("12")) %>%
select(.,ends_with("OFFERED")) %>%
names()
df_ccd <- df_ccd[which(df_ccd[,tt] %in% c("Y","Yes")),]
# Clean state school codes (then, these are the same as DESE codes).
df_ccd$School.Code <- substr(df_ccd$ST_SCHID, nchar(df_ccd$ST_SCHID)-7, nchar(df_ccd$ST_SCHID))
# Select variables.
df_ccd <- df_ccd %>%
select("Year","School.Code","SCH_NAME","SCH_TYPE_TEXT","LEVEL","GSLO","CHARTER_TEXT") %>%
arrange(School.Code)
# Append data
directory_all <- rbind(directory_all, df_ccd)
}
# Rename factor values.
directory_all$SCH_TYPE_TEXT[str_detect(directory_all$SCH_TYPE_TEXT,"Alternative")] <- "Alternative Education School"
directory_all$LEVEL[directory_all$LEVEL==3] <- "High"
directory_all$LEVEL[directory_all$LEVEL==4] <- "Middle"
# Write data.
write.xlsx(directory_all, file="Output Data\\CCD cleaned\\CCD Directory 2015-2019.xlsx", sheetName="CCD Directory", row.names=FALSE)
#- School Characteristics files (1415 - 1819).
# Create empty data frame to append data.
schchar_all <- data.frame()
# Loop for each file.
for (y in master[str_detect(master,"ccd_sch_129")]) {
# Detect file name based on whether file(y) is ZIP or not.
if(str_detect(y, ".zip")){
folder_zip <- as.character(unzip(paste0(pth, y), list = TRUE)$Name)
df_name <- folder_zip[str_detect(folder_zip, "csv|txt")] #select only csv or txt
}else{
df_name <- y
}
# Read data based on df_name has CSV or TXT.
if(str_detect(df_name, "csv")){ #csv files
# if file(y) is ZIP or not.
if(str_detect(y, ".zip")){
data0 <- read.csv(unz(paste0(pth, y), df_name), header = TRUE, sep = ",")
}else{
data0 <- read.csv(paste0(pth, y), header = TRUE, sep = ",")
}
}else{ #txt files
# if file(y) is ZIP or not.
if(str_detect(y, ".zip")){
data0 <- read.delim(unz(paste0(pth, y), df_name))
}else{
data0 <- read.delim(paste0(pth, y))
}
}
# rename variables that are different across the years.
if("STABR" %in% names(data0)){
df_ccd <- data0 %>%
rename(ST=STABR, Year=SURVYEAR)
}else{
df_ccd <- data0 %>%
rename(Year=SCHOOL_YEAR)
}
# Filter MA.
df_ccd <- df_ccd %>%
filter(ST=="MA")
# Clean state school codes (then, these are the same as DESE codes).
df_ccd$School.Code <- substr(df_ccd$ST_SCHID, nchar(df_ccd$ST_SCHID)-7, nchar(df_ccd$ST_SCHID))
# Clean variable names before selecting.
names(df_ccd) <- str_replace(names(df_ccd), "_STATUS", "")
names(df_ccd) <- str_replace(names(df_ccd), "STATUS", "")
# Select variables.
df_ccd <- df_ccd %>%
select("Year","School.Code","SCH_NAME","NSLP_TEXT","TITLEI_TEXT") %>%
arrange(School.Code)
# Append data
schchar_all <- rbind(schchar_all, df_ccd)
}
# Rename factor values.
schchar_all$TITLEI_TEXT[schchar_all$TITLEI_TEXT %in% c("Missing","MISSING","Not reported")] <- NA
schchar_all$NSLP_TEXT[schchar_all$NSLP_TEXT %in% c("Missing","MISSING","Not reported")] <- NA
schchar_all$NSLP_TEXT <- str_replace(schchar_all$NSLP_TEXT, ",", "")
# Write data.
write.xlsx(schchar_all, file="Output Data\\CCD cleaned\\CCD School Characteristics 2015-2019.xlsx", sheetName="CCD School Characteristics", row.names=FALSE)