Skip to content

Data Preparation 2015

Adam VanIwaarden edited this page Jan 31, 2017 · 3 revisions

Data Preparation of 2015 SBAC Data

Student level data preparation

In the source code below, the data supplied by the Hawaii Department of Education is a pipe delimited file named Hawaii_Data_LONG_2015.txt and is located in the relative folder Data/Base_Files. Following the text data being read into R, the data is cleaned up variable by variable to ensure that it matches previously established data naming conventions.

###########################################################################
###
### R Syntax for construction of 2015 Hawaii LONG data file
###
###########################################################################

### Load SGP Package

require(SGP)
require(data.table)

### Load tab delimited data

Hawaii_Data_LONG_2015 <- fread("Data/Base_Files/Hawaii_Data_LONG_2015.txt", colClasses=rep("character", 34))


### Tidy up data

setnames(Hawaii_Data_LONG_2015, c("Valid_Case", "year", "grade", "lastName", "firstName", "EMH Level", "ELL Status", "Complex Area"),
	c("VALID_CASE", "Year", "Gr", "LName", "FName", "EMH.Level", "ELL_STATUS_MULTILEVEL", "Complex.Area"))
Hawaii_Data_LONG_2015[,VALID_CASE:="VALID_CASE"]
Hawaii_Data_LONG_2015[,Gr:=as.character(as.numeric(Gr))]
Hawaii_Data_LONG_2015[Gr %in% c("1", "2", "9", "10", "91"), VALID_CASE:="INVALID_CASE"]
Hawaii_Data_LONG_2015[,DOE_Ethnic:=as.character(DOE_Ethnic)]
Hawaii_Data_LONG_2015[,Fed7_Ethnic:=as.factor(Fed7_Ethnic)]
Hawaii_Data_LONG_2015[,Fed5_Ethnic:=as.factor(Fed5_Ethnic)]
Hawaii_Data_LONG_2015[,Disadv:=as.factor(Disadv)]
Hawaii_Data_LONG_2015[,ELL:=as.factor(ELL)]
Hawaii_Data_LONG_2015[,SpEd:=as.factor(SpEd)]
Hawaii_Data_LONG_2015[,Migrant:=as.factor(Hawaii_Data_LONG_2015$Migrant)]
Hawaii_Data_LONG_2015[,Scale_Score:=as.numeric(Scale_Score)]
Hawaii_Data_LONG_2015[,FSY:=as.factor(FSY)]
Hawaii_Data_LONG_2015[,ETHNICITY:=as.character(Fed7_Ethnic)]
Hawaii_Data_LONG_2015[DOE_Ethnic %in% c("Native Hawaiian", "Part-Hawaiian"), ETHNICITY:="Native Hawaiian"]
Hawaii_Data_LONG_2015[,ETHNICITY:=as.factor(Hawaii_Data_LONG_2015$ETHNICITY)]
levels(Hawaii_Data_LONG_2015$ETHNICITY)[c(3,4)] <- c("Black or African American", "Hispanic or Latino")
Hawaii_Data_LONG_2015[District=="Charter", District:="Charter Schools"]
Hawaii_Data_LONG_2015[,Complex:=as.factor(Hawaii_Data_LONG_2015$Complex)]
levels(Hawaii_Data_LONG_2015$Complex) <- as.vector(sapply(levels(Hawaii_Data_LONG_2015$Complex), capwords))
levels(Hawaii_Data_LONG_2015$Complex)[c(21,24,38)] <- paste(levels(Hawaii_Data_LONG_2015$Complex)[c(21,24,38)], "Complex")
levels(Hawaii_Data_LONG_2015$Complex) <- as.vector(sapply(sapply(strsplit(sapply(levels(Hawaii_Data_LONG_2015$Complex), capwords), " "), head, -1), paste, collapse=" "))
levels(Hawaii_Data_LONG_2015$Complex)[29] <- "McKinley"
Hawaii_Data_LONG_2015[,Complex.Area:=as.factor(Complex.Area)]
levels(Hawaii_Data_LONG_2015$Complex.Area)[8] <- "Hilo-Laupahoehoe-Waiakea"
Hawaii_Data_LONG_2015[,Sex:=as.factor(Sex)]
Hawaii_Data_LONG_2015[,ELL_STATUS_MULTILEVEL:=as.factor(ELL_STATUS_MULTILEVEL)]
Hawaii_Data_LONG_2015[,School_Admin_Rollup:=as.factor(School_Admin_Rollup)]
Hawaii_Data_LONG_2015[,District:=as.factor(District)]
Hawaii_Data_LONG_2015[,STATE_ENROLLMENT_STATUS:=as.factor(STATE_ENROLLMENT_STATUS)]
Hawaii_Data_LONG_2015[,SCHOOL_ENROLLMENT_STATUS:=as.factor(SCHOOL_ENROLLMENT_STATUS)]
Hawaii_Data_LONG_2015[,DISTRICT_ENROLLMENT_STATUS:=as.factor(DISTRICT_ENROLLMENT_STATUS)]
Hawaii_Data_LONG_2015[,COMPLEX_ENROLLMENT_STATUS:=as.factor(COMPLEX_ENROLLMENT_STATUS)]
Hawaii_Data_LONG_2015[,COMPLEX_AREA_ENROLLMENT_STATUS:=as.factor(COMPLEX_AREA_ENROLLMENT_STATUS)]
Hawaii_Data_LONG_2015[,FSY_SchCode:=as.integer(FSY_SchCode)]

Hawaii_Data_LONG_2015[,HIGH_NEED_STATUS_DEMOGRAPHIC:=
    factor(2, levels=1:2, labels=c("High Need Status: ELL, Special Education, or Disadvantaged Student", "High Need Status: Non-ELL, Non-Special Education, and Non-Disadvantaged Student"))]
Hawaii_Data_LONG_2015$HIGH_NEED_STATUS_DEMOGRAPHIC[Hawaii_Data_LONG_2015$Disadv=="Disadvantaged: Yes" | Hawaii_Data_LONG_2015$ELL=="ELL Status: Yes" | Hawaii_Data_LONG_2015$SpEd=="Special Education: Yes"] <- "High Need Status: ELL, Special Education, or Disadvantaged Student"

Hawaii_Data_LONG_2015[,SCHOOL_FSY_ENROLLMENT_STATUS:=factor(2, levels=1:2, labels=c("Enrolled School: No", "Enrolled School: Yes"))]
Hawaii_Data_LONG_2015$SCHOOL_FSY_ENROLLMENT_STATUS[Hawaii_Data_LONG_2015$SCHOOL_ENROLLMENT_STATUS=="Enrolled School: No" | Hawaii_Data_LONG_2015$FSY=="Full School Year Status: No"] <- "Enrolled School: No"

### Reorder variables

my.variable.order <- c(
	"VALID_CASE", "Domain", "Year", "Gr", "IDNO", "LName", "FName", "SCode_Admin_Rollup", "School_Admin_Rollup", "FSY_SchCode", "EMH.Level", "DCode", "District", "CCode", "Complex",
	"CACode", "Complex.Area", "Sex", "ETHNICITY", "HIGH_NEED_STATUS_DEMOGRAPHIC", "DOE_Ethnic", "Fed7_Ethnic", "Fed5_Ethnic", "Disadv", "ELL", "ELL_STATUS_MULTILEVEL", "SpEd",
	"Migrant", "Scale_Score", "Proficiency_Level", "FSY", "SCHOOL_ENROLLMENT_STATUS", "DISTRICT_ENROLLMENT_STATUS", "COMPLEX_ENROLLMENT_STATUS", "COMPLEX_AREA_ENROLLMENT_STATUS",
	"STATE_ENROLLMENT_STATUS", "SCHOOL_FSY_ENROLLMENT_STATUS")
setcolorder(Hawaii_Data_LONG_2015, my.variable.order)


### Save results

save(Hawaii_Data_LONG_2015, file="Data/Hawaii_Data_LONG_2015.Rdata")

Note that the naming conventions associated with Hawaii data provided by the DOE and the naming conventions used by the SGP Package are different. The SGP Package accommodates different state data naming conventions through the use of a meta-data lookup table embedded within the SGPstateData Rdata object in the package. The variable name lookup table for Hawaii is shown below with names.provided giving the state specific names, names.sgp the SGP Package specific names, names.type giving the type associated with the variable which is used within summarizeSGP to construct group summaries, names.info providing meta-data associated with the variable, and names.output a Boolean indicator of whether variable should be used with summarizeSGP.

names.provided names.sgp names.type names.info names.output
Year YEAR time Year (testing year) associated with record
IDNO ID individual Unique student identifier
LName LAST_NAME label Last name of student
FName FIRST_NAME label First name of student
Gr GRADE institution_level Grade level of test taken
Domain CONTENT_AREA content Content area (Mathematics or Reading)
Scale_Score SCALE_SCORE measure Student scale score
Proficiency_Level ACHIEVEMENT_LEVEL measure Achievement level associated with student score
SCode_Admin_Rollup SCHOOL_NUMBER institution School number rolled up for administration purposes possibly containing multiple SCHOOL_NUMBERs
School_Admin_Rollup SCHOOL_NAME label School name rolled up for administration purposes possibly containing multiple SCHOOL_NUMBERs
FSY_SchCode SCHOOL_NUMBER_FSY institution School number rolled up for FSY reporting
GradeRange GRADE_RANGE institution_type Grade range of school
SchlType SCHOOL_TYPE institution_type Type of School (Charter/DOE/Special)
EMH.Level EMH_LEVEL institution_type Elementary/Middle/High School designation
Type SCHOOL_LEVEL institution_type Elementary/Elementary-Middle/Middle/Middle-High/High School designation
DCode DISTRICT_NUMBER institution District number
District DISTRICT_NAME label District name
CCode COMPLEX_NUMBER institution Complex number
Complex COMPLEX_NAME label Complex name
CACode COMPLEX_AREA_NUMBER institution Complex area number
Complex.Area COMPLEX_AREA_NAME label Complex area name
ETHNICITY ETHNICITY demographic Student ethnicity combining FED7 and DOE for Native Hawaiian TRUE
DOE_Ethnic ETHNICITY_DOE demographic Student ethnicity using DOE categories TRUE
Fed5_Ethnic ETHNICITY_FED5 demographic Student ethnicity using FED 5 categories FALSE
Fed7_Ethnic ETHNICITY_FED7 demographic Student ethnicity using FED 7 categories FALSE
Disadv DISADVANTAGED_STATUS demographic Disadvantaged status TRUE
ELL ELL_STATUS demographic Student ELL status indicator TRUE
ELL_STATUS_MULTILEVEL ELL_STATUS_MULTILEVEL demographic Student ELL multi-level status indicator TRUE
SpEd SPECIAL_EDUCATION_STATUS demographic Student special education status indicator TRUE
Sex GENDER demographic Student gender TRUE
Source TEST_ADMINISTRATION type of test HSA/HSAA/HLIP/HAPA/Linapuni
Migrant MIGRANT_STATUS demographic Student migrant status TRUE
HIGH_NEED_STATUS HIGH_NEED_STATUS demographic High need status flag TRUE
HIGH_NEED_STATUS_DEMOGRAPHIC HIGH_NEED_STATUS_DEMOGRAPHIC demographic High need status flag indicating ELL or SPED or DISADVANTAGED TRUE
FSY FULL_SCHOOL_YEAR_STATUS demographic Full school year status indicator TRUE
SCHOOL_ENROLLMENT_STATUS SCHOOL_ENROLLMENT_STATUS institution_inclusion School inclusion/accountability indicator
SCHOOL_FSY_ENROLLMENT_STATUS SCHOOL_FSY_ENROLLMENT_STATUS institution_inclusion School inclusion/accountability indicator
DISTRICT_ENROLLMENT_STATUS DISTRICT_ENROLLMENT_STATUS institution_inclusion District inclusion/accountability indicator
COMPLEX_ENROLLMENT_STATUS COMPLEX_ENROLLMENT_STATUS institution_inclusion Complex inclusion/accountability indicator
COMPLEX_AREA_ENROLLMENT_STATUS COMPLEX_AREA_ENROLLMENT_STATUS institution_inclusion Complex Area inclusion/accountability indicator
STATE_ENROLLMENT_STATUS STATE_ENROLLMENT_STATUS institution_inclusion State inclusion/accountability indicator
VALID_CASE VALID_CASE individual inclusion Valid case indicator

Student-Instructor link data preparation

In addition to preparation the student level file for analysis, Hawaii also utilizes the SGP package SGP Package for the summarization of instructor level data. Utilization of this requires a student-instructor lookup file so that appropriate summary level variables can be created. In the source code below, student-instructor data provided by the Hawaii Department of Education is read into R and cleaned for use in SGP analyses.

################################################################################
###
### R Syntax for for the construction of the 2015 student-instructor lookup table
###
################################################################################

### Load SGP Package

require(SGP)
require(data.table)


###########################################################################
###
### Create teacher linkage file
###
###########################################################################

### Load data

Hawaii_Data_LONG_2015_INSTRUCTOR_NUMBER <- fread("Data/Base_Files/BFK_Cleaned_Spring_2015.txt", colClasses=rep("character", 13))


### Remove duplicates

setkeyv(Hawaii_Data_LONG_2015_INSTRUCTOR_NUMBER, names(Hawaii_Data_LONG_2015_INSTRUCTOR_NUMBER))
Hawaii_Data_LONG_2015_INSTRUCTOR_NUMBER <- Hawaii_Data_LONG_2015_INSTRUCTOR_NUMBER[!duplicated(Hawaii_Data_LONG_2015_INSTRUCTOR_NUMBER)]


### Extract relevant variables

variables.to.use <- c("StaffUniqueID", "StaffLastName", "StaffFirstName", "SchoolName", "SchoolCode_[NEW]", "SubjectName", "StateStudentID", "rounded_terms")
Hawaii_Data_LONG_2015_INSTRUCTOR_NUMBER <- Hawaii_Data_LONG_2015_INSTRUCTOR_NUMBER[,variables.to.use, with=FALSE]


### Tidy up data

setnames(Hawaii_Data_LONG_2015_INSTRUCTOR_NUMBER,
	names(Hawaii_Data_LONG_2015_INSTRUCTOR_NUMBER),
	c("INSTRUCTOR_NUMBER", "INSTRUCTOR_LAST_NAME", "INSTRUCTOR_FIRST_NAME", "SCHOOL_NAME_INSTRUCTOR", "SCHOOL_NUMBER_INSTRUCTOR", "CONTENT_AREA", "ID",  "TERMS"))
Hawaii_Data_LONG_2015_INSTRUCTOR_NUMBER$YEAR <- "2015"
setcolorder(Hawaii_Data_LONG_2015_INSTRUCTOR_NUMBER,
	c("ID","CONTENT_AREA", "YEAR", "INSTRUCTOR_NUMBER", "INSTRUCTOR_LAST_NAME", "INSTRUCTOR_FIRST_NAME", "SCHOOL_NUMBER_INSTRUCTOR", "SCHOOL_NAME_INSTRUCTOR", "TERMS"))

Hawaii_Data_LONG_2015_INSTRUCTOR_NUMBER[CONTENT_AREA=="Math", CONTENT_AREA := "MATHEMATICS"]
Hawaii_Data_LONG_2015_INSTRUCTOR_NUMBER[CONTENT_AREA=="ELA", CONTENT_AREA := "READING"]
Hawaii_Data_LONG_2015_INSTRUCTOR_NUMBER[, INSTRUCTOR_ENROLLMENT_STATUS := factor(1, levels=0:1, labels=c("Enrolled Instructor: No", "Enrolled Instructor: Yes"))]
Hawaii_Data_LONG_2015_INSTRUCTOR_NUMBER[,TERMS:=as.numeric(TERMS)]
Hawaii_Data_LONG_2015_INSTRUCTOR_NUMBER[,INSTRUCTOR_LAST_NAME:=as.factor(INSTRUCTOR_LAST_NAME)]
Hawaii_Data_LONG_2015_INSTRUCTOR_NUMBER[,INSTRUCTOR_FIRST_NAME:=as.factor(INSTRUCTOR_FIRST_NAME)]
Hawaii_Data_LONG_2015_INSTRUCTOR_NUMBER[,SCHOOL_NUMBER_INSTRUCTOR := as.integer(SCHOOL_NUMBER_INSTRUCTOR)]
Hawaii_Data_LONG_2015_INSTRUCTOR_NUMBER[,SCHOOL_NAME_INSTRUCTOR:=as.factor(SCHOOL_NAME_INSTRUCTOR)]
Hawaii_Data_LONG_2015_INSTRUCTOR_NUMBER[,VALID_CASE:="VALID_CASE"]

### Create TERMS variable from sum of TERMS

Hawaii_Data_LONG_2015_INSTRUCTOR_NUMBER <- Hawaii_Data_LONG_2015_INSTRUCTOR_NUMBER[,TERMS:=sum(TERMS, na.rm=TRUE), keyby=list(ID, CONTENT_AREA, INSTRUCTOR_NUMBER)]
Hawaii_Data_LONG_2015_INSTRUCTOR_NUMBER <- Hawaii_Data_LONG_2015_INSTRUCTOR_NUMBER[!duplicated(Hawaii_Data_LONG_2015_INSTRUCTOR_NUMBER)]


### Create Weight Variable

Hawaii_Data_LONG_2015_INSTRUCTOR_NUMBER[,INSTRUCTOR_WEIGHT:=round(TERMS/sum(TERMS, na.rm=TRUE), digits=2), by=list(ID, CONTENT_AREA)]


### NULL out extraneous variables

Hawaii_Data_LONG_2015_INSTRUCTOR_NUMBER[,TERMS:=NULL]


### Set column order

tmp.column.order <- c('ID', 'CONTENT_AREA', 'YEAR', 'INSTRUCTOR_NUMBER', 'INSTRUCTOR_LAST_NAME', 'INSTRUCTOR_FIRST_NAME',
	'SCHOOL_NUMBER_INSTRUCTOR', 'SCHOOL_NAME_INSTRUCTOR', 'INSTRUCTOR_WEIGHT', 'INSTRUCTOR_ENROLLMENT_STATUS', 'VALID_CASE')
setcolorder(Hawaii_Data_LONG_2015_INSTRUCTOR_NUMBER, tmp.column.order)
setkeyv(Hawaii_Data_LONG_2015_INSTRUCTOR_NUMBER, c("ID", "CONTENT_AREA", "YEAR"))


### Save results

save(Hawaii_Data_LONG_2015_INSTRUCTOR_NUMBER, file="Data/Hawaii_Data_LONG_2015_INSTRUCTOR_NUMBER.Rdata")