# Title : Dataset Preparation
# Objective : Prepare my main dataset for analysis
# Created by: Matthew Chambers
# Created on: 7/8/2020
# Load the necessary packages. Tidyverse is for general data manipulation, magrittr supplies the pipe operators
# such as %>% and %<>%, and lubridate provides tools for working intelligently with dates.
library(magrittr)
library(tidyverse)
library(lubridate)
# Define the years to be covered. Note that OSHA accidents don't consistently have the establishment's
# NAICS code until 2003.
begin_year <- 2003
end_year <- 2015
# Define input parameters for the different sources of data.
osha_data_dir <- 'E:/Research Projects/Worker Accidents and Pollution/Data/OSHA data scraping'
osha_file <- function (year) return (paste0('osha_fatality_catastrophe_incidents_', year, '.csv'))
# I have a file with geocoded addresses to label the OSHA incidents with.
geocode_file <- 'geocoded_addresses.csv'
qcew_data_dir <- 'E:/Research Projects/Worker Accidents and Pollution/Data/QCEW'
qcew_file <- function (year) return (paste0(year, '.q1-q4.singlefile.csv'))
inversion_data_dir <- 'E:/Research Projects/Worker Accidents and Pollution/Data/Inversion data'
# Uncomment the line below (and the matching line under output_dir, below) after running script number 11
# to create a data file that uses nighttime inversions only (as a robustness check).
#inversion_data_dir <- 'E:/Research Projects/Worker Accidents and Pollution/Data/Inversion data/Nighttime Inversions'
inversion_file <- function (year) return (paste0('inversions_', year, '.csv'))
weather_data_dir <- 'E:/Research Projects/Worker Accidents and Pollution/Data/Weather Data'
weather_file <- function (year) return (paste0(year, '_mean_temp_mean_precip_data.csv'))
pollution_data_dir <- 'E:/Research Projects/Worker Accidents and Pollution/Data/Pollution Data'
pollution_file <- function (year) return (paste0('PM25_prediction_', year, '.csv'))
# Define output parameters for the cleaned data file(s) to be saved.
output_dir <- 'E:/Research Projects/Worker Accidents and Pollution/Data/Data for Regression Models/Fatality Only'
# Uncomment the line below to send output to a directory set up for nighttime inversion data (as a robustness check)
#output_dir <- 'E:/Research Projects/Worker Accidents and Pollution/Data/Data for Regression Models/Nighttime Inversions'
output_file <- function (type = NULL) return(paste0('construction_accidents_', type, '.csv'))
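# For example, osha_file(2003) returns 'osha_fatality_catastrophe_incidents_2003.csv', and
# output_file('2003_to_2015') returns 'construction_accidents_2003_to_2015.csv'.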
# Define functions to read in the different types of files, incorporating whatever tweaks are needed
# for a clean read. Note that in the column type strings, either '_' or '-' skips a column; alternating
# between the two makes counting positions easier.
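# For example, the readr shorthand '_-_ic' skips columns 1-3, then reads column 4 as integer and
# column 5 as character ('i' = integer, 'c' = character, 'd' = double, 'D' = date).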
read_in_osha <- function (year) {
osha_input_column_types <- '_-_ic_-_-_-ccc-_-c_-_-_-_-_-c-_'
X <- read_csv(file.path(osha_data_dir, osha_file(year)), col_types = osha_input_column_types, progress = FALSE) %>%
# For some reason, reading the date column in as a date doesn't work, so I have to read it in as character and
# parse it to a date here.
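# For example, mdy('7/8/2020') parses to the Date 2020-07-08.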
mutate(event_date = mdy(event_date)) %>% rename(date = event_date) %>%
mutate(naics = gsub('X', '', naics) %>% as.integer()) %>%
mutate(sic = gsub('X', '', sic) %>% as.integer()) %>%
mutate(fatality = degree == 'Fatality') %>%
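# Taking the group-wise max below flags every row of a date/address group as fatal if any row is:
# max(c(TRUE, FALSE)) is 1, which tests equal to TRUE in the filter further down.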
group_by(date, est_address) %>% mutate(fatality = max(fatality)) %>% ungroup() %>%
# Drop duplicate observations
distinct(date, est_address, .keep_all = TRUE) %>%
# This is the point of this script: keep only incidents that involve a fatality.
filter(fatality == TRUE) %>% select(-fatality, -degree)
return (X)
}
read_in_qcew <- function (year) {
qcew_input_column_types <- 'iici_ii_iiii_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-'
X <- read_csv(file.path(qcew_data_dir, qcew_file(year)), col_types = qcew_input_column_types, progress = FALSE) %>%
# Some rows have parsing errors due to fips codes beginning with 'C'. These are statistical areas which overlap
# counties and must therefore be excluded to avoid double counting. Because of the parsing errors, these have
# NA in the area_fips column.
drop_na(area_fips) %>% rename(fips = area_fips)
return (X)
}
read_in_inversions <- function (year) {
inversion_input_column_types <- 'iD_-_d'
X <- read_csv(file.path(inversion_data_dir, inversion_file(year)),
col_types = inversion_input_column_types, progress = FALSE)
return (X)
}
read_in_geocodes <- function () {
geocode_input_column_types <- '_-_-_-_-_-c_-i'
X <- read_csv(file.path(osha_data_dir, geocode_file),
col_types = geocode_input_column_types, progress = FALSE)
return (X)
}
read_in_weather <- function (year) {
weather_input_column_types <- 'iDdd'
X <- read_csv(file.path(weather_data_dir, weather_file(year)),
col_types = weather_input_column_types, progress = FALSE)
return (X)
}
read_in_pollution <- function (year) {
pollution_input_column_types <- 'iDd'
X <- read_csv(file.path(pollution_data_dir, pollution_file(year)),
col_types = pollution_input_column_types, progress = FALSE)
return (X)
}
process_qcew_year <- function (year) {
qcew_data <- read_in_qcew(year)
qcew_data %<>%
# I only want to consider privately owned establishments, as OSHA's applicability to government workplaces
# is inconsistent.
filter(own_code == 5) %>% select(-own_code) %>%
# Per https://data.bls.gov/cew/doc/titles/agglevel/agglevel_titles.htm, aggregation level 74 represents data
# aggregated by county, sector (2 digit NAICS), and ownership groups. Aggregation level 71 contains county level
# total employment, which I also need. NAICS code 23 represents the construction industry, which I am
# currently focusing on. If I come back and decide to expand that, I need to not drop the industry code.
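# For example, for fips 1001 (Autauga County, AL), the agglvl_code 74 row with industry_code 23 holds
# county construction employment, while the matching agglvl_code 71 row holds total county employment.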
filter((agglvl_code == 74 & industry_code == 23) | agglvl_code == 71) %>% select(-industry_code) %>%
# I rename the monthly employment level variables for ease of working with the pivot_longer function.
rename(emp_m1 = month1_emplvl) %>% rename(emp_m2 = month2_emplvl) %>% rename(emp_m3 = month3_emplvl) %>%
# I reshape the data into a monthly, rather than quarterly, form.
pivot_longer(
cols = starts_with('emp_m'),
names_to = 'month_in_qtr',
names_prefix = 'emp_m',
values_to = 'employment'
) %>% mutate(month_in_qtr = month_in_qtr %>% as.integer()) %>% # month_in_qtr should be an integer for math
# I generate a month variable from the quarter and month-in-quarter variables (qtr and month_in_qtr).
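# For example, month 2 of quarter 3 becomes 2 + (3 - 1) * 3 = 8, i.e. August.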
mutate(qtr = qtr - 1) %>% mutate(month = (month_in_qtr + qtr * 3) %>% as.integer()) %>%
# I don't really need to keep the quarter and month-in-quarter variables around any longer.
select(-c('qtr', 'month_in_qtr')) %>%
# I now create a new variable for total employment using aggregation level 71, then group observations
# to fill that value in for all aggregation level 74 (specific industry) observations.
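# For example, within each fips-month group the single aggregation level 71 row supplies
# total_employment, and fill(.direction = 'downup') copies it to the level 74 rows whether
# they appear above or below it.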
mutate(total_employment = ifelse(agglvl_code == 71, employment, NA)) %>%
group_by(fips, month) %>% fill(total_employment, .direction = 'downup') %>% ungroup() %>%
filter(agglvl_code == 74) %>% select(-agglvl_code)
return (qcew_data)
}
process_osha_year <- function (year) {
osha_data <- read_in_osha(year)
geocoded_addresses <- read_in_geocodes()
osha_data %<>%
# I only want to consider privately owned establishments, as OSHA's applicability to government workplaces
# is inconsistent.
filter(ownership == 'Private') %>% select(-ownership) %>%
# First, to save time and memory, I filter to construction only and drop the NAICS and SIC codes.
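# For example, a six-digit construction NAICS code such as 236115 maps to sector floor(236115 / 10000) = 23.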
mutate(naics_2 = floor(naics / 10000) %>% as.integer()) %>%
filter(naics_2 == 23) %>% select(-c('naics', 'naics_2', 'sic')) %>%
# Now I merge in the FIPS code from my geocoded address file, and drop any observations which could
# not be geocoded.
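# left_join() with no 'by' argument matches on all shared columns, here the est_address field.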
left_join(geocoded_addresses) %>% drop_na(fips) %>% select(-est_address) %>%
# I group by county and date, then get the sum of persons injured (in case of multiple accidents), as well as
# the number of incidents.
group_by(fips, date) %>% summarize(num_injured = num_injured %>% sum(), num_accidents = n())
return (osha_data)
}
process_one_year <- function (year) {
# Read in all the data I need for this year.
qcew_data <- process_qcew_year(year)
osha_data <- process_osha_year(year)
inversion_data <- read_in_inversions(year)
weather_data <- read_in_weather(year)
pollution_data <- read_in_pollution(year)
# I assemble my main data file. The inversion data for a given year are my starting point, since they
# constitute a nice balanced panel.
mydata <- inversion_data %>%
# First I merge in the weather data
left_join(weather_data) %>%
# Then I merge in the pollution data
left_join(pollution_data) %>%
# Then I merge in the OSHA accident data.
left_join(osha_data) %>%
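# Each of these datasets shares the fips and date columns, so the left_join() calls above match
# county-days without an explicit 'by' argument.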
# Replace NAs (county-days absent from the OSHA data) with 0s, as they represent observations with no accidents.
mutate(num_injured = num_injured %>% replace_na(0) %>% as.integer()) %>%
mutate(num_accidents = num_accidents %>% replace_na(0) %>% as.integer()) %>%
# Then I merge in the QCEW employment data, after creating year and month variables to merge on. I then
# drop the year and month variables since I no longer need them.
mutate(year = year(date)) %>% mutate(month = month(date)) %>%
left_join(qcew_data) %>% select(-c('year', 'month')) %>%
# I generate a weekday factor variable, since weekday effects are likely important.
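# By default, lubridate's wday() numbers the days 1 (Sunday) through 7 (Saturday).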
mutate(weekday = wday(date) %>% as_factor())
return(mydata)
}
all_data <- NULL
for (i in begin_year:end_year) {
year_data <- process_one_year(i)
if (is.null(all_data)) {
all_data <- year_data
} else {
all_data %<>%
bind_rows(year_data)
}
}
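# A more compact equivalent to the loop above, assuming purrr (attached with the tidyverse):
# all_data <- purrr::map_dfr(begin_year:end_year, process_one_year)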
write_csv(all_data, file.path(output_dir, output_file(paste0(begin_year, '_to_', end_year))))