-
Notifications
You must be signed in to change notification settings - Fork 0
/
cleanData.py
296 lines (207 loc) · 9.68 KB
/
cleanData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
import sys
import pandas as pd
from openpyxl import load_workbook, Workbook
from config import *
from util import *
def main():
args = read_command()
print_warning('Cleaning data with params: ' + str(vars(args)))
print(OUTPUT_SEPARATOR)
if args.clean_prob: clean_prob()
if args.run_msa:
if args.clean_proj: clean_reg_proj()
if args.clean_emp: clean_reg_emp()
if args.merge: merge_reg()
if args.run_nat:
print_header('Cleaning national data')
if args.clean_proj: clean_nat_proj()
if args.clean_emp: clean_nat_emp()
if args.merge: merge_nat()
print_success('Finished data cleaning.')
def clean_nat_proj():
progress_bar = PercentBar('Projections', max=3)
raw_wb = load_workbook(filename=RAW_PROJECTIONS_NAT)
clean_wb = Workbook()
progress_bar.next()
raw_ws = raw_wb['Table 1.2']
clean_ws = clean_wb.active
clean_ws.title = 'Projections_Cleaned'
for i in range(4, raw_ws.max_row):
for j in range(1, raw_ws.max_column):
clean_ws.cell(row=i - 3,column=j).value = raw_ws.cell(row=i,column=j).value
progress_bar.next()
clean_ws.delete_cols(1, 1)
clean_ws.delete_cols(2, 6)
for row, cellObj in enumerate(list(clean_ws.columns)[1]):
if row != 0:
cellObj.value = (float(cellObj.value) / 100 + 1) ** 0.1 - 1
progress_bar.next()
clean_ws['A1'] = 'SOC_CODE'
clean_ws['B1'] = 'ANNUAL_CHANGE'
clean_wb.save(filename=CLEAN_PROJECTIONS_NAT)
progress_bar.finish()
def clean_nat_emp():
progress_bar = PercentBar('Employment', max=3)
# clean employment data
nat_employment_df = pd.read_excel(RAW_EMPLOYMENT_NAT)
progress_bar.next()
nat_filtered = nat_employment_df[['OCC_CODE', 'TOT_EMP', 'OCC_TITLE']]
progress_bar.next()
nat_cleaned = nat_filtered.rename(columns={'OCC_CODE': 'SOC_CODE'})
progress_bar.next()
nat_cleaned.to_excel(CLEAN_EMPLOYMENT_NAT, index=False)
progress_bar.finish()
def merge_nat():
progress_bar = PercentBar('Merging data', max=5)
auto_susceptibility_df = pd.read_excel(CLEAN_FREY_OSBORNE)
progress_bar.next()
employment_df = pd.read_excel(CLEAN_EMPLOYMENT_NAT)
progress_bar.next()
employment_proj_df = pd.read_excel(CLEAN_PROJECTIONS_NAT)
progress_bar.next()
full_employment_df = employment_df.merge(employment_proj_df, on='SOC_CODE')
progress_bar.next()
full_df = full_employment_df.merge(auto_susceptibility_df, on='SOC_CODE')
progress_bar.next()
full_df.to_excel(CLEAN_MERGED_NAT, index=False)
progress_bar.finish()
def merge_reg():
print_header('Merging MSA data sets')
auto_susceptibility_df = pd.read_excel(CLEAN_FREY_OSBORNE)
for msa in CA_MSA_MAP.keys():
progress_bar = PercentBar(msa, max=4)
msa_filename = msa + '.xlsx'
merged_filename = CLEAN_MERGED_MSA + msa_filename
employment_df = pd.read_excel(CLEAN_EMPLOYMENT_MSA + msa_filename)
progress_bar.next()
employment_proj_df = pd.read_excel(CLEAN_PROJECTIONS_MSA + msa_filename)
progress_bar.next()
employment_full_df = employment_df.merge(employment_proj_df, on='SOC_CODE')
progress_bar.next()
full_df = employment_full_df.merge(auto_susceptibility_df, on='SOC_CODE')
progress_bar.next()
full_df.to_excel(merged_filename, index=False)
progress_bar.finish()
print_success('Full cleaned CA data files written to ' + CLEAN_MERGED_MSA)
def clean_prob():
progress_bar = PercentBar('Cleaning occupational automation probabilities', max=3)
# clean automation susceptibility datasheet
auto_sus_df = pd.read_excel(RAW_FREY_OSBORNE)
progress_bar.next()
auto_sus_df = auto_sus_df[['Probability', 'SOC code']]
progress_bar.next()
auto_sus_df = auto_sus_df.rename(columns={'SOC code': 'SOC_CODE', 'Probability': 'AUTO_PROB'})
progress_bar.next()
auto_sus_df.to_excel(CLEAN_FREY_OSBORNE, index=False)
progress_bar.finish()
def clean_reg_proj():
print_header('Cleaning and aggregating regional data sheets')
for msa in CA_MSA_MAP.keys():
proj_files = CA_MSA_MAP[msa]
progress_bar = PercentBar(', '.join(proj_files), max=2*len(proj_files))
clean_reg_(proj_files, progress_bar)
aggregate_reg_(proj_files, msa, progress_bar)
progress_bar.finish()
print_success('CA employment projections cleaned and aggregated in ' + CLEAN_PROJECTIONS)
print(OUTPUT_SEPARATOR)
def clean_reg_emp():
print_header('Cleaning MSA employment data')
# clean employment data
print_warning('Reading MSA employment data...')
msa_employment_df = pd.read_excel(RAW_EMPLOYMENT_MSA)
for msa in CA_MSA_MAP.keys():
progress_bar = PercentBar(msa, max=4)
clean_filename = CLEAN_EMPLOYMENT_MSA + msa + '.xlsx'
progress_bar.next()
msa_queried = msa_employment_df.query('AREA_NAME == "' + msa + '" and TOT_EMP != "**"')
progress_bar.next()
msa_filtered = msa_queried[['OCC_CODE', 'TOT_EMP', 'OCC_TITLE']]
progress_bar.next()
msa_cleaned = msa_filtered.rename(columns={'OCC_CODE': 'SOC_CODE'})
progress_bar.next()
msa_cleaned.to_excel(clean_filename, index=False)
progress_bar.finish()
print_success('CA employment data extracted, cleaned, and written in ' + CLEAN_EMPLOYMENT_MSA)
print(OUTPUT_SEPARATOR)
# aggregate granular county projections into a mean metropolitan statistical area (msa) projection
def aggregate_reg_(proj_files, out_filename, progress_bar):
aggregated_filename = CLEAN_PROJECTIONS_MSA + out_filename + '.xlsx'
aggregate_df = pd.read_excel(CLEAN_PROJECTIONS_REGIONAL + proj_files[0])
progress_bar.next()
for i in range(1, len(proj_files)):
to_add_df = pd.read_excel(CLEAN_PROJECTIONS_REGIONAL + proj_files[i])
aggregate_df = aggregate_df.merge(to_add_df, on='SOC_CODE')
aggregate_df['ANNUAL_CHANGE'] = aggregate_df['ANNUAL_CHANGE_x'] + aggregate_df['ANNUAL_CHANGE_y']
del aggregate_df['ANNUAL_CHANGE_x']
del aggregate_df['ANNUAL_CHANGE_y']
progress_bar.next()
aggregate_df['ANNUAL_MEAN_CHANGE'] = aggregate_df['ANNUAL_CHANGE'].div(len(proj_files))
del aggregate_df['ANNUAL_CHANGE']
aggregate_df.to_excel(aggregated_filename, index=False)
# clean California MSA local projection data excel files
def clean_reg_(proj_files, progress_bar):
for filename in proj_files:
raw_filename = RAW_PROJECTIONS_REGIONAL + filename
clean_filename = CLEAN_PROJECTIONS_REGIONAL + filename
raw_wb = load_workbook(filename=raw_filename)
clean_wb = Workbook()
raw_ws = raw_wb.active
clean_ws = clean_wb.active
clean_ws.title = 'Occupational_Cleaned'
for i in range(4, raw_ws.max_row):
for j in range(1, raw_ws.max_column):
clean_ws.cell(row=i - 3,column=j).value = raw_ws.cell(row=i,column=j).value
clean_ws.delete_cols(1, 1)
clean_ws.delete_cols(2, 4)
clean_ws.delete_cols(3, 8)
clean_ws.delete_rows(2, 1)
# should delete all rows which are meaningless at the end
empty_rows = 18
clean_ws.delete_rows(clean_ws.max_row - empty_rows, empty_rows + 1)
for row, cellObj in enumerate(list(clean_ws.columns)[1]):
if row != 0:
cellObj.value = (float(cellObj.value) + 1) ** 0.1 - 1
clean_ws['A1'] = 'SOC_CODE'
clean_ws['B1'] = 'ANNUAL_CHANGE'
clean_wb.save(filename=clean_filename)
progress_bar.next()
def read_command():
"""
Read command line arguments.
"""
from argparse import ArgumentParser
parser = ArgumentParser(description=('Clean data for the simulation. '
'Default behavior is to clean and merge all national-level and regional/MSA-level data files. '
'Use options to run only specified steps of the program.'))
parser.add_argument('--clean-prob', dest='clean_prob',
default=False, action='store_true',
help='clean automation probabilities excel sheet')
parser.add_argument('--clean-proj', dest='clean_proj',
default=False, action='store_true',
help='clean and aggregate regional projections')
parser.add_argument('--clean-emp', dest='clean_emp',
default=False, action='store_true',
help='clean metropolitan employment data')
parser.add_argument('--merge', dest='merge',
default=False, action='store_true',
help='merge files into full MSA dataframes including probability, employment, and projections')
parser.add_argument('--nat', dest='run_nat',
default=False, action='store_true',
help='clean only national-level data files')
parser.add_argument('--msa', dest='run_msa',
default=False, action='store_true',
help='clean only regional/MSA-level data files')
args = parser.parse_args()
if all([not args.clean_prob, not args.clean_proj, not args.clean_emp, not args.merge]):
print_warning('No cleaning options specified. Running with options set to clean all data.')
args.clean_prob = True
args.clean_proj = True
args.clean_emp = True
args.merge = True
if all([not args.run_nat, not args.run_msa]):
print_warning('National/MSA-level cleaning not specified. Running with options set to clean both levels of data.')
args.run_nat = True
args.run_msa = True
return args
if __name__ == "__main__":
main()