-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathstate_data.py
executable file
·227 lines (204 loc) · 10.5 KB
/
state_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
"""Helper functions for loading state-level datasets."""
import numpy as np
import pandas as pd
import geopandas as gpd
from typing import Dict
from config import PROJ, path
NATIONAL_DATASETS = {
'states': 'cb_2018_us_state_500k/cb_2018_us_state_500k.shp',
'hospitals': 'Hospitals/Hospitals.shp',
'ed_inst': 'Colleges_and_Universities/Colleges_and_Universities.shp'
}
MA_DATASETS = {
'acute_care': 'MA_Hospitals/Originals/acute_care_hospitals/HOSPITALS_PT.shp',
'non_acute_care': 'MA_Hospitals/Originals/non_acute_care_hospitals/HOSPITALS_NONACUTE_PT.shp',
'ed_inst': 'MA_Universities/SHP_dormcap/ma_universities.shp'
}
# Special cases to handle in MA (not necessarily completely filtered out)
MA_IRREGULAR_COLLEGES = [
'Northeastern University',
'Boston College',
'University of Massachusetts Dartmouth Center for Innovation and Entrepreneurship'
]
TRAVEL_TIME_DATASETS = {
state: f'travel_times/{state}_pairwise_distances_with_names.csv'
for state in ('CA', 'MI', 'NY')
}
def load_states():
"""Loads a shapefile of state boundaries."""
return gpd.read_file(path(NATIONAL_DATASETS['states']))
def load_state_data(state_code: str,
min_dorm_beds: int = 1,
min_hosp_beds: int = 1,
prefer_travel_time: bool = False,
acute_care_only: bool = False) -> Dict:
"""Loads basic hospital and educational institution data for a state.
:param state_code: The state's two-letter postal code.
:param min_dorm_beds: The minimum number of dorm beds necessary for an
educational institution's inclusion. At a minimum, institutions
with zero dorm beds (pure commuter schools) should be filtered out.f
:param min_hosp_beds: The minimum number of dorm beds necessary for an
hospital's inclusion. At a minimum, hospitals with zero
or negative beds should be filtered out.
:param prefer_travel_time: If available, use travel times (in minutes)
for distance calculations instead of Euclidean distance.
:param acute_care_only: Only include hospitals that are explicitly
labeled as acute-care hospitals.
:return: A dictionary of datasets and pairwise distances.
"""
state_code = state_code.upper()
states_gdf = load_states()
outline_gdf = states_gdf[states_gdf['STUSPS'] == state_code]
outline_gdf = outline_gdf.to_crs(PROJ).reset_index().copy()
hospitals_gdf = load_hospitals(state_code, min_hosp_beds, acute_care_only)
ed_inst_gdf = load_ed_inst(state_code, min_dorm_beds)
# Get pairwise distances.
if prefer_travel_time and state_code in TRAVEL_TIME_DATASETS:
distance_metric = 'travel_time'
travel_time_df = pd.read_csv(path(TRAVEL_TIME_DATASETS[state_code]))
travel_time_df = travel_time_df.rename(columns={
**{col: col.strip() for col in travel_time_df.columns},
' Driving Time (s)': 'Time'
})
distances = travel_time_distances(travel_time_df,
hospitals_gdf,
ed_inst_gdf)
else:
distance_metric = 'euclidean'
distances = euclidean_distances(hospitals_gdf, ed_inst_gdf)
return {
'outline': outline_gdf,
'ed_inst': ed_inst_gdf,
'hospitals': hospitals_gdf,
'distances': distances,
'distance_metric': distance_metric
}
def load_hospitals(state_code: str,
min_hosp_beds: int,
acute_care_only: bool) -> gpd.GeoDataFrame:
"""Loads filtered hospital data for a state."""
if state_code == 'MA':
# MA: Use state-specific hospital/university datasets.
acute_care_gdf = gpd.read_file(path(MA_DATASETS['acute_care']))
non_acute_care_gdf = gpd.read_file(path(MA_DATASETS['non_acute_care']))
acute_care_gdf['ACUTE'] = True
acute_care_gdf = acute_care_gdf.to_crs(PROJ)
if acute_care_only:
hospitals_gdf = acute_care_gdf.drop(columns=set(acute_care_gdf.columns) -
set(non_acute_care_gdf.columns))
else:
non_acute_care_gdf['ACUTE'] = False
non_acute_care_gdf = non_acute_care_gdf.to_crs(PROJ)
non_acute_care_gdf = non_acute_care_gdf.rename(columns={'FAC_NAME':
'SHORTNAME'})
hospitals_gdf = gpd.GeoDataFrame(pd.concat([
acute_care_gdf,
non_acute_care_gdf
])).drop(columns=set(acute_care_gdf.columns) -
set(non_acute_care_gdf.columns)) \
.rename(columns={'SHORTNAME': 'NAME', 'BEDCOUNT': 'BEDS'})
hospitals_gdf.crs = PROJ
# Filter out islands.
hospitals_gdf = hospitals_gdf[(hospitals_gdf['TOWN'] != 'Oak Bluffs') &
(hospitals_gdf['TOWN'] != 'Nantucket')]
# Filter out psychiatric hospitals.
# (Other non-acute hospitals may be retained.)
# Moon: Filter out Lawrence Memorial.
hospitals_gdf = hospitals_gdf[
(hospitals_gdf['COHORT'] != 'Psychiatric Hospital') &
(hospitals_gdf['NAME'] != 'Corrigan Mental Health Center') &
(hospitals_gdf['NAME'] != 'Lawrence Memorial Hospital of Medford')
]
# Hospital systems should be a category. (TODO: other fields here)
hospitals_gdf['HOSPSYSTEM'] = hospitals_gdf['HOSPSYSTEM'].astype('category')
else:
all_hospitals_gdf = gpd.read_file(path(NATIONAL_DATASETS['hospitals']))
hospitals_gdf = all_hospitals_gdf[all_hospitals_gdf['STATE'] == state_code]
hospitals_gdf = hospitals_gdf.copy()
hospitals_gdf['ACUTE'] = True
acute_rows = (hospitals_gdf['TYPE'] == 'GENERAL ACUTE CARE')
hospitals_gdf.loc[~acute_rows, 'ACUTE'] = False
if acute_care_only:
hospitals_gdf = hospitals_gdf[acute_rows]
hospitals_gdf = hospitals_gdf.to_crs(PROJ)
hospitals_gdf = hospitals_gdf[hospitals_gdf['BEDS'] >= min_hosp_beds]
return hospitals_gdf.reset_index().copy()
def load_ed_inst(state_code: str, min_dorm_beds: int) -> gpd.GeoDataFrame:
"""Loads filtered educational institution data for a state."""
if state_code == 'MA':
# MA: Use state-specific hospital/university datasets.
ed_inst_gdf = gpd.read_file(path(MA_DATASETS['ed_inst']))
# Fix irregularities in NEU data and remove satellite BC/UMD campuses.
ed_inst_gdf = ed_inst_gdf[~ed_inst_gdf['COLLEGE'].isin(MA_IRREGULAR_COLLEGES) |
(ed_inst_gdf['CAMPUS'] == 'Main Campus')]
# HMS name cleanup to make maps more interpretable
ed_inst_gdf['COLLEGE'] = ed_inst_gdf['COLLEGE'].replace({
'Harvard University': 'Harvard Medical School'
})
ed_inst_gdf = ed_inst_gdf.rename(columns={'DORMCAP': 'DORM_CAP'})
else:
# Non-MA: Use national hospital/university datasets.
all_ed_inst_gdf = gpd.read_file(path(NATIONAL_DATASETS['ed_inst']))
ed_inst_gdf = all_ed_inst_gdf[all_ed_inst_gdf['STATE'] == state_code]
ed_inst_gdf = ed_inst_gdf.to_crs(PROJ)
ed_inst_gdf = ed_inst_gdf[ed_inst_gdf['DORM_CAP'] >= min_dorm_beds]
return ed_inst_gdf.reset_index().copy()
def euclidean_distances(hospitals_gdf: gpd.GeoDataFrame,
ed_inst_gdf: gpd.GeoDataFrame) -> np.ndarray:
"""Calculates pairwise Euclidean distances."""
distances = np.zeros((len(hospitals_gdf), len(ed_inst_gdf)))
for hosp_idx, hosp_row in enumerate(hospitals_gdf.itertuples()):
for ed_idx, ed_row in enumerate(ed_inst_gdf.itertuples()):
dist = hosp_row.geometry.distance(ed_row.geometry)
distances[hosp_idx, ed_idx] = dist
return distances
def travel_time_distances(travel_time_df: pd.DataFrame,
hospitals_gdf: gpd.GeoDataFrame,
ed_inst_gdf: gpd.GeoDataFrame,
epsilon: float = 1e-4,
default_time: float = 10000) -> np.ndarray:
"""Loads precomputed pairwise travel time distances.
:param travel_time_df: The precomputed table of pairwise travel times
(in Olivia Walch's format).
:param hospitals_gdf: Hospitals to calculate travel times for.
:param ed_inst_gdf: Educational institutions to calculate travel times for.
:param epsilon: The tolerance used for matching longitudes and latitudes.
:param default_time: The default time to use when a (hospital,
educational institution) pair is missing from the travel time data.
:return: A pairwise distance matrix (rows are hospitals, columns are
educational institutions).
"""
times = default_time * np.ones((len(hospitals_gdf), len(ed_inst_gdf)))
travel_time_index = {}
for row in travel_time_df.itertuples():
source = getattr(row, 'Source')
dest = getattr(row, 'Destination')
travel_time = getattr(row, 'Time') / 60
travel_time_index[f'{source} -> {dest}'] = travel_time
for hosp_idx, hosp_row in enumerate(hospitals_gdf.itertuples()):
for ed_idx, ed_row in enumerate(ed_inst_gdf.itertuples()):
# Try to match based on name, and then disambiguate based on long/lat.
ed_name = getattr(ed_row, 'NAME').replace(',', '')
hosp_name = getattr(hosp_row, 'NAME').replace(',', '')
if f'{ed_name} -> {hosp_name}' in travel_time_index:
travel_time = travel_time_index[f'{ed_name} -> {hosp_name}']
else:
ed_lat = getattr(ed_row, 'LATITUDE')
ed_long = getattr(ed_row, 'LONGITUDE')
hosp_lat = getattr(hosp_row, 'LATITUDE')
hosp_long = getattr(hosp_row, 'LONGITUDE')
df = travel_time_df
travel_row = df[((df['SourceLat'] - ed_lat).abs() <= epsilon) &
((df['SourceLong'] - ed_long).abs() <= epsilon) &
((df['DestLat'] - hosp_lat).abs() <= epsilon) &
((df['DestLong'] - hosp_long).abs() <= epsilon)]
if not travel_row.empty:
if len(travel_row) > 1:
print('Warning:', ed_name, '->', hosp_name, 'ambiguous')
travel_time = travel_row['Time'].iloc[0] / 60
else:
print('Warning: could not find travel time for',
f'{ed_name} -> {hosp_name}')
continue
times[hosp_idx, ed_idx] = travel_time
return times