-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclustering.py
152 lines (119 loc) · 4.32 KB
/
clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
"""
Experiments with dimensionality reduction and clustering time series to find similarities in US states' Covid-19 trajectories.
"""
import streamlit as st
from sklearn.preprocessing import minmax_scale, scale
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import OPTICS
import numpy as np
from process_data import process_data
def user_interface():
st.write("# Clustering States By Covid-19 Trends")
cols = [
"inIcuCurrently",
"hospitalizedCurrently",
"deathIncrease",
"positiveIncrease",
"percentPositive",
"totalTestResultsIncrease",
"Case Fatality Rate",
"Infection Fatality Rate",
]
cols.extend(
[
"retail_and_recreation_percent_change_from_baseline",
"grocery_and_pharmacy_percent_change_from_baseline",
"parks_percent_change_from_baseline",
"transit_stations_percent_change_from_baseline",
"workplaces_percent_change_from_baseline",
"residential_percent_change_from_baseline",
]
)
cols = st.multiselect("Columns to Use", cols, default="hospitalizedCurrently")
scale_method = st.selectbox(
"Preprocessing method", ["Minmax scale from 0-1", "Zscore", "None"]
)
do_scale = False
minmax = False
if scale_method == "Zscore":
do_scale = True
minmax = False
if scale_method == "Minmax scale from 0-1":
do_scale = False
minmax = True
min_samples = st.slider("Minimum number of states per cluster", 1, 10, 3)
return do_scale, minmax, cols, min_samples
def prepare_data(cols):
states = pd.read_csv("states_daily.csv")["state"].unique()[:]
states_data = {}
for state in states:
with st.spinner("Processing " + state):
df = process_data(False, state)
states_data[state] = df[cols].fillna(0)[-250:]
return states, states_data
def reduce_dims(do_scale, minmax, states_data, states, cols):
# PCA
reduced = {}
for state in states:
if do_scale:
X_valid_1D = scale(states_data[state])
elif minmax:
X_valid_1D = minmax_scale(states_data[state])
else:
X_valid_1D = states_data[state]
pca = PCA(n_components=1)
X_valid_1D = pca.fit_transform(X_valid_1D)
reduced[state] = [i[0] for i in X_valid_1D]
df = pd.DataFrame(reduced).interpolate()
return df, reduced
def group_states(clustering, reduced, df):
# Group states by cluster
buckets = {}
for l in clustering.labels_:
buckets[l] = []
for i, state in enumerate(reduced.keys()):
group = clustering.labels_[i]
buckets[group].append(state)
for group in set(clustering.labels_):
# Noisy samples and points which are not included in a leaf cluster of cluster_hierarchy_ are labeled as -1.
if group == -1:
continue
st.subheader("Group {}".format(group))
st.write(buckets[group])
st.line_chart(df[buckets[group]])
def plot_hierarchy(clustering, key):
import graphviz as graphviz
edges = clustering.cluster_hierarchy_
# Create a graphlib graph object
graph = graphviz.Digraph()
for edge in edges:
s1, s2 = key[edge[0]], key[edge[1]]
graph.edge(s1, s2)
st.graphviz_chart(graph)
def run_tsne(states, states_data):
tsne = TSNE()
state = st.selectbox("state", states)
st.write(states_data[state])
X_valid_2D = tsne.fit_transform(states_data[state])
X_valid_2D = (X_valid_2D - X_valid_2D.min()) / (X_valid_2D.max() - X_valid_2D.min())
plt.style.use("ggplot")
fig, ax = plt.subplots()
ax.scatter(X_valid_2D[:, 0], X_valid_2D[:, 1], s=10)
st.pyplot(fig)
def cluster_states():
do_scale, minmax, cols, min_samples = user_interface()
states, states_data = prepare_data(cols)
df, reduced = reduce_dims(do_scale, minmax, states_data, states, cols)
# Cluster
clustering = OPTICS(min_samples=min_samples).fit(df.transpose())
st.write("Resulting groups:", len(np.unique(clustering.labels_)) - 1)
st.header("Groups")
group_states(clustering, reduced, df)
st.subheader("All States")
st.line_chart(df)
st.stop()
if __name__ == "__main__":
cluster_states()