-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfunctions.py
336 lines (295 loc) · 11.6 KB
/
functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
# (c) Ryan Martin 2017 under MIT license
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
class AutocorrClus:
"""
Parameters:
mvdata: pd.DataFrame
the multivariate dataframe
locations: pd.DataFrame
the Cartesian locations of the data
target_nclus: int
the number of clusters
clustermethod: str
can be one of 'kmeans', 'gmm', or 'hier'
autocor: str
the autocorrelation metric to consider, options are 'morans', 'getis', 'localmean', or
'localvar'
nnears: int
the number of nearest neighbors in a local search to consider
aniso_search: 5-tuple
(ang1, ang2, ang3, r1, r2) anisotropic properties following the GSLIB rotation conventions
.. codeauthor:: Ryan Martin - 17-10-2017
"""
def __init__(self, mvdata, locations, target_nclus=None, clustermethod='kmeans',
autocor='getis', nnears=45, aniso_search=(0, 0, 0, 1, 1)):
# parse the input, make sure arrays are nvar x ndata dimensioned
if hasattr(mvdata, 'values'):
mvdata = mvdata.values.T
elif isinstance(mvdata, np.ndarray):
if mvdata.shape[0] > mvdata.shape[1]:
mvdata = mvdata.T
if hasattr(locations, 'values'):
locations = locations.values.T
elif isinstance(locations, np.ndarray):
if locations.shape[0] > locations.shape[1]:
locations = locations.T
# set all the parmeters to this object
self.mvdata = mvdata
self.locations = locations
self.target_nclus = target_nclus
self.clustermethod = clustermethod
self.aniso_search = aniso_search
self.nnears = nnears
autocordict = dict(morans=1, getis=2, localvar=3, localmean=4) # values for fortran
self.autocor = autocordict[autocor]
def fit_predict(self, target_nclus=None):
"""
Call all the functions to get a cluster using the autocorrelation metrics and the data
passed to the constructor of this class
"""
from pandas import DataFrame
from autocorr import autocorr
# initialize the spatial locations for autocorrelation
autocorr.initautocorr(self.locations, self.aniso_search)
# get the autocorrelation dataset
tdata = autocorr.localautocorrelation(self.autocor, 1, self.nnears, self.mvdata).T
if target_nclus is not None:
self.target_nclus = target_nclus
assert self.target_nclus is not None, " set a number of clusters ! "
clusdefs = cluster(self.target_nclus, tdata, algorithm=self.clustermethod)
return clusdefs
def get_catcmap(cmap, ncat):
rgb_fr_contmap = []
cm = plt.cm.get_cmap(cmap)
for num in range(ncat):
rgb_fr_contmap.append(cm((num + 0.5) / (ncat)))
return ListedColormap(rgb_fr_contmap, name=cmap)
class MDSPlot:
"""
Wrapper for MDS plotting for a given dataset, passed on init
Parameters:
data (dataframe or np.ndarray): the data to embed with MDS
mtype (str): the embedding type, valid are 'mds' and 'tsne'
embedding (int): the number of dimensions to embed the coordinates in
randstate (int): the random state for embedding
"""
def __init__(self, data, mtype='MDS', embedding=2, randstate=69069, verbose=False,
autofit=True):
self.data = data
self.mtype = mtype
self.embedding = embedding
self.randstate = randstate
self.verbose = verbose
self.coords = None
def embed(self, data=None):
"""
Quick wrapper around sklearn manifold for MDS and TSNE
Parameters:
data (dataframe or np.ndarray):
"""
from sklearn import manifold
if data is None:
data = self.data
np.random.RandomState(self.randstate)
if self.mtype.lower() == 'mds':
model = manifold.MDS(n_components=self.embedding, random_state=self.randstate,
verbose=self.verbose)
if self.mtype.lower() == 'tsne':
model = manifold.t_sne.TSNE(n_components=self.embedding, random_state=self.randstate,
verbose=self.verbose)
nrow, ncol = np.shape(data)
if ncol > nrow:
data = data.T
model = model.fit(data)
self.coords = model.embedding_ # the N-dimensional embedding of the data
def plot_mds(self, colors='k', cmap='viridis', catdata=False, pltstyle='pt8',
ax=None, cax=None, figsize=(2.5, 3), s=15, lw=0.1, cbar=None, grid=True,
legend_loc='lower right', title=None, vlim=None, legstr='Cluster',
xlabel=None, ylabel=None):
"""
Plotting for the MDS coordinates and the probabilities or categories
"""
coords = self.coords
# setup the figure
if ax is None:
_, ax = plt.subplots(figsize=figsize)
if vlim is None and colors is not None and not isinstance(colors, str):
vlim = (np.min(colors), np.max(colors))
else:
vlim = (None, None)
# deal with non-array input
if hasattr(colors, 'values'):
colors = colors.values
if len(np.unique(colors)) <= 12:
catdata = True
# plot categories
if catdata:
clusids = np.unique(colors)
ncat = len(clusids)
cmap = get_catcmap(cmap, ncat)
dump = colors.copy()
for i in range(ncat):
dump[colors == clusids[i]] = i
colors = dump
ax.scatter(coords[:, 0], coords[:, 1], c=colors, cmap=cmap, s=s, lw=lw,
label=legstr)
if isinstance(legend_loc, str):
ax.legend(loc=legend_loc, scatterpoints=1, handletextpad=0.1)
elif isinstance(legend_loc, tuple):
ax.legend(loc='upper left', bbox_to_anchor=legend_loc, scatterpoints=1,
handletextpad=0.1)
# plot continous data with a colorbar
else:
ax.scatter(coords[:, 0], coords[:, 1], c=colors, s=s, lw=lw, cmap=cmap,
vmin=vlim[0], vmax=vlim[1])
applytickpad(ax, 1)
addgridtoplt(ax)
if ylabel is None:
ax.set_ylabel('$MDS_2$')
else:
ax.set_ylabel(ylabel)
if xlabel is None:
ax.set_xlabel('$MDS_1$')
else:
ax.set_xlabel(xlabel)
applylabelpad(ax, 0.5)
if title:
ax.set_title(title)
return ax
def cluster(nclus, dfs, n_init=1, variables=None, algorithm='kmeans', ret_gmm=False):
"""
Wrapper around Gaussian Mixture and Kmeans
Parameters:
nclus (int): number of clusters
dfs (pd.DataFrame): the dataframe
n_init (int): the number of times to initialize the clusterer
variables (list): the list of variables to take from the dataframe
algorithm (str): clustering type, `kmeans` or `gmm`
ret_gmm (bool): either `true` for returning the mixture or false to return just the
clustering
Returns:
clusdef +- gmm depending on ret_gmm
"""
from sklearn.cluster import KMeans, hierarchical
from sklearn.mixture import GaussianMixture
# get the data for clustering:
if variables is not None:
nd_pts = dfs[variables].values
else:
nd_pts = dfs
nclus = int(nclus)
if algorithm.lower() == 'kmeans':
clusdef = KMeans(n_clusters=nclus, n_init=n_init).fit(nd_pts).labels_
elif algorithm.lower() == 'gmm':
gmm = GaussianMixture(n_components=nclus, n_init=n_init)
gmm = gmm.fit(nd_pts)
clusdef = np.array(gmm.predict(nd_pts))
elif algorithm.lower() == 'hier':
hier = hierarchical.AgglomerativeClustering(n_clusters=nclus, linkage='ward')
clusdef = hier.fit_predict(nd_pts)
if ret_gmm:
return clusdef, gmm
else:
return clusdef
def fixlabels(ax, tickpad=1.55, labelpad=0.45):
applylabelpad(ax, labelpad)
applytickpad(ax, tickpad)
def label_subplot(ax, fignum=None, labels=True, case='lower', annot_xy=(0.0, 1.03), ha='left',
va='bottom', fontsize=10, **kwargs):
"""
Use the module variables and the default properties listed herein to label the axis given
the ax and the figure number
NEW:
Optionally omit the figure number, and set kwarg `labels` to type str, will label the string at
the given location!
"""
""" set of subplot labels for label_subplot() """
figurelabels = ['(a)', '(b)', '(c)', '(d)', '(e)', '(f)', '(g)', '(h)',
'(i)', '(j)', '(k)', '(l)', '(m)', '(n)', '(o)', '(p)',
'(q)', '(r)', '(s)', '(t)']
figurenumbers = ['(i)', '(ii)', '(iii)', '(iv)', '(v)', '(vi)', '(vii)',
'(viii)', '(ix)', '(x)', '(xi)', '(xii)', '(xiii)', '(xiv)']
if isinstance(labels, str):
lbl = labels
else:
if labels:
lbls = figurelabels
else:
lbls = figurenumbers
if case == 'lower':
lbl = lbls[fignum].lower()
else:
lbl = lbls[fignum].upper()
ax.annotate(lbl, xy=annot_xy, ha=ha, va=va, xycoords='axes fraction', fontsize=fontsize,
**kwargs)
def reclass_cluster_single(clus1, clus2):
"""
Use the a pairwise similarity score to determine the best reclassification of cluster 2
relative to clustering 1
Note: directly calls the fortran codes which calculates the similarity metrics using
permutations
Parameters
----------
clus1: np.ndarray
the first array of clusterings
clus2: np.ndarray
the second array of clusterings
Returns
-------
reclus: np.ndarray
the relclassified clus2
diff: float
the matching score between clus1 and the final reclassified clusters ... not the jaccard!
"""
from clus_compare import clus_compare_mod
reclus, diff = clus_compare_mod.reclass_cluster_single(clus1, clus2)
return reclus, diff
def simple_density_calc(x, y, ncomp, niter=50, seed=69069):
from sklearn.mixture import GaussianMixture
if hasattr(x, 'values'):
x = x.values
if hasattr(y, 'values'):
y = y.values
sample = np.c_[x, y]
gmm = GaussianMixture(n_components=ncomp, n_init=niter)
gmm = gmm.fit(sample)
return np.exp(gmm.score_samples(sample))
def standardize(data, variables=None):
"""
Subtract the mean and standardize by the stdev, returning a dataframe with the new variables,
and the means and standard deviations used to normalize.
"""
if variables is not None:
data = data[variables]
return_df = False
if hasattr(data, 'values'):
arr = data.values
return_df = True
ncol = np.size(arr, 1)
means = []
stds = []
for i in range(ncol):
mean = np.mean(arr[:, i])
std = np.std(arr[:, i])
arr[:, i] -= mean
arr[:, i] /= std
means.append(mean)
stds.append(std)
if return_df:
import pandas as pd
columns = ['std_%s' % var for var in data.columns]
arr = pd.DataFrame(arr, columns=columns)
return arr, means, stds
def addgridtoplt(ax=None):
""" Add the standard pygeostat grid to the current plot"""
if ax is None:
fig = plt.gcf()
ax = fig.axes[0]
ax.grid(which='major', color='k', linestyle=':', lw=0.2)
def applylabelpad(ax, labelpad):
ax.yaxis.labelpad = labelpad
ax.xaxis.labelpad = labelpad
def applytickpad(ax, tickpad):
ax.tick_params(axis='both', which='major', pad=tickpad)