-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
127 lines (103 loc) · 3.81 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from itertools import cycle, islice
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
def gaussian_kernel(A, mult=2, is_sym=False):
    """
    Apply a gaussian kernel element-wise on matrix A.

    Every entry is mapped as
        D(i,j) = exp(-A(i,j)^2 / (mult * variance(A)))
    where the variance is computed over all entries of A.

    Args:
        A: square distance matrix (2-D array-like)
        mult: kernel denominator multiplier
        is_sym: when True, only the upper triangle of A (diagonal
            included) is read and the result is mirrored onto the lower
            triangle, so the output is symmetric by construction

    Returns:
        D: float matrix, same shape as A, after gaussian kernel application

    Raises:
        ValueError: if A is not a square 2-D matrix
    """
    A = np.asarray(A)
    if A.ndim != 2 or A.shape[0] != A.shape[1]:
        raise ValueError('Dimensionality: input matrix is not a square matrix.')

    var_D = np.var(A)

    # Allocate the output as a default-float array and fill it with one
    # vectorized expression instead of looping over every (i, j) pair.
    D = np.zeros(A.shape)
    D[...] = np.exp(-(A ** 2) / (mult * var_D))

    if is_sym:
        # Keep the upper triangle (with diagonal) and reflect its strictly
        # upper part below the diagonal; lower-triangle inputs are ignored.
        D = np.triu(D) + np.triu(D, k=1).T

    return D
def generate_cluster_matrix(A, clusters):
    """
    Generate the affinity matrix reordered according to clustering
    process result.

    The point indices of all clusters are concatenated in cluster order;
    entry (i, j) of the result is the affinity between the i-th and the
    j-th point of that concatenated ordering.

    Args:
        A: affinity matrix
        clusters: sequence of clusters, each one a sequence of point
            indices into A

    Returns:
        res: reordered affinity matrix with the same shape and dtype as A.
            If the clusters list fewer points than A has rows, only the
            top-left block is filled and the rest is zero.

    Raises:
        IndexError: if the clusters list more points than A can hold, or
            contain an index outside A
    """
    A = np.asarray(A)

    # Flatten the clusters into a single ordering of point indices.
    order = [point for members in clusters for point in members]
    count = len(order)

    # Zero-initialised so that positions not covered by any cluster are
    # deterministic instead of holding uninitialised memory.
    res = np.zeros_like(A)
    if count > res.shape[0] or count > res.shape[1]:
        raise IndexError(
            "clusters reference %d points but the affinity matrix has shape %s"
            % (count, res.shape)
        )

    if count:
        # np.ix_ builds the open mesh so rows and columns are both permuted.
        res[:count, :count] = A[np.ix_(order, order)]
    return res
def plot_clustering_result(X, A, clusters, noise=None, clustering_name="Generic clustering"):
    """
    Plot result of clustering process in a 2D space.

    Opens one figure with four panels: the raw points, the affinity
    matrix heatmap, the points drawn per cluster, and the affinity matrix
    reordered by cluster. The call ends with plt.show().

    Args:
        X: array of points; columns 0 and 1 are used as plot coordinates
        A: affinity matrix between points in X
        clusters: sequence of clusters, each one a sequence of point
            indices into X
        noise: optional identifiers of noise points; assumed to be row
            indices into X like the entries of clusters (TODO confirm
            against callers). When given and non-empty they are drawn
            with the spare color/marker.
        clustering_name: name used as the figure identifier
    """
    n_clusters = len(clusters)

    # One style per cluster plus one spare entry reserved for noise points.
    palette = ['#377eb8', '#ff7f00', '#4daf4a', '#f781bf', '#a65628', '#984ea3',
               '#999999', '#e41a1c', '#dede00']
    markers = ["*", "x", "+", "o", ".", "^", "<", ">", "P", "p", "X", "D", "d"]
    colors = list(islice(cycle(palette), n_clusters + 1))
    symbols = list(islice(cycle(markers), n_clusters + 1))

    plt.figure(num=clustering_name)

    plt.subplot(2, 2, 1)
    plt.scatter(X[:, 0], X[:, 1], s=10)
    plt.grid()
    plt.title("Original Data")

    plt.subplot(2, 2, 2)
    plt.title("Affinity matrix")
    sns.heatmap(A, square=True)

    plt.subplot(2, 2, 3)
    plt.grid()
    for members, color, symbol in zip(clusters, colors, symbols):
        plt.scatter(X[members, 0], X[members, 1], s=90, color=color, marker=symbol)
    if noise is not None and len(noise) > 0:
        # NOTE(review): noise is treated as indices into X — verify.
        plt.scatter(X[noise, 0], X[noise, 1], s=90, color=colors[-1], marker=symbols[-1])
    plt.title("Clustered data (" + str(n_clusters) + " clusters found)")

    plt.subplot(2, 2, 4)
    plt.title("Clusters affinity matrix")
    sns.heatmap(generate_cluster_matrix(A, clusters), square=True)

    # showMaximized() only exists on Qt windows; other backends either have
    # no window attribute or a window without that method, so skip it there
    # instead of failing before the figure is shown.
    manager = plt.get_current_fig_manager()
    maximize = getattr(getattr(manager, "window", None), "showMaximized", None)
    if callable(maximize):
        maximize()

    plt.show()
def generate_dataset(n_samples=1500, shape="blobs"):
    """
    Generate a dataset according to inputs.

    Args:
        n_samples: number of points to generate
        shape: string identifying the shape of dataset
            (blobs/varied/moons/circles)

    Returns:
        X: array of point coordinates, standardized with StandardScaler

    Raises:
        ValueError: if shape is not one of the supported names
    """
    # The moons/circles generators below are called without random_state,
    # so this global seed is what keeps their output repeatable.
    np.random.seed(0)

    if shape == "blobs":
        dataset = datasets.make_blobs(n_samples=n_samples, random_state=8)
    elif shape == "varied":
        dataset = datasets.make_blobs(n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=170)
    elif shape == "moons":
        dataset = datasets.make_moons(n_samples=n_samples, noise=.05)
    elif shape == "circles":
        dataset = datasets.make_circles(n_samples=n_samples, factor=.5, noise=.05)
    else:
        # Previously an unknown name fell through every branch and crashed
        # later on the undefined local; fail early with a clear message.
        raise ValueError(
            "Unknown dataset shape %r: expected 'blobs', 'varied', 'moons' or 'circles'."
            % (shape,)
        )

    X, _ = dataset  # labels are discarded, only coordinates are returned
    X = StandardScaler().fit_transform(X)  # normalize dataset for easier parameter selection
    return X