-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathSubKMeans.py
207 lines (176 loc) · 6.68 KB
/
SubKMeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
from Utilities import rvs
import numpy as np
import math
import random
"""
Python implementation of SubKMeans clustering algorithm
All credits go to authors of the paper "Towards an Optimal Subspace for K-Means"
Dominik Mautz, Wei Ye, Claudia Plant, Christian Böhm
KDD’17, August 13–17, 2017, Halifax, NS, Canada
"""
class SubKMeans:
    # Calculates dataset (or cluster) mean
    @staticmethod
    def calculate_dataset_mean(X):
        """Return the element-wise mean of the rows of X.

        X: sequence of equal-length numeric vectors (list of arrays or 2-D array).
        Returns a 1-D numpy float array of length d, or the scalar 0 when X is
        empty (callers rely on that fallback for empty clusters).
        """
        if len(X) == 0:
            return 0
        # vectorized mean over rows replaces the former manual
        # accumulate-then-np.nditer-divide loop
        return np.mean(np.asarray(X, dtype=float), axis=0)
# Calculates scatter matrix of dataset (or cluster)
@staticmethod
def calculate_scatter_matrix(x):
d = len(x)
if d == 0:
return [0]
c = np.identity(d) - np.multiply(1. / d, np.ones((d, d)))
s_d = np.dot(x.T, c)
s_d = np.dot(s_d, x)
return s_d
# Returns projection matrix onto first m attributes
@staticmethod
def get_projection_matrix(m, d):
p = np.identity(m)
zeros = np.zeros((m, d - m))
p = np.append(p, zeros, 1)
return p
# Returns projection matrix for last (d-m) attributes
@staticmethod
def get_p_n(m, d):
p = np.identity(d - m)
zeros = np.zeros((d - m, m))
p = np.append(zeros, p, 1)
return p
# Returns n random datapoints from given dataset
@staticmethod
def get_random_datapoints(X, n):
return random.choices(X, k=n)
# Evaluates simplified cost function for given datapoint and cluster mean
@staticmethod
def cost_function_i(_x, _cluster_mean):
return math.pow(np.linalg.norm(_x - _cluster_mean), 2)
# Evaluates cost function for given datapoint and cluster mean
# DEPRECATED
def cost_function(self, x, cluster_mean):
_x = np.dot(self.pT, self.V.T)
_x = np.dot(_x, x)
_cluster_mean = np.dot(self.pT, self.V.T)
_cluster_mean = np.dot(_cluster_mean, cluster_mean)
return math.pow(np.linalg.norm(_x - _cluster_mean), 2)
# Performs eigendecomposition of scatter matrices
def eig(self):
sm = np.zeros((self.d, self.d))
for si in self.scatter_mx:
sm = np.add(sm, si)
sm = np.subtract(sm, self.S_D)
return np.linalg.eig(sm)
# Changes the number of dimensions of clustering subspace
# to the amount of negative eigenvalues
def updateM(self, E):
count = 0
for e in E:
if e < 0:
count = count + 1
return count
# Calculates cost function, which is supposed to be minimised.
# If value of cost_function is not changed through iterations, function returns true, meaning convergence
def convergence(self):
total_error = 0.
for error in self.cluster_errors:
total_error = total_error + error
# second part of cost function
_p_n = SubKMeans.get_p_n(self.m, self.d)
_phi_D = np.dot(_p_n, self.V.T)
_phi_D = np.dot(_phi_D, self.phi_D)
for x in self.X:
_x = np.dot(_p_n, self.V.T)
_x = np.dot(_x, x)
total_error = \
total_error + \
self.cost_function_i(_x, _phi_D)
prev = self.cost_function_value
self.cost_function_value = total_error
if math.fabs(prev - total_error) < 0.0001:
return True
# Initialises class, with given number of clusters
def __init__(self, n_clusters=2):
if not isinstance(n_clusters, int) or n_clusters <= 0:
raise ValueError('SubKMeans: Number of clusters should be positive integer')
self.cluster_weights = []
self.n_clusters = n_clusters
self.V = rvs()
self.m = 0
self.d = 0
self.phi_D = []
self.S_D = []
self.cluster_means = []
self.clusters = []
self.scatter_mx = []
self.labels_ = []
self.cluster_errors = []
self.cost_function_value = 0.
self.X = []
self.pT = []
# Performs (subspace) clustering over given dataset X
def fit(self, X):
if len(X) <= 0:
raise ValueError('SubKMeans: Dataset is corrupted')
# INITIALIZATION
self.X = X
self.d = len(X[0])
self.m = int(math.floor(math.sqrt(self.d)))
self.V = rvs(self.d)
self.phi_D = self.calculate_dataset_mean(X)
self.S_D = self.calculate_scatter_matrix(X)
self.cluster_means = self.get_random_datapoints(X, self.n_clusters)
self.pT = SubKMeans.get_projection_matrix(self.m, self.d)
# REPEAT
iterations = 0
while True:
# Renew info about clusters
self.clusters = []
self.scatter_mx = []
self.cluster_errors = []
self.cluster_weights = []
for i in range(0, self.n_clusters):
self.clusters.append([])
self.scatter_mx.append([])
self.cluster_errors.append(0)
_cluster_mean = np.dot(self.pT, self.V.T)
_cluster_mean = np.dot(_cluster_mean, self.cluster_means[i])
self.cluster_weights.append(_cluster_mean)
# ASSIGNMENT STEP
self.labels_ = []
for x in X:
_x = np.dot(self.pT, self.V.T)
_x = np.dot(_x, x)
min_cost = self.cost_function_i(_x, self.cluster_weights[0])
closest_cluster = 0
for i in range(1, self.n_clusters):
cost = self.cost_function_i(_x, self.cluster_weights[i])
if cost < min_cost:
min_cost = cost
closest_cluster = i
self.clusters[closest_cluster].append(x)
self.labels_.append(closest_cluster)
self.cluster_errors[closest_cluster] = self.cluster_errors[closest_cluster] + min_cost
# UPDATE STEP
for i in range(0, self.n_clusters):
self.cluster_means[i] = self.calculate_dataset_mean(self.clusters[i])
self.scatter_mx[i] = self.calculate_scatter_matrix(np.array(self.clusters[i]))
# EIGENDECOMPOSITION
E, self.V = self.eig()
self.m = self.updateM(E)
# CONVERGENCE
if self.convergence():
break
# Infinite loop termination
iterations = iterations + 1
if iterations > 9999:
print("SubKMeans: Too many iterations, aborting clustering...")
break
self.labels_ = np.array(self.labels_)
return self