forked from levon003/python-cure-implementation
-
Notifications
You must be signed in to change notification settings - Fork 0
/
cure.py
317 lines (287 loc) · 13.4 KB
/
cure.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
# cure.py
# by Freddy Stein and Zach Levonian
# Written as a final project for CS324 - Data Mining
# This file loads the pickled author data and creates clusters from it.
import sys, math
from pickleCreator import *
from kMeansAuthors import *
import itertools
# Percentage of the authors to use in the initial clustering
PRELIM_DATA_PERCENTAGE = 0.15
# Percentage of each preliminary cluster to use as representative points
REPRESENTATIVE_POINTS_PERCENTAGE = 0.005
# Percentage of distance towards cluster centroid each representative point should travel
CENTROID_MIGRATION_PERCENTAGE = 0.20
# Min distance between CURE clusters without merging
CLUSTER_MERGE_DISTANCE = 0.02
# This class describes the conceptual clusters used in the CURE algorithm,
# and contains list to contain the points (Authors) within it as well
# as the representative points.
class CureCluster:
def __init__(self, id__, center__):
self.id = id__
self.authors = []
self.repPoints = []
self.center = center__
def __repr__(self):
return "Cluster #" + str(self.id) + " Size: " + str(len(self.authors))
def addAuthor(self, user):
self.authors.append(user)
# Computes and stores the centroid of this cluster, based on its authors
def computeCentroid(self):
sumPoints = [0 for i in range(6)]
for author in self.authors:
addToList(sumPoints, author.getData())
totalPoints = len(self.authors)
average = [ sumPoints[i] / totalPoints for i in range(6)]
self.center = average
# Computes and stores representative points for this cluster, based on its
# center and the fixed percentage of points to choose.
def computeRepPoints(self):
# Choose the first rep point to be the point furthest point from the "center"
farthestAuthor = None
farthestDistance = -1
for author in self.authors:
dist = clustering.getEucSquaredDistance(author.getData(), self.center)
if dist > farthestDistance:
farthestAuthor = author
farthestDistance = dist
self.repPoints.append(farthestAuthor.getData())
numPointsToChoose = int(math.floor(len(self.authors) * REPRESENTATIVE_POINTS_PERCENTAGE))
# Keep adding points that maximize total distance from each other
while len(self.repPoints) < numPointsToChoose:
farthestAuthor = None
farthestTotalDistance = -1
for author in self.authors:
totalDist = 0
for rep in self.repPoints:
totalDist += clustering.getEucSquaredDistance(author.getData(), rep)
if totalDist > farthestTotalDistance:
farthestTotalDistance = totalDist
farthestAuthor = author
self.repPoints.append(farthestAuthor.getData())
# Migrates each representative point a fixed percentage towards
# the centroid of the cluster
def moveRepPoints(self):
for repPoint in self.repPoints:
for i in range(len(repPoint)):
distToCenter = math.sqrt((repPoint[i] - self.center[i])**2)
moveDist = distToCenter * CENTROID_MIGRATION_PERCENTAGE
if repPoint[i] < self.center[i]:
repPoint[i] += moveDist
elif repPoint[i] > self.center[i]:
repPoint[i] -= moveDist
# Merges this cluster with the given clust, recomputing the centroid
# and the representative points
def mergeWithCluster(self, clust):
for author in clust.authors:
self.addAuthor(author)
self.computeCentroid()
self.computeRepPoints()
self.moveRepPoints()
####
# Coordinates the running of the CURE algorithm, calling the relevant functions
# and ultimately returning the clusters.
# Input: The dictionary of authors, and the number of clusters to create. (authors, k)
# Output: The clusters of authors, as created by the CURE clustering method (clusters)
def runCURE(authors, k):
print "Standardizing author data."
authors = standardizeAuthors(authors)
print "Data standardized. Running preliminary clustering with k=" + str(k) + "."
prelimClusters, centers, smallAuthors = prelimClustering(k, authors)
print "Preliminary clustering complete. Building cure clusters."
clusters = buildCureClusters(prelimClusters, centers, authors, k)
print "Clusters initialized; choosing representative points."
clusters = generateRepresentativePoints(clusters)
print "Representative points chosen. Merging close clusters."
clusters = mergeCloseClusters(clusters)
print "Merging complete. " + str(len(clusters)) + " clusters remain."
print "Assigning remaining data."
clusters = assignRemainingData(clusters, authors, smallAuthors)
print "All points assigned. CURE complete."
return clusters
# Assigns all authors that weren't added via the preliminary clustering
# to an existing cluster based upon the nearest representative point.
# Input: the list of clusters, the dictionary of authors, and the dictionary of authors
# involved in the initial clustering.
# Output: An updated list of clusters, which now contains all the authors in them. (clusters)
def assignRemainingData(clusters, authors, smallAuthors):
for authorId in authors:
if authorId not in smallAuthors:
clust = getClosestCluster(authors[authorId], clusters)
clust.addAuthor(authors[authorId])
return clusters
# Helper function for assignRemainingData()
# Determines the cluster associated with the representative point closest
# to the given author.
# Input: a given author of class Author, and the list of clusters (author, clusters)
# Output: the clostest cluster for the author, based on repPoints (clustChoice)
def getClosestCluster(author, clusters):
clustChoice = None
minDist = 99999
authorData = author.getData()
for cluster in clusters:
for repPoint in cluster.repPoints:
dist = clustering.getEucSquaredDistance(authorData, repPoint)
if dist < minDist:
minDist = dist
clustChoice = cluster
return clustChoice
# Attempts to merge clusters based on the distance between their closest
# reprsentative points; may result in cluster deletion.
# Input: the list of all the cluster (clusters)
# Output: the list of clusters, which may have had clusters merged together. (clusters)
def mergeCloseClusters(clusters):
clustersMerged = False
while clustersMerged:
for i in range(len(clusters)):
for j in range(i, len(clusters)):
closestDist = getClosestClusterDist(clusters[i], clusters[j])
if closestDist < CLUSTER_MERGE_DISTANCE:
clusters[i].mergeWithCluster(clusters[j])
del clusters[j]
clustersMerged = True
break
if clustersMerged:
clustersMerged = False
break
return clusters
# Helper function for mergeCloseClusters()
# Determines the closest distance between any two representative points for two clusters
# Input: Two clusters of type CureCluster (clust1, clust2)
# Output: the closest distance between any two representative points in the clusters (minDist)
def getClosestClusterDist(clust1, clust2):
minDist = 9999
for repPoint1 in clust1.repPoints:
for repPoint2 in clust2.repPoints:
dist = clustering.getEucSquaredDistance(repPoint1, repPoint2)
if dist < minDist:
minDist = dist
return minDist
# For each CURE cluster, computes the representative points
# Input: the list of clusters (clusters)
# Output: the list of clusters, but the clusters now have representative points associated
# with themselves (clusters)
def generateRepresentativePoints(clusters):
i = 0
for cluster in clusters:
cluster.computeRepPoints()
cluster.moveRepPoints()
i += 1
print " Cluster " + str(i) + " complete."
return clusters
# Generates initial CURE clusters from the preliminary clusters
# Input: the preliminary Clusters created by k-means, the centers for those clusters,
# the dictionary of authors, and the number of clusters to create.
# (prelimClusters, center, authors, k)
# Output: the new list of clusters, which is a list of CureClusters (clusters)
def buildCureClusters(prelimClusters, centers, authors, k):
clusters = []
for i in range(k):
newClust = CureCluster(i, centers[i])
clusters.append(newClust)
for id in prelimClusters:
clusters[prelimClusters[id]].addAuthor(authors[id])
return clusters
# Performs an initial k-Means clustering on a percentage of the dataset
# to give us some cluster assignments to refine with CURE.
# Input: the number of clusters to create, and the dictionary of authors (k, authors)
# Output: the dictionary of clusters, the centers for those clusters, and the dictionary
# of authors used to make the initial clusters. (clusters, centers, smallAuthors)
def prelimClustering(k, authors):
smallAuthors = dict([])
numEntries = int(math.floor(len(authors) * PRELIM_DATA_PERCENTAGE))
print "Using " + str(numEntries) + " authors as the preliminary dataset."
i = 0
for id in authors:
smallAuthors[id] = authors[id]
i += 1
if i > numEntries:
break
print "Computing initial cluster centers."
initialCenters = getInitialCenters(k, smallAuthors)
print "Centers chosen. Running kMeans."
clusters, centers = kMeans(smallAuthors, initialCenters)
print "kMeans complete; clusters found."
return clusters, centers, smallAuthors
####
# Given a dictionary of authors, regenerates the orignal values in the feature vector
# Using the original stored value of each feature.
# Input: the dictionary of authors (authors)
# Output: the same dictionary, but with the original values (authors)
def destandardizeAuthors(authors):
for authorId in authors:
authors[authorId].buildRepList()
return authors
# Standardizes each feature by putting it on a 0 to 1 scale
# Actually, fits it on a 0.01-1.01 scale, so we don't need to worry about 0s.
# We chose to standardize in this way because we needed a similar range to
# account for the huge difference between, for example, the year (1994) and
# the number of journals (2). Log standardization wouldn't account for that.
# http://www.biomedware.com/files/documentation/boundaryseer/Preparing_data/Methods_for_data_standardization.htm
# Input: the dictionary of authors (authors)
# Output: the dictionary of authors, but each author has been standardized via the method
# described in the link (authors)
def standardizeAuthors(authors):
maxs, mins = getMaxsAndMins(authors)
for authorId in authors:
author = authors[authorId].getData()
for i in range(len(author)):
author[i] = (float(author[i] - mins[i]) / (maxs[i] - mins[i]) ) + 0.01
return authors
# Given a series of clusters of data standardized by the original function,
# finds the original values by reversing the calculation
# Input: the list of CureClusters and the dictionary of authors. (clusters, authors)
# Output: The list of destandardized clusters. Also, authors is destandardized as well.
# (clusters)
def destandardizeClusterCenters(clusters, authors):
destandardizeAuthors(authors)
maxs, mins = getMaxsAndMins(authors)
for cluster in clusters:
for i in range(len(cluster.center)):
cluster.center[i] = ((cluster.center[i] - 0.01) * (maxs[i] - mins[i])) + mins[i]
return clusters
# Determines the distance from each author to its cluster center. This uses the standardized
# values. Prints both the individual error, and the total error.
# Input: the list of the CureClusters (clusters)
# Output: none, but print the error of the clusters
def determineClustError(clusters):
j = 1
aggError = 0
for cluster in clusters:
totalDist = 0
for author in cluster.authors:
for i in range(len(author.repList)):
totalDist += math.sqrt((author.repList[i] - cluster.center[i])**2)
print "Total Clustering Error for Cluster #" + str(j) + ": " + str(totalDist)
j += 1
aggError += totalDist
print "The total Clustering Error for all Clusters is: " + str(aggError)
# A helper function which prints the clusters in a way which is readable, and gives
# information about them. Also destandardizes them for readability purposes.
# Input: the list of CureClusters, and the dictionary of authors. (clusters, authors)
# Output: none, but prints information
def printClusters(clusters, authors):
clusters = destandardizeClusterCenters(clusters, authors)
i = 1
for cluster in clusters:
print "Cluster " + str(i) + ":"
print "\tCentroid: " + str(cluster.center)
print "\tNum Authors: " + str(len(cluster.authors))
i += 1
def main():
k = 5
if (len(sys.argv) != 2):
print "Usage: pypy cure.py k"
print "Continuing with k = 5"
else:
k = int(sys.argv[1])
print "Attempting to load author data pickle."
authors = getAuthorsPickle("authorsFull.p")
# Can also load up authorsSmall.p for a faster runtime
print str(len(authors)) + " authors in dataset."
clusters = runCURE(authors, k)
determineClustError(clusters)
printClusters(clusters, authors)
if __name__ == '__main__':
main()