-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkmedoids.py
89 lines (69 loc) · 2.54 KB
/
kmedoids.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 11 12:01:59 2017
@author: Leila Jamshidian Sales
"""
NUMBER_OF_CLUSTERS = 3
import numpy as np
def kMedoids(D, k, tmax=100):
# determine dimensions of distance matrix D
m, n = D.shape
if k > n:
raise Exception('too many medoids')
# randomly initialize an array of k medoid indices
M = np.arange(n)
np.random.shuffle(M)
M = np.sort(M[:k])
# create a copy of the array of medoid indices
Mnew = np.copy(M)
# initialize a dictionary to represent clusters
C = {}
for t in range(tmax): #tmax is the number of iterations of the algorithm performed
# determine clusters, i. e. arrays of data indices
J = np.argmin(D[:,M], axis=1)
for kappa in range(k):
C[kappa] = np.where(J==kappa)[0]
# update cluster medoids
for kappa in range(k):
J = np.mean(D[np.ix_(C[kappa],C[kappa])],axis=1)
j = np.argmin(J)
Mnew[kappa] = C[kappa][j]
np.sort(Mnew)
# check for convergence
if np.array_equal(M, Mnew):
break
M = np.copy(Mnew)
else:
# final update of cluster memberships
J = np.argmin(D[:,M], axis=1)
for kappa in range(k):
C[kappa] = np.where(J==kappa)[0]
# return results
return M, C
#Importing precomputed Distance matrix
import pickle
distance_matrix = pickle.load( open( "C:/Users/grover/.spyder-py3/save.p", "rb" ) )
print ('The size of distance matrix is ',distance_matrix.shape)
#Calculating the clusters
import time
time_start = time.clock()
M, C = kMedoids(distance_matrix, NUMBER_OF_CLUSTERS)
print ('time taken: ', time.clock() - time_start)
#Reading the names of Protiens
import csv
with open("C:/Users/grover/Downloads/data.csv") as csvfile:
readCSV = csv.reader(csvfile, delimiter=',')
data = []
names = []
for row in readCSV:
if row:
names.append(row[0])
data.append(row[3])
# Showing the Results
for i in range(len(M)):
print( 'The mediod ',i,' is ',names[M[i]] )
from algo61 import greedyEditdistance
for i in range(len(M)):
for j in range(i+1,len(M)):
print ('The (greedy)Edit distance between medoids ',i, 'and',j,' is ',greedyEditdistance(data[M[i]],data[M[j]]), ' while their lengths are ', len(data[M[i]]) , ' and ',len(data[M[j]]) )
print ('This proves that the found clusters are significantly different from each other')