-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathclustering.py
62 lines (50 loc) · 1.83 KB
/
clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
"""
Cluster genes based on %id with cutoffs
"""
import os
import sys
import argparse
import scipy
import scipy.cluster.hierarchy as sch
def parse_text_file(tf):
    """
    Parse a tab-separated text file of pairwise values and return them as a condensed (n-choose-2) list.

    Every non-blank line must hold at least three tab-separated fields: element ``a``, element ``b`` and a
    numeric value. The value is divided by 100 and stored symmetrically for (a, b) and (b, a); if a pair
    appears more than once the last value wins. The element names are then sorted, and the returned list has
    the value from the first element to all later elements, then from the second element to the remaining
    n-2 elements, then from the third element to the remaining n-3 elements, and so on.

    :param tf: Text file with [a, b, distance]
    :type tf: str
    :return: n-choose-2 list of the data, ordered by sorted element name.
    :rtype: list
    :raises ValueError: if a non-blank line has fewer than three fields, or its value is not a number.
    :raises KeyError: if the file holds no value for some pair of elements.
    """

    data = {}
    with open(tf, 'r') as fin:
        for lineno, line in enumerate(fin, start=1):
            p = line.strip().split("\t")
            if p == ['']:
                # Blank lines (e.g. a stray empty line at the end of the file) carry no data.
                continue
            if len(p) < 3:
                raise ValueError(
                    "{}: line {} has {} field(s); expected a, b and distance separated by tabs".format(
                        tf, lineno, len(p)))
            value = float(p[2]) / 100
            # Store both directions so the lookup below works whichever order the pair was listed in.
            data.setdefault(p[0], {})[p[1]] = value
            data.setdefault(p[1], {})[p[0]] = value

    # Every element seen in either column is a key of data, so the sorted keys are the full element list.
    allkeys = sorted(data)

    nct = []
    for i, first in enumerate(allkeys):
        row = data[first]
        for second in allkeys[i + 1:]:
            if second not in row:
                raise KeyError("no distance found for pair ({!r}, {!r}) in {}".format(first, second, tf))
            nct.append(row[second])
    return nct
if __name__ == '__main__':
    # Command-line entry point: read the pairwise values, build a complete-linkage hierarchy, then cut it at
    # every threshold from 0.00 to 1.00 in steps of 0.01 and report the flat clusters found at each cut.
    import numpy as np

    parser = argparse.ArgumentParser(description="Cluster genes based on %id with cutoffs")
    parser.add_argument('-t', help='file with [a, b, distance] separated by tabs', required=True)
    parser.add_argument('-o', help='clusters output file name. We print them out in json format', required=True)
    args = parser.parse_args()

    matrix = parse_text_file(args.t)
    L = sch.linkage(matrix, method='complete')

    # The context manager guarantees the output file is flushed and closed even if clustering fails part-way.
    with open(args.o, 'w') as out:
        for i in range(101):
            ind = sch.fcluster(L, i/100.0, 'distance')
            # str() on a NumPy array abbreviates anything longer than 1000 elements with "...", which would
            # silently drop cluster assignments; raising the threshold keeps the same text but never truncates.
            # NOTE(review): the line format is kept as-is for existing consumers; it is not strict JSON.
            labels = np.array2string(ind, threshold=sys.maxsize)
            out.write("{" + str(i) + " : " + labels + "},\n")
            # stdout gets: (100 - threshold) <TAB> number of clusters at that threshold.
            print("{}\t{}".format(100-i, max(ind)))