-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathgreedy_clustering.py
101 lines (85 loc) · 3.12 KB
/
greedy_clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""
Given the output of correlations.py, which calculates pairwise Pearson correlations, generate all
possible clusters using a leader clustering approach.
"""
import os
import sys
import argparse
import numpy as np
from typing import List, Any, Dict, Set
def merge_clusters(clx, mmx, x, y, verbose=False):
    """
    Fold the cluster that holds ``y`` into the cluster that holds ``x``.

    Both mappings are updated in place and the same objects are handed back.
    When ``x`` and ``y`` already share a cluster nothing is changed.

    :param clx: For the member of a cluster, the cluster number
    :type clx: dict[str, int]
    :param mmx: For a cluster ID, a set of the members
    :type mmx: Dict[int, Set[str]]
    :param x: first member; its cluster ID is the one that survives
    :type x: str
    :param y: second member; its cluster ID is removed from ``mmx``
    :type y: str
    :param verbose: write a progress line to stderr
    :type verbose: bool
    :return: the (mutated) ``clx`` and ``mmx``
    :rtype: dict[str, int], Dict[int, Set[str]]
    """
    if verbose:
        sys.stderr.write(f"Merging clusters At {x} and {y}: There are {len(clx)} members and {len(mmx)} clusters\n")

    survivor, absorbed = clx[x], clx[y]
    if survivor != absorbed:
        # re-home every member of the absorbed cluster, then drop its entry
        for name in mmx[absorbed]:
            clx[name] = survivor
            mmx[survivor].add(name)
        mmx.pop(absorbed)
    return clx, mmx
def cluster(inputfile, threshold, verbose=False):
    """
    Calculate the clusters

    Each non-blank line of the input is split on tabs: the first two columns are
    the names of a pair and the third is parsed as a float score. Pairs whose
    score is NaN or below ``threshold`` are ignored. Every remaining pair is
    placed in the same cluster, merging two existing clusters when the pair
    bridges them.

    :param str inputfile: the input file
    :param float threshold: the threshold for being included in a cluster
    :param verbose: more output (passed on to merge_clusters)
    :return: the clusters and their members
    :rtype: dict[str, int], Dict[int, Set[str]]
    :raises IndexError: if a non-blank line has fewer than three columns
    :raises ValueError: if the third column is not a number
    """
    clusters: Dict[str, int] = {}
    members: Dict[int, Set[str]] = {}
    cluster_number: int = 0

    with open(inputfile, 'r') as f:
        for li in f:
            stripped = li.strip()
            if not stripped:
                # a blank line has no columns; indexing it would raise IndexError
                continue
            p = stripped.split("\t")
            first, second = p[0], p[1]
            # parse the score once; float() already understands the text 'nan'
            score = float(p[2])
            if np.isnan(score) or score < threshold:
                continue

            # is either member already in a cluster?
            if first in clusters and second in clusters:
                if clusters[first] != clusters[second]:
                    clusters, members = merge_clusters(clusters, members, first, second, verbose)
            elif first in clusters:
                xc = clusters[first]
                clusters[second] = xc
                members[xc].add(second)
            elif second in clusters:
                xc = clusters[second]
                clusters[first] = xc
                members[xc].add(first)
            else:
                # neither has been seen: start a new cluster for the pair
                cluster_number += 1
                clusters[first] = cluster_number
                clusters[second] = cluster_number
                members[cluster_number] = {first, second}
    return clusters, members
if __name__ == "__main__":
    # Command-line entry point: cluster the pairs in --file and write one line per cluster.
    parser = argparse.ArgumentParser(
        description='Cluster members whose pairwise correlation meets a threshold')
    parser.add_argument('-f', '--file', help='input file', required=True)
    parser.add_argument('-t', '--threshold', help='threshold', type=float, required=True)
    parser.add_argument('-o', '--output', help='output file', required=True)
    parser.add_argument('-v', '--verbose', help='verbose output', action='store_true')
    args = parser.parse_args()

    clusters, members = cluster(args.file, args.threshold, args.verbose)
    # Output format: "<cluster id> (<size>)" followed by the tab-separated member names.
    with open(args.output, 'w') as out:
        for m, names in members.items():
            # sets of str iterate in an order that can change between runs,
            # so sort the names to keep the output file reproducible
            mems = "\t".join(sorted(names))
            out.write(f"{m} ({len(names)})\t{mems}\n")