-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathdistances.py
86 lines (73 loc) · 2.43 KB
/
distances.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
"""
Given a tab-separated file with data in columns, print the euclidean distance
between every pair of columns and plot each pair of columns against each other.
"""
import os
import sys
import argparse
from itertools import combinations
from scipy.spatial import distance
import matplotlib.pyplot as plt
def read_data(f, h, c):
    """
    Read a tab-separated file and store the data column by column.

    :param f: the file to read
    :param h: whether the first line of the file is a header line
    :param c: whether the first column is data (True) or a label that is
        discarded (False)
    :return: a tuple (data, headers). data is a list with one list of floats
        per retained column. headers has one entry per retained column: the
        header text when h is true, otherwise the column's index in the file.
    :raises ValueError: if a retained field cannot be converted to a float
    """
    # index of the first column that holds data
    start = 0 if c else 1
    data = []
    headers = []
    firstline = True
    with open(f, 'r') as fin:
        for l in fin:
            # NOTE: strip() also removes leading tabs, so an empty first
            # field shifts the remaining columns left.
            stripped = l.strip()
            if not stripped:
                # blank lines carry no values; float("") would raise
                continue
            p = stripped.split("\t")
            if firstline:
                firstline = False
                # the first line fixes how many columns are retained
                data = [[] for _ in range(start, len(p))]
                if h:
                    headers = p[start:]
                    # a header line holds names, not values
                    continue
                # no header line: label the columns by index and fall through
                # so the first line is parsed as data like every other line
                headers = list(range(start, len(p)))
            for i in range(start, len(p)):
                data[i - start].append(float(p[i]))
    return data, headers
def pairwise(data, headers):
    """
    Print the euclidean distance between every pair of columns.

    One line is written to standard output per pair, as three tab-separated
    fields: the first column's header, the second column's header and the
    distance between the two columns.

    :param data: the columns of values; data[k] belongs to headers[k]
    :param headers: one label per column
    :return: None
    """
    labelled = enumerate(headers)
    for (first, first_name), (second, second_name) in combinations(labelled, 2):
        dist = distance.euclidean(data[first], data[second])
        print("{}\t{}\t{}".format(first_name, second_name, dist))
def plot_pairs(data, headers):
    """
    Scatter-plot every pair of columns against each other in a single figure.

    The panels are arranged in two rows and filled column by column. The grid
    is at least 2x2 and gains extra columns when there are more than four
    pairs, so that every pair gets its own panel.

    :param data: the columns of values; data[k] belongs to headers[k]
    :param headers: one label per column, used in the panel titles
    :return: None; the figure is displayed with plt.show()
    """
    nrows = 2
    pairs = list(combinations(range(len(headers)), 2))
    # ceil(len(pairs) / nrows), but never narrower than the 2x2 grid
    ncols = max(2, -(-len(pairs) // nrows))
    f, axarr = plt.subplots(nrows, ncols)
    for n, (i, j) in enumerate(pairs):
        # fill down each grid column before moving to the next one
        plty, pltx = divmod(n, nrows)
        axarr[pltx, plty].plot(data[i], data[j], 'ro')
        axarr[pltx, plty].set_title('{} versus {}'.format(headers[i], headers[j]))
    plt.tight_layout()
    plt.show()
if __name__ == '__main__':
    # Command-line entry point: read the columns from the file given with -f,
    # print the distance between every pair of columns, then plot the pairs.
    parser = argparse.ArgumentParser(description="Generate all pairwise euclidean distances between the columns of data")
    parser.add_argument('-f', help='file of data with data in columns', required=True)
    parser.add_argument('-l', help='first line is a header line (will be used in output)', action='store_true')
    parser.add_argument('-c', help='first column is data and should be included (default: the first columns is labels that are discarded)', action='store_true')
    # nargs='?' lets -v be used as a bare flag while still accepting the
    # "-v VALUE" form that the option previously required
    parser.add_argument('-v', help='verbose output', nargs='?', const=True)
    args = parser.parse_args()

    data, headers = read_data(args.f, args.l, args.c)
    pairwise(data, headers)
    plot_pairs(data, headers)