-
Notifications
You must be signed in to change notification settings - Fork 0
/
2023-02-07--plot-top-virus-correlations-to-average.py
executable file
·123 lines (92 loc) · 3.04 KB
/
2023-02-07--plot-top-virus-correlations-to-average.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python3
import sys
import json
import datetime
import itertools
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
# accession -> date, site
metadata = {}
with open("longest-timeseries.tsv") as inf:
for line in inf:
accession, date, site = line.strip().split()
metadata[accession] = (
datetime.datetime.fromisoformat(date), site)
accession_counts = {} # accession -> vid -> count
for accession in metadata:
with open("2023-02-07--top-counts/%s.json" % accession) as inf:
accession_counts[accession] = json.load(inf)
all_vids = set()
for vids in accession_counts.values():
all_vids.update(vids)
all_vids = list(sorted(all_vids))
def to_slug(vid):
return vid.split(".")[0]
first_date, *_, last_date = sorted(
date for (date, _) in metadata.values())
all_sites = sorted(set(
site for (_, site) in metadata.values()))
first_site, *_, last_site = all_sites
def to_date_color(accession):
date, _ = metadata[accession]
return (last_date - date) / (last_date - first_date)
site_colors=cm.rainbow([i/(len(all_sites)-1) for i in range(len(all_sites))])
def to_site_color(accession):
_, site = metadata[accession]
return site_colors[all_sites.index(site)]
fig, ax = plt.subplots(nrows=3,
ncols=3,
figsize=(9,9),
constrained_layout=True)
plt.suptitle("per-sample abundance vs average abundance")
fig.supxlabel("mean abundance of other viruses")
fig.supylabel("abundance of this virus")
for i, vid1 in enumerate(all_vids):
slug = to_slug(vid1)
xs = []
ys = []
cs = []
for accession, counts in accession_counts.items():
ys.append(counts[vid1])
total = 0
count = 0
for vid2 in all_vids:
if vid1 == vid2:
continue
total += counts[vid2]
count += 1
xs.append(total / count)
cs.append(to_site_color(accession))
a = ax[i//3][i%3]
a.scatter(xs, ys, c=cs)
a.set_title("%s" % (slug))
fig.savefig("corr-avg.png", dpi=180)
plt.clf()
fig, ax = plt.subplots(nrows=3,
ncols=3,
figsize=(9,9),
constrained_layout=True)
plt.suptitle("per-sample abundance vs geometric mean abundance")
fig.supxlabel("geometric mean abundance of other viruses")
fig.supylabel("abundance of this virus")
for i, vid1 in enumerate(all_vids):
slug = to_slug(vid1)
xs = []
ys = []
cs = []
for accession, counts in accession_counts.items():
ys.append(counts[vid1])
product = 1
count = 0
for vid2 in all_vids:
if vid1 == vid2:
continue
product *= counts[vid2]
count += 1
xs.append(product ** (1/count))
cs.append(to_site_color(accession))
a = ax[i//3][i%3]
a.scatter(xs, ys, c=cs)
a.set_title("%s" % (slug))
fig.savefig("corr-gavg.png", dpi=180)