-
Notifications
You must be signed in to change notification settings - Fork 0
/
2023-02-13--plot-top-virus-correlations.py
executable file
·116 lines (89 loc) · 3.41 KB
/
2023-02-13--plot-top-virus-correlations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/env python3
import sys
import json
import datetime
import itertools
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
# accession -> date, site, nreads
metadata = {}
with open("longest-timeseries-nreads.tsv") as inf:
for line in inf:
accession, date, site, nreads = line.strip().split()
metadata[accession] = (
datetime.datetime.fromisoformat(date), site, int(nreads))
accession_counts = {} # accession -> vid -> count
for accession in metadata:
with open("2023-02-07--top-counts/%s.json" % accession) as inf:
accession_counts[accession] = json.load(inf)
all_vids = set()
for vids in accession_counts.values():
all_vids.update(vids)
all_vids = list(sorted(all_vids))
# sample -> n_reads
sample_readcounts = {}
def to_slug(vid):
return vid.split(".")[0]
first_date, *_, last_date = sorted(
date for (date, _, _) in metadata.values())
all_sites = sorted(set(
site for (_, site, _) in metadata.values()))
first_site, *_, last_site = all_sites
def to_date_color(accession):
date, _, _ = metadata[accession]
return (last_date - date) / (last_date - first_date)
site_colors=cm.rainbow([i/(len(all_sites)-1) for i in range(len(all_sites))])
def to_site_color(accession):
_, site, _ = metadata[accession]
return site_colors[all_sites.index(site)]
fig, ax = plt.subplots(nrows=len(all_vids),
ncols=len(all_vids),
#sharex=True, sharey=True,
figsize=(2.5*len(all_vids), 2*len(all_vids)),
constrained_layout=True)
def to_relative_abundance(n, accession):
_, _, total_reads = metadata[accession]
return n/total_reads
plt.suptitle("per-sample abundance correlations, date-colored")
fig.supxlabel("relative abundance")
fig.supylabel("relative abundance")
for row, vid1 in enumerate(all_vids):
slug1 = to_slug(vid1)
for col, vid2 in enumerate(all_vids):
slug2 = to_slug(vid2)
a = ax[row, col]
xs = []
ys = []
cs = []
for accession, counts in accession_counts.items():
xs.append(to_relative_abundance(counts[vid1], accession))
ys.append(to_relative_abundance(counts[vid2], accession))
cs.append(to_date_color(accession))
a.scatter(xs, ys, c=cs)
a.set_title("%s-%s" % (slug1, slug2))
fig.savefig("corr-norm-date.png", dpi=180)
plt.clf()
fig, ax = plt.subplots(nrows=len(all_vids),
ncols=len(all_vids),
#sharex=True, sharey=True,
figsize=(2.5*len(all_vids), 2*len(all_vids)),
constrained_layout=True)
plt.suptitle("per-sample abundance correlations, site-colored")
fig.supxlabel("relative abundance")
fig.supylabel("relative abundance")
for row, vid1 in enumerate(all_vids):
slug1 = to_slug(vid1)
for col, vid2 in enumerate(all_vids):
slug2 = to_slug(vid2)
a = ax[row, col]
xs = []
ys = []
cs = []
for accession, counts in accession_counts.items():
xs.append(to_relative_abundance(counts[vid1], accession))
ys.append(to_relative_abundance(counts[vid2], accession))
cs.append(to_site_color(accession))
a.scatter(xs, ys, c=cs)
a.set_title("%s-%s" % (slug1, slug2))
fig.savefig("corr-norm-site.png", dpi=180)