-
Notifications
You must be signed in to change notification settings - Fork 0
/
2023-05-08--spurbeck-EFGH-sites-comparison.py
105 lines (83 loc) · 3.31 KB
/
2023-05-08--spurbeck-EFGH-sites-comparison.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import os
import gzip
import json
import subprocess
dashboard = os.path.expanduser("~/code/mgs-pipeline/dashboard/")
with open(os.path.join(dashboard, "human_virus_sample_counts.json")) as inf:
human_virus_sample_counts = json.load(inf)
with open(os.path.join(dashboard, "metadata_samples.json")) as inf:
metadata_samples = json.load(inf)
with open(os.path.join(dashboard, "metadata_bioprojects.json")) as inf:
metadata_bioprojects = json.load(inf)
with open(os.path.join(dashboard, "metadata_papers.json")) as inf:
metadata_papers = json.load(inf)
spurbeck_bioproject, = metadata_papers["Spurbeck 2023"]["projects"]
samples = metadata_bioprojects[spurbeck_bioproject]
target_cc_taxids = {
694003: "betacoronavirus 1",
95341: "sapovirus",
39733: "astroviridae",
694009: "sars-cov-2",
142786: "norovirus",
12234: "tobamovirus",
2559587: "other riboviria",
10239: "other viruses",
}
target_hv_taxids = {
2559587: "riboviria",
10239: "viruses"
}
# sample -> taxid -> count
ccs = {}
for sample in samples:
cladecounts = "%s.tsv.gz" % sample
if not os.path.exists(cladecounts):
subprocess.check_call(
["aws", "s3", "cp", "s3://nao-mgs/%s/cladecounts/%s" % (
spurbeck_bioproject, cladecounts), "."])
counts = {}
with gzip.open(cladecounts) as inf:
for line in inf:
taxid, _, _, clade_assignments, _ = line.strip().split()
taxid = int(taxid)
clade_assignments = int(clade_assignments)
if taxid in target_cc_taxids:
counts[taxid] = clade_assignments
ccs[sample] = counts
import matplotlib.pyplot as plt
sites = set(metadata_samples[sample]["fine_location"]
for sample in samples)
for site in list(sites):
if site not in "EFGH":
sites.remove(site)
markers=("d", "v", "s", "*", "^")
for i, taxid in enumerate(sorted(target_cc_taxids)):
fig, ax = plt.subplots(constrained_layout=True)
plt.suptitle("relative abundances by sampling site")
fig.supylabel("relative abundance of clade")
fig.supxlabel("date")
plt.xticks(rotation=45, ha='right')
ax.set_title(target_cc_taxids[taxid])
ax.set_yscale("log")
for site_index, site in enumerate(sorted(sites)):
xs = []
ys = []
for sample in samples:
if metadata_samples[sample]["fine_location"] == site:
xs.append(metadata_samples[sample]["date"])
val = ccs[sample].get(taxid, 0)
if target_cc_taxids[taxid] == "other riboviria":
for other_taxid, other_taxid_name in target_cc_taxids.items():
if other_taxid_name not in [
"other riboviria",
"other viruses"]:
val -= ccs[sample].get(other_taxid, 0)
if target_cc_taxids[taxid] == "other viruses":
val -= ccs[sample].get(2559587, 0) # riboviria
ys.append(val /
metadata_samples[sample]["reads"])
ax.scatter(xs, ys, label=site, marker=markers[site_index])
ax.legend()
fig.savefig("efgh-%s.png" % ( target_cc_taxids[taxid].replace(" ", "-")),
dpi=180)
plt.clf()