-
Notifications
You must be signed in to change notification settings - Fork 0
/
2023-01-31--overlay-accuracy-by-location.py
executable file
·69 lines (57 loc) · 2 KB
/
2023-01-31--overlay-accuracy-by-location.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/usr/bin/env python3
import sys
import glob
slugs = sys.argv[1:]
viruses = {}
for slug in slugs:
d = {
"assembled": {},
"genbank_top": {},
"genbank_closest": {},
}
d["assembled"]["fasta"], = glob.glob("%s.*.assembled.fasta" % slug)
d["genbank_top"]["fasta"] = d["assembled"]["fasta"].replace(
".assembled.", ".")
d["genbank_closest"]["fasta"], = (
set(glob.glob("%s.*.fasta" % slug)) -
set([d["assembled"]["fasta"], d["genbank_top"]["fasta"]]))
for k in d.values():
k["loc_full"] = []
k["loc_any"] = []
k["loc_rc"] = []
for fname in glob.glob(
"reads-%s/*.locs.tsv" % k["fasta"].replace(".fasta", "")):
with open(fname) as inf:
for line in inf:
loc, obs_full, obs_any, obs_rc = [
int(x) for x in line.strip().split()]
if loc >= len(k["loc_full"]):
k["loc_full"].append(0)
k["loc_any"].append(0)
k["loc_rc"].append(0)
k["loc_full"][loc] += obs_full
k["loc_any"][loc] += obs_any
k["loc_rc"][loc] += obs_rc
viruses[slug] = d
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
labels = [
("loc_full", "exact matches only"),
("loc_any", "any k-mer matches"),
("loc_rc", "whole reads where any k-mer matches"),
]
plt.close()
plt.clf()
fig, ax = plt.subplots()
tag="loc_full"
ax.set_ylabel("reads")
ax.set_xlabel("position along genome")
ax.set_title("reads by location (exact match)")
ax.xaxis.set_major_formatter(mtick.PercentFormatter())
for i, (slug, d) in enumerate(sorted(viruses.items())):
k = d["assembled"]
loc_percentages = [x / len(k[tag]) * 100
for x in range(len(k[tag]))]
ax.plot(loc_percentages, k[tag], label=slug.upper())
ax.legend()
fig.savefig("overlay-%s.png" % "-".join(sorted(viruses)), dpi=180)