-
Notifications
You must be signed in to change notification settings - Fork 0
/
2023-02-09--plot-read-contig-start-variance.py
executable file
·94 lines (75 loc) · 2.54 KB
/
2023-02-09--plot-read-contig-start-variance.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/env python3
import sys
import glob
import json
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
samples = {} # accession_{1,2} -> vid -> f/r + loc -> count
for fname in sys.argv[1:]:
sample, _ = fname.split("_")
with open(fname) as inf:
samples[sample] = json.load(inf)
all_vids = set()
for vids in samples.values():
all_vids.update(vids)
all_vids = list(sorted(all_vids))
def to_slug(vid):
return vid.split(".")[0]
max_locs = {
'f': Counter(), # vid -> max_loc
'r': Counter(), # vid -> max_loc
}
for sample in samples:
for vid in all_vids:
for frloc, count in samples[sample][vid].items():
fr, loc = frloc[0], int(frloc[1:])
max_locs[fr][vid] = max(max_locs[fr][vid], loc)
dir_sample_sums = {
'f': {}, # vid -> [sample1 fsum, sample2 fsum, ...]
'r': {}, # vid -> [sample1 fsum, sample2 fsum, ...]
}
for vid in all_vids:
for direction in ['f', 'r']:
dir_sample_sums[direction][vid] = np.array([
sum(
count
for frloc, count in samples[sample][vid].items()
if frloc.startswith(direction)) + 1
for sample in sorted(samples)])
def get_variance(vid, direction):
xs = []
ys = []
for loc in range(max_locs[direction][vid] + 1):
frloc = "%s%s" % (direction, loc)
vals = np.array([samples[sample][vid].get(frloc, 0)
for sample in sorted(samples)])
normalized = vals / dir_sample_sums[direction][vid]
mean = np.mean(normalized)
errors = normalized - mean
squared_errors = errors**2
unbiased_variance = sum(squared_errors) / (len(vals)-1)
xs.append(loc)
ys.append(unbiased_variance)
return xs, ys
fig, axs = plt.subplots(nrows=3,
ncols=3,
#sharex=True,
sharey=True,
figsize=(12, 12),
constrained_layout=True)
plt.suptitle("variance in read starts by location")
fig.supxlabel("position along genome")
fig.supylabel("variance")
for row, vid in enumerate(all_vids):
slug = to_slug(vid)
ax = axs[row%3][row//3]
for direction in ['f', 'r']:
xs, ys = get_variance(vid, direction)
ax.set_yscale('log')
ax.scatter(xs, ys, label=direction, marker=',', s=0.5)
ax.legend()
ax.set_title("%s" % slug)
fig.savefig("read-start-variance.png", dpi=180)
plt.clf()