-
Notifications
You must be signed in to change notification settings - Fork 0
/
2023-05-16--process-settings-rothman.py
executable file
·118 lines (96 loc) · 3.21 KB
/
2023-05-16--process-settings-rothman.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/env python3
import glob
import os.path
import json
from collections import defaultdict
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
with open(os.path.expanduser(
"~/code/mgs-pipeline/dashboard/metadata_samples.json")) as inf:
sample_metadata = json.load(inf)
overall = {}
by_site = defaultdict(dict)
# site -> date -> discard_fraction
discard_fraction_by_date_and_site = defaultdict(dict)
for settings in glob.glob("*.settings"):
sample, _ = os.path.splitext(settings)
if sample_metadata[sample]["enrichment"] == "panel": continue
in_length_distribution = False
cols = None
data = []
with open(settings) as inf:
for line in inf:
if line.startswith("[Length distribution]"):
in_length_distribution = True
continue
if not in_length_distribution:
continue
row = line.strip().split("\t")
if not cols:
cols = row
continue
data.append([int(x) for x in row])
site = sample_metadata[sample]["fine_location"]
date = sample_metadata[sample]["date"]
fig, ax = plt.subplots(constrained_layout=True)
xs = [r[0] for r in data]
ys = [r[-4] for r in data]
ax.plot(xs, ys)
ax.set_title("Lengths: %s %s" % (site, date))
fig.savefig("%s.collapsed.lengths.png" % sample, dpi=180)
plt.clf()
plt.close()
fig, ax = plt.subplots(constrained_layout=True)
xs = [r[0] for r in data]
ys = [r[1] for r in data]
ax.plot(xs, ys)
ax.set_title("Lengths: %s %s" % (site, date))
fig.savefig("%s.mate1.lengths.png" % sample, dpi=180)
plt.clf()
plt.close()
total_all = 0
total_discarded = 0
for length, *rest in data:
if length not in overall:
overall[length] = rest
else:
for i in range(len(rest)):
overall[length][i] += rest[i]
if length not in by_site[site]:
by_site[site][length] = rest
else:
for i in range(len(rest)):
by_site[site][length][i] += rest[i]
total_all += rest[-1]
total_discarded += rest[-2]
discard_fraction_by_date_and_site[site][date] = total_discarded / total_all
fig, ax = plt.subplots(constrained_layout=True)
ax.set_title("Discard fraction by site")
ax.yaxis.set_major_formatter(mtick.PercentFormatter())
for site in sorted(discard_fraction_by_date_and_site):
xs = []
ys = []
for date in sorted(discard_fraction_by_date_and_site[site]):
xs.append(date)
ys.append(discard_fraction_by_date_and_site[site][date] * 100)
ax.scatter(xs, ys, label="%s" % (site))
ax.legend()
fig.savefig("discard-fraction-by-site-over-time.png", dpi=180)
plt.clf()
plt.close()
fig, ax = plt.subplots(constrained_layout=True)
ax.set_title("Lengths by Site")
ax.set_yscale("log")
for site in sorted(by_site):
xs = []
ys = []
for length, record in sorted(by_site[site].items()):
xs.append(length)
ys.append(record[-1])
total_y = sum(ys)
ys = [v/total_y for v in ys]
ax.plot(xs, ys, label="%s" % (site))
ax.legend()
fig.savefig("lengths-by-site.png", dpi=180)
plt.clf()
plt.close()