-
Notifications
You must be signed in to change notification settings - Fork 0
/
2023-01-19--seq-plotting.py
executable file
·182 lines (140 loc) · 5.38 KB
/
2023-01-19--seq-plotting.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
#!/usr/bin/env python3
# First:
# scp 'REMOTE:kmer-egd/SEQUENCE.*' .
import sys
from collections import defaultdict, Counter
# ex: tomato.brown.rugose.nt.seq longest-timeseries.tsv
sequence, metadata = sys.argv[1:]
K=40
def rc(s):
return "".join({'T':'A', 'G':'C', 'A':'T', 'C':'G', 'N':'N'}[x]
for x in reversed(s))
metadatas = defaultdict(list)
with open(metadata) as inf:
for line in inf:
accession, date, wtp = line.strip().split("\t")
metadatas[wtp].append(date)
wtps = list(sorted(metadatas))
# Locations are relative to the forward sequence, and are counted from the end
# of the kmer closest to the start of the sequence.
loc = {} # kmer -> loc
with open(sequence) as inf:
line, = inf
seq = line.strip()
n_kmers = len(seq) - K + 1
for i in range(n_kmers):
kmer_in = seq[i:i+K]
loc[kmer_in] = i
loc[rc(kmer_in)] = i
# loc -> wtp -> [count1, count2, ...]
loc_vals = defaultdict(lambda: defaultdict(Counter))
for wtp in wtps:
for prefix in "ACGT":
with open("%s.%s.%s" % (sequence, wtp, prefix)) as inf:
for line in inf:
kmer, *counts = line.strip().split("\t")
record = loc_vals[loc[kmer]][wtp]
for i, count in enumerate(counts):
record[i] += int(count)
cols = {}
wtp_cols = {}
for wtp, dates in sorted(metadatas.items()):
for date_index, date in enumerate(dates):
wtp_cols[wtp, date] = [
loc_vals[i][wtp][date_index]
for i in range(max(loc_vals))]
if wtp not in cols:
cols[wtp] = [0]*max(loc_vals)
for i in range(max(loc_vals)):
cols[wtp][i] += loc_vals[i][wtp][date_index]
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.ticker as mtick
colors = {'HTP': 'b', 'JWPCP': 'g', 'OC': 'r', 'SJ': 'c'}
style = {
'HTP': 'solid',
'JWPCP': 'dashed',
'OC': 'dashdot',
'SJ': 'dotted',
}
fig, ax = plt.subplots()
global_mean_col = [0]*n_kmers
for wtp, col in sorted(cols.items()):
loc_percentages = [x / (n_kmers-1) * 100 for x in range(len(col))]
adj_col = [x/sum(col)*len(col) for x in col]
for i, x in enumerate(adj_col):
global_mean_col[i] += x
ax.plot(loc_percentages, adj_col,
label=wtp,
linewidth=0.75,
color=mcolors.BASE_COLORS[colors[wtp]],
linestyle=style[wtp])
global_mean_col = [x/len(wtps) for x in global_mean_col]
ax.set_ylabel("tbrv kmer relative abundance, average of each site's samples")
ax.set_xlabel("position along genome")
ax.set_title("per-wtp tbrv relative abundance by position")
ax.xaxis.set_major_formatter(mtick.PercentFormatter())
ax.legend()
fig.savefig("tbrv-wtp-averages.png", dpi=180)
plt.clf()
fig, ax = plt.subplots()
wtp_mean_cols = {}
for wtp, col in sorted(cols.items()):
loc_percentages = [x / (n_kmers-1) * 100 for x in range(len(col))]
adj_col = [x/sum(col)*len(col) for x in col]
wtp_mean_cols[wtp] = adj_col
adj_col = [x / global_mean_col[i] for i, x in enumerate(adj_col)]
ax.plot(loc_percentages, adj_col,
label=wtp,
linewidth=0.75,
color=mcolors.BASE_COLORS[colors[wtp]],
linestyle=style[wtp])
ax.set_ylabel("tbrv kmer normalized abundance, site mean vs global mean")
ax.set_xlabel("position along genome")
ax.set_title("per-wtp tbrv normalized by position")
ax.xaxis.set_major_formatter(mtick.PercentFormatter())
ax.legend()
fig.savefig("tbrv-wtp-normalized.png", dpi=180)
for wtp in wtps:
plt.clf()
fig, ax = plt.subplots()
ax.set_title("%s tbrv relative abundance by position" % wtp)
for (wtp_2, date), col in sorted(wtp_cols.items()):
if wtp_2 != wtp: continue
loc_percentages = [x / (n_kmers-1) * 100 for x in range(len(col))]
adj_col = [x/sum(col)*len(col) for x in col]
ax.scatter(loc_percentages,
adj_col,
label=date,
marker=',',
#color=mcolors.BASE_COLORS[colors[wtp]],
s=[0.01]*len(adj_col),
)
ax.set_xlabel("position along genome")
ax.set_ylabel("tbrv kmer relative abundance")
ax.xaxis.set_major_formatter(mtick.PercentFormatter())
ax.legend(markerscale=25)
fig.savefig("tbrv-%s-abundance.png" % wtp, dpi=180)
for wtp in wtps:
plt.clf()
fig, ax = plt.subplots()
ax.set_title("%s tbrv normalized abundance by position" % wtp)
for (wtp_2, date), col in sorted(wtp_cols.items()):
if wtp_2 != wtp: continue
loc_percentages = [x / (n_kmers-1) * 100 for x in range(len(col))]
adj_col = [x/sum(col)*len(col) for x in col]
adj_col = [x / wtp_mean_cols[wtp][i]
if wtp_mean_cols[wtp][i] > 0 else 0
for i, x in enumerate(adj_col)]
ax.scatter(loc_percentages,
adj_col,
label=date,
marker=',',
s=[0.01]*len(adj_col),
)
ax.set_xlabel("position along genome")
ax.set_ylabel("tbrv kmer normalized (delta squared), sample vs site mean")
ax.xaxis.set_major_formatter(mtick.PercentFormatter())
ax.legend(markerscale=25, ncol=3, loc='lower center')
ax.set_ylim(ymax=3, ymin=-1)
fig.savefig("tbrv-%s-normalized.png" % wtp, dpi=180)