2023-07-14--check-targets-kraken-detectable.py

#!/usr/bin/env python3
# Run 2023-07-14--verify-all-targets-in-refseq.py first.
import os
import re
import gzip
import json
import subprocess
from collections import defaultdict
# output of p2ra/list_taxids.py
taxid_list="""
10298 hsv_1 HSV-1
10310 hsv_2 HSV-2
10359 cmv CMV
10376 ebv EBV
10407 hbv HBV
10566 hpv HPV
10632 jcv JCV
10804 aav2 AAV2
11103 hcv HCV
11320 influenza Influenza A
11520 influenza Influenza B
11676 hiv HIV
68558 aav6 AAV6
82300 aav5 AAV5
122928 norovirus Norovirus (GI)
122929 norovirus Norovirus (GII)
493803 mcv MCV
1891762 bkv BKV
2697049 sars_cov_2 SARS-COV-2
"""
taxid_to_name = {}
for line in taxid_list.split("\n"):
    line = line.strip()
    if not line: continue
    taxid, human_readable = re.match(
        "^([0-9]+) +[a-z0-9_]+ +(.*)$", line).groups()
    taxid_to_name[int(taxid)] = human_readable
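
# Inputs: metadata.txt (a table with taxid and local_filename columns for each
# downloaded genome) and our_taxid_to_all.json, presumably written by the
# verify script named at the top, mapping each target taxid to the taxids
# grouped under it.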
metadata_fname = "metadata.txt"
our_taxid_to_all_fname = "our_taxid_to_all.json"
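
# Map each taxid in metadata.txt to its local genome FASTA files.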
taxid_to_fastas = defaultdict(list)
with open(metadata_fname) as inf:
    cols = None
    for line in inf:
        rows = line.strip().split("\t")
        if not cols:
            cols = rows
            continue
        taxid_to_fastas[int(rows[cols.index("taxid")])].append(
            rows[cols.index("local_filename")])
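
# JSON object keys are strings, so target taxids are looked up with str() below.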
with open(our_taxid_to_all_fname) as inf:
    our_taxid_to_all = json.load(inf)

READ_LENGTH = 100
def to_reads(s):
    # Every READ_LENGTH-long window of s: exhaustive, error-free coverage.
    for i in range(len(s) - READ_LENGTH + 1):
        yield s[i:i+READ_LENGTH]
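
# Write a gzipped FASTQ of simulated reads covering every genome filed under
# our_taxid or any of its grouped taxids; quality scores are faked as all "F".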
def fake_reads(fake_reads_fname, our_taxid):
    with gzip.open(fake_reads_fname, "w") as outf:
        for any_taxid in our_taxid_to_all[str(our_taxid)]:
            if any_taxid in taxid_to_fastas:
                for fasta in sorted(taxid_to_fastas[any_taxid]):
                    with gzip.open(fasta) as inf:
                        raw_genome = []
                        for line in inf.read().decode("utf-8").split("\n"):
                            if line.startswith(">"):
                                continue
                            raw_genome.append(line.strip())
                        for n, simulated_read in enumerate(
                                to_reads("".join(raw_genome))):
                            outf.write(("@%s:%s\n%s\n+\n%s\n" % (
                                os.path.basename(
                                    fasta.replace("_genomic.fna.gz", "")),
                                n,
                                simulated_read,
                                "F"*(len(simulated_read)))
                            ).encode("utf-8"))
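
# Run Kraken2 on the simulated reads; the binary and database paths are
# specific to the machine this was run on.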
def classify_reads(fake_reads_fname, classified_fname):
    subprocess.check_call([
        "/home/ec2-user/kraken2/kraken2",
        "--db", "/run/kraken-db/",
        "--output", classified_fname,
        "--use-names",
        fake_reads_fname
    ])
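
# For each target: simulate reads, classify them, and report how Kraken2
# assigned them, counting any of the target's grouped taxids as a hit.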
high_level = []
for our_taxid in taxid_to_name:
    if our_taxid == 68558:
        continue # no genomes

    # Count how many genomes we have for this target across all its taxids.
    genomes = 0
    for any_taxid in our_taxid_to_all[str(our_taxid)]:
        if any_taxid in taxid_to_fastas:
            for fasta in sorted(taxid_to_fastas[any_taxid]):
                genomes += 1

    alias_taxids = our_taxid_to_all[str(our_taxid)]

    # Reuse simulated reads and Kraken2 output from earlier runs if present.
    fake_reads_fname = "%s.fake.reads.fastq.gz" % our_taxid
    if not os.path.exists(fake_reads_fname):
        fake_reads(fake_reads_fname, our_taxid)

    classified_fname = fake_reads_fname + ".classified"
    if not os.path.exists(classified_fname):
        classify_reads(fake_reads_fname, classified_fname)

    our_desc = "%s (taxid %s)" % (taxid_to_name[our_taxid], our_taxid)

    # Tally classifications; with --use-names the third column of Kraken2
    # output is the assigned taxon as "name (taxid N)".  Anything assigned to
    # one of the target's grouped taxids counts as the target itself.
    total = 0
    classification_counts = defaultdict(int)
    with open(classified_fname) as inf:
        for line in inf:
            total += 1
            desc = line.split("\t")[2]
            classified_taxid = int(desc.split("(taxid ")[-1].replace(")", ""))
            if classified_taxid in alias_taxids:
                desc = our_desc
            classification_counts[desc] += 1

    # Print a per-target breakdown, most common classification first.
    count_classification = [
        (count, classification)
        for (classification, count) in classification_counts.items()]
    count_classification.sort(reverse=True)

    print("%s (%s)" % (taxid_to_name[our_taxid], our_taxid))
    for count, classification in count_classification:
        print(" %s (%.1f%%)\t%s" % (
            count, 100*count/total, classification))

    high_level.append(
        "%s (%s)\t%.1f%%\t%s\t%s" %
        (taxid_to_name[our_taxid],
         our_taxid,
         100*classification_counts[our_desc]/total,
         total,
         genomes))
    print()
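
# Per-target summary: name (taxid), % of reads classified back to the target,
# total simulated reads, and number of genomes used.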
for line in high_level:
    print(line)