-
Notifications
You must be signed in to change notification settings - Fork 0
/
2023-03-24--human-viruses.py
executable file
·152 lines (131 loc) · 5.05 KB
/
2023-03-24--human-viruses.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/usr/bin/env python3
import os
import glob
import json
from collections import Counter
from collections import defaultdict
"""
Generate data for interactive HTML for exploring the taxonomy of human viruses
identified in the samples.
Columns are bioprojects, rows are taxonomic nodes, cells are relative abundances.
Rows can be expanded or collapsed to show child nodes.
"""
MGS_PIPELINE_DIR="/Users/jeffkaufman/code/mgs-pipeline"
human_viruses = set() # {taxid}
with open("%s/human-viruses.tsv" % MGS_PIPELINE_DIR) as inf:
for line in inf:
taxid, name = line.strip().split("\t")
taxid = int(taxid)
human_viruses.add(taxid)
parents = {} # child_taxid -> parent_taxid
with open("nodes.dmp") as inf:
for line in inf:
child_taxid, parent_taxid, rank, *_ = \
line.replace("\t|\n", "").split("\t|\t")
child_taxid = int(child_taxid)
parent_taxid = int(parent_taxid)
parents[child_taxid] = parent_taxid
# taxid -> node
nodes = {}
mentioned_taxids = set()
for virus_taxid in human_viruses:
taxid = virus_taxid
mentioned_taxids.add(taxid)
while taxid != 1:
taxid = parents[taxid]
mentioned_taxids.add(taxid)
for taxid in mentioned_taxids:
nodes[taxid] = [taxid]
for virus_taxid in human_viruses:
taxid = virus_taxid
node = nodes[taxid]
while taxid != 1:
taxid = parents[taxid]
if node in nodes[taxid]:
break
nodes[taxid].append(node)
node = nodes[taxid]
# Format: [taxid, children]
# Ex: [1, [10239, [10472,
# [10473, [46014],
# [10474, [10475, ...],
# [693626, ...],
# [1299307, ...]]], ...], ...], ...]
tree = nodes[1]
# taxid -> [name]
# first name is scientific name
names = {}
with open("names.dmp") as inf:
for line in inf:
taxid, name, unique_name, name_class = line.replace(
"\t|\n", "").split("\t|\t")
taxid = int(taxid)
if taxid in mentioned_taxids:
if taxid not in names:
names[taxid] = []
if name_class == "scientific name":
names[taxid].insert(0, name)
else:
names[taxid].append(name)
# project -> accession -> n_reads
project_sample_reads = defaultdict(dict)
for n_reads_fname in glob.glob(
"%s/bioprojects/*/metadata/*.n_reads" % MGS_PIPELINE_DIR):
project = n_reads_fname.split("/")[-3]
accession = n_reads_fname.split("/")[-1].replace(".n_reads", "")
if os.path.exists("%s.humanviruses.tsv" % accession):
with open(n_reads_fname) as inf:
project_sample_reads[project][accession] = int(inf.read())
projects = list(sorted(project_sample_reads))
bioproject_names = {} # project -> bioproject name
for project in projects:
with open("%s/bioprojects/%s/metadata/name.txt" % (
MGS_PIPELINE_DIR, project)) as inf:
bioproject_names[project] = inf.read().strip()
bioproject_links = {} # project -> paper link
for project, name in bioproject_names.items():
paper_dir = os.path.join(MGS_PIPELINE_DIR, "papers", name.replace(" ", ""))
link_fname = os.path.join(paper_dir, "link.txt")
if os.path.exists(link_fname):
with open(link_fname) as inf:
bioproject_links[project] = inf.read().strip()
bioproject_totals = {}
# virus -> project -> [count, relab]
virus_project_counts = {}
for virus_taxid in human_viruses:
virus_project_counts[virus_taxid] = {}
for project in projects:
project_count = 0
project_total = 0
for accession in project_sample_reads[project]:
project_total += project_sample_reads[project][accession]
hv = "%s.humanviruses.tsv" % accession
# this is terrible: we should be opening each file once, not once
# per virus
with open(hv) as inf:
for line in inf:
taxid, count, name = line.strip().split("\t")
taxid = int(taxid)
count = int(count)
if taxid == virus_taxid:
project_count += count
if project_count > 0:
virus_project_counts[virus_taxid][project] = (
project_count, project_count / project_total)
bioproject_totals[project] = project_total
sorted_projects = [project for key, project in
sorted(((bioproject_names[project].split()[-1],
bioproject_names[project]),
project)
for project in projects)]
with open("data.js", "w") as outf:
for name, val in [
("virus_project_counts", virus_project_counts),
("projects", sorted_projects),
("names", names),
("bioproject_names", bioproject_names),
("bioproject_links", bioproject_links),
("bioproject_totals", bioproject_totals),
("tree", tree)]:
outf.write("%s=%s;\n" % (name, json.dumps(
val, sort_keys=True, indent=None if name == "tree" else 2)))