-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path1_get_sequence.py
151 lines (131 loc) · 4.95 KB
/
1_get_sequence.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
"""
Collecting sequence data and clustering from Smith et al. 2004 paper.
URL: https://science.sciencemag.org/content/305/5682/371.
"""
import os
import re
import json
from collections import defaultdict
from Bio import Entrez, SeqIO, AlignIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from pyutils import logging, DATA_DIR, HA1_ALIGNED_FILE
from pyutils.entrez_data import PMID, ADDITIONAL, REGION_NAMES
from pyutils.miscellaneous import run_cmd
HA1_REFERENCE_FILE = os.path.join(DATA_DIR, "HA1_reference.fasta")
# Internet access to NCBI entrez service is required
Entrez.email = "[email protected]"
logging.info("Use PMID to retrieve the ids of associated sequences")
# elink for a single PMID yields exactly one result set ...
(elink_result,) = Entrez.read(
    Entrez.elink(linkname="pubmed_nuccore", id=PMID, idtype="acc")
)
# ... which in turn holds exactly one LinkSetDb entry
(link_set,) = elink_result["LinkSetDb"]
# Accessions linked to the paper, plus hand-curated ones elink misses
IdList = [link["Id"] for link in link_set["Link"]] + [*ADDITIONAL]
logging.info("Download the sequences from Smith's paper")
# Fetch every record as GenBank flat files and parse them all at once
handle = Entrez.efetch(db="nuccore", id=IdList, rettype="gb", retmode="text")
records = list(SeqIO.parse(handle, "gb"))
# The HA1 reference sequence leads the output so it is carried through alignment
refSeq = SeqIO.read(HA1_REFERENCE_FILE, "fasta")
outSeqs = [refSeq]
logging.info("Map strain names and names in Genbank record")
# Maps a (region-code, numeric-name, 2-digit-year) triple to the set of
# GenBank accession ids that resolve to it; consumed by the cluster parser.
seqname2ac = defaultdict(set)
for record in records:
    organism = None
    organism2 = None
    seq = None
    for feature in record.features:
        if feature.type == "source":
            # Get the organism name
            (organism,) = feature.qualifiers["organism"]
            # Extract the "/Region/Name/Year" part of an influenza strain
            # designation (e.g. "A/Hong Kong/1/68").
            # NOTE(review): m is used unchecked — a non-conforming organism
            # string would raise AttributeError; presumably all records match.
            m = re.search(r"/[A-Za-z ]+/[A-Za-z0-9]+/[0-9]+", organism)
            organism = m.group(0)
            _, region, name, year = organism.split("/")
            # Keep only the numeric portion of the strain name
            m = re.search(r"[0-9]+", name)
            name = m.group(0)
            # The renamed organism for mapping cluster name
            organism = (REGION_NAMES[region], name, year[-2:])
            organism2 = organism
            # Get the isolate/strain name of the virus
            if "strain" in feature.qualifiers:
                (strain,) = feature.qualifiers["strain"]
            elif "isolate" in feature.qualifiers:
                (strain,) = feature.qualifiers["isolate"]
            # NOTE(review): if a record has neither "strain" nor "isolate",
            # `strain` below is stale from the previous record (or undefined
            # on the first one) — confirm all records carry one of the two.
            # Correct the year if possible
            year2 = strain.split("/")[-1]
            if year2:
                # Drop a trailing parenthesised suffix, e.g. "68(H3N2)" -> "68"
                year2 = year2.split('(')[0].strip()
                organism2 = (REGION_NAMES[region], name, year2[-2:])
        if feature.type == "CDS":
            # Translated protein sequence of the coding region
            (seq,) = feature.qualifiers["translation"]
    # Map the renamed organism with accession id
    seqname2ac[organism].add(record.id)
    seqname2ac[organism2].add(record.id)
    # NOTE(review): `seq` stays None for records without a CDS feature, which
    # would make Seq(seq) fail — presumably every record has one CDS.
    outSeqs.append(SeqRecord(seq=Seq(seq), id=record.id, description=""))
sequences_filename = "sequences.fasta"
sequences_file = os.path.join(DATA_DIR, sequences_filename)
SeqIO.write(outSeqs, sequences_file, "fasta")
logging.info("Parse the copied cluster info")
# The file was copied and pasted
clusters = {}
with open(os.path.join(DATA_DIR, "metadata_copied.txt")) as fh:
    for line in fh:
        fields = line.split(" ")
        # Column 0 is the antigenic-cluster label, column 1 the strain name
        cluster_label = fields[0]
        parts = fields[1].split("/")
        # Keep only the numeric portion of the strain's name component
        parts[1] = re.search(r"[0-9]+", parts[1]).group(0)
        # The copied table uses "OV" as an alias for "MA"
        if parts[0] == "OV":
            parts[0] = "MA"
        # Record the cluster for every accession mapped to this strain triple
        for accession in seqname2ac[tuple(parts)]:
            clusters[accession] = cluster_label
with open(os.path.join(DATA_DIR, "metadata.json"), "w") as fh:
    json.dump(clusters, fh, indent=4)
# Every downloaded record must have received a cluster assignment
assert all(record.id in clusters for record in records)
# Align sequences with muscle
aligned_filename = "aligned.fasta"
# NOTE(review): muscle is invoked with bare filenames but the result is read
# from DATA_DIR — this only works if run_cmd executes inside DATA_DIR (or the
# script's cwd is DATA_DIR); confirm against run_cmd's implementation.
run_cmd("muscle", "-align", sequences_filename, "-output", aligned_filename)
# Reuse aligned_filename rather than repeating the literal (keeps the two in sync)
aligned_file = os.path.join(DATA_DIR, aligned_filename)
logging.info("Remove the gaps from reference sequence for the site numbering")
# Read in the output MSA
seqs = AlignIO.read(aligned_file, "fasta")
# Locate the reference row within the alignment; it must be present
refSeq_index = None
for n, record in enumerate(seqs):
    if record.id == refSeq.id:
        refSeq_index = n
        break
assert refSeq_index is not None
# Half-open [start, end) column ranges where the reference row has a residue;
# columns where the reference is gapped are dropped so alignment positions map
# one-to-one onto the ungapped reference for site numbering.
reference_indexes = []
start_index = None
ref_row = seqs[refSeq_index]
for index, aa in enumerate(ref_row):
    if aa == "-":
        if start_index is not None:
            # A residue run just ended at this gap column
            reference_indexes.append((start_index, index))
            start_index = None
    elif start_index is None:
        # A new residue run begins here
        start_index = index
# Bug fix: the original only appended a run upon hitting a gap, so a trailing
# residue run (reference row not ending in "-") was silently dropped; flush it.
if start_index is not None:
    reference_indexes.append((start_index, len(ref_row)))
# Concatenate the kept column slices into one gap-stripped alignment
non_gap_seqs = None
for start_index, end_index in reference_indexes:
    segment = seqs[:, start_index:end_index]
    non_gap_seqs = segment if non_gap_seqs is None else non_gap_seqs + segment
# Drop the reference row itself; only the study sequences are written out
# (comprehension replaces the manual append loop).
out_aligned_seqs = [
    record for n, record in enumerate(non_gap_seqs) if n != refSeq_index
]
AlignIO.write(
    MultipleSeqAlignment(out_aligned_seqs),
    HA1_ALIGNED_FILE,
    "fasta"
)