-
Notifications
You must be signed in to change notification settings - Fork 0
/
checkSplice.py
213 lines (175 loc) · 7.62 KB
/
checkSplice.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
#!/usr/bin/env python3
"""
checkSplice.py
This script checks that the expected splice sites are present in V, J, and C
genes found by blast.
Created by Chaim A Schramm on 2019-08-06.
Switched from CDS to exon and pulled exon labels through by Simone Olubo 2024-04-15.
Some quick clean up by CA Schramm 2024-04-16.
Moved check for start codon to new script by CA Schramm 2024-04-16.
Removed lines 89-110 and pulled full gene through for the alignment by Simone Olubo & CA Schramm 2024-09-26
Added debugging for V-region by S Olubo & CA Schramm 2024-10-01
Fixed start position numbering for D/J genes and removed splice checking for
D genes by CA Schramm 2024-10-09.
Fixed exon type regex by CA Schramm 2024-10-09.
Copyright (c) 2019-2024 Vaccine Research Center, National Institutes of Health, USA.
All rights reserved.
"""
from pybedtools import BedTool
import warnings
from Bio import BiopythonWarning
from Bio import SeqIO
from Bio.Seq import Seq
import re, sys
#returns if there is a gap or not
def gapFinder( exons, seqFile ):
for e in exons:
s = BedTool.seq( (e.chrom,e.start,e.stop), seqFile ).upper()
#don't think it pays to worry about gaps of 1 or 2 bases, arbitrarily setting threshold at 10 bases
if "NNNNNNNNNN" in s:
return True
return False
def reindexExons(e, minpos, maxpos):
if e.strand == "+":
e.start = e.start - minpos
e.stop = e.stop - minpos
else:
temp = e.start
e.start = maxpos - e.stop
e.stop = maxpos - temp
e.strand = "+"
return e
def mapExons(e, posDict, source ):
exon_type = re.sub("-exon", "", e.name)
#convert exon coordinates to contig being annotated
e.chrom = source.chrom
e.name = exon_type
e.strand = source.strand
if source.strand == "+":
e.start = posDict[ e.start ]['test'] + source.start
e.stop = posDict[ e.stop ]['test'] + source.start
else:
temp = e.start
e.start = source.stop - posDict[ e.stop ]['test']
e.stop = source.stop - posDict[ temp ]['test']
return e
def checkSplice( hits, bedfile, targetSeq, contigs, gene, blast_exec, codingSeq ):
from aligator import blast2bed, quickAlign, GAPPED_CODON_TABLE
results = dict()
reasons = dict()
notes = dict()
targetBed = BedTool(bedfile)
#iterate through hits
for h in hits:
stringhit = "\t".join(h[0:6])
pseudo = False
#check if at least one of the hits has annotated exons
names = re.sub(" .*$","",h.name).split(",")
exons = targetBed.filter(lambda x: names[0] in x.name and ("exon" in x.name or "V-Region" in x.name)).saveas()
fullGene = targetBed.filter(lambda x: names[0] in x.name and "gene" in x.name).saveas()
#now get sequences and align them
s = BedTool([h]).sequence(fi=contigs,fo="annoTemp/temp.fa",s=True)
with open("annoTemp/temp.fa",'r') as handle:
testSeq = SeqIO.read(handle, "fasta")
r = fullGene.sequence(fi=targetSeq,fo="annoTemp/ref.fa",s=True)
with open("annoTemp/ref.fa",'r') as handle:
refSeq = SeqIO.read(handle, "fasta")
align = quickAlign(refSeq, testSeq)
#make sure the feature numbering corresponds to the extracted sequence
# by rev-comp-ing if necessary and subtracting off the start position
# of the first exon. Makes sure we know which is the splice donor and
# which is the splice acceptor.
finalExons = exons.each( reindexExons, fullGene[0].start, fullGene[0].stop ).saveas()
finalExons = finalExons.sort() #if they were negative strand, this puts the flipped exons in order
#extract and copy exon/domain descriptions
#kludgy, but start by making a dictionary of corresponding positions
#can use this to try and extend incomplete hits, if need be
posDict = dict()
refPos = -1
testPos = -1
for p in range(len(align['ref'])):
if not align['test'][p] == "-": testPos += 1
if not align['ref'][p] == "-":
refPos += 1
posDict[ refPos ] = { 'align':p, 'test':testPos, 'gap':False }
if testPos == -1:
posDict[ refPos ]['test'] = 0 #special case where blast hit is missing 5' end; don't try to claw it back
if align['test'][p] == "-": posDict[ refPos ][ 'gap' ] = True
#add closed boundaries:
if align['ref'][-1] == "-":
#hit went past last exon, set boundary at testPos of last non gap
posDict[ refPos+1 ] = { 'test':posDict[refPos]['test']+1, 'gap':False }
else:
posDict[ refPos+1 ] = { 'test':testPos+1, 'gap':False }
if align['test'][-1] == "-": posDict[ refPos+1 ][ 'gap' ] = True
#check for splice sites for gaps and proper motifs
# also check if the ends of the alignment are missing
incomplete5 = False
incomplete3 = False
if gene != "D":
missingExons = []
for i in range(len(finalExons)):
if "V-Region" in finalExons[i].name:
continue
if i > 0: #C acceptor handled below
acceptor = re.sub("-","",align['test'][ posDict[finalExons[i].start]['align']-10 : posDict[finalExons[i].start]['align'] ]) #if there are gaps here, it's probably bad anyway, but trying for a safety margin
if not acceptor.endswith("AG"):# or acceptor.endswith("AC"):
pseudo = True
reasons[ stringhit ] = f"noncanonical splice acceptor {acceptor[-2:]}" #"a bad splice acceptor"
#break
if posDict[ finalExons[i].start ]['gap']:
if i==0:
notes[ stringhit ] = f"incomplete 5' end"
incomplete5 = True
else:
notes[ stringhit ] = f"splice acceptor in alignment gap in exon {i+1}"
if i < len(finalExons)-1: #J donor handled below
donor = re.sub("-","",align['test'][ posDict[finalExons[i].stop]['align'] : posDict[finalExons[i].stop]['align']+10 ]) #if there are gaps here, it's probably bad anyway, but trying for a safety margin
if not donor.startswith("GT") and finalExons[i].name == "V-exon":# or donor.startswith("GC"):
with warnings.catch_warnings():
warnings.simplefilter('ignore', BiopythonWarning)
if not (gene=="C" and Seq(donor).translate().startswith("*")):
#assume a stop codon at an exon boundary is always CHS...
pseudo = True
reasons[ stringhit ] = f"noncanonical splice donor {donor[0:2]}" #"a bad splice donor"
#break
if posDict[ finalExons[i].stop ]['gap']:
if i==len(finalExons)-1:
notes[ stringhit ] = f"incomplete 3' end"
incomplete3 = True
else:
notes[ stringhit ] = f"splice donor in alignment gap in exon {i+1}"
# update coordinates/names to contig being annotated
mapped = finalExons.each( mapExons, posDict, h ).saveas()
mapped = mapped.sort() #in case h is negative strand
#remove exons that are empty
mapped = [ mapped[i] for i in range(len(mapped)) if mapped[i].stop - mapped[i].start > 1 ]
if len(mapped)==0:
notes[ stringhit ] = "seems to be a spurious blast hit"
continue
#J and C: extract post/pre nt to verify splicing
# but don't bother if we've already marked the ends as missing
if gene == "J" and not incomplete3:
if h.strand == "+":
s = BedTool.seq((mapped[0].chrom,mapped[0].stop,mapped[0].stop+2), contigs )
if not s == "GT":
pseudo = True
reasons[ stringhit ] = "a bad splice donor"
else:
s = BedTool.seq((mapped[0].chrom,mapped[0].start-2,mapped[0].start), contigs )
if not s == "AC":
pseudo = True
reasons[ stringhit ] = "a bad splice donor"
elif gene == "C" and not incomplete5:
if h.strand == "+":
s = BedTool.seq((mapped[0].chrom,mapped[0].start-2,mapped[0].start), contigs )
if not s == "AG":
pseudo = True
reasons[ stringhit ] = "a bad splice acceptor"
else:
s = BedTool.seq((mapped[len(mapped)-1].chrom,mapped[len(mapped)-1].stop,mapped[len(mapped)-1].stop+2), contigs )
if not s == "CT":
pseudo = True
reasons[ stringhit ] = "a bad splice acceptor"
results[ stringhit ] = mapped
return results, reasons, notes