-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbuildRetainedIntrons.py
228 lines (203 loc) · 11.1 KB
/
buildRetainedIntrons.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
#!/usr/bin/env python
## DESCRIPTION: This program takes a GTF or GFF file and creates putative intron retention events
## from exons. It outputs two files: a 12-column BED file for intron retention events, and a TSV
## describing exon(s) associated with an event. Intron retention events are only made for introns
## that are unambiguously intronic sequence. Events are not made for intron sequences that overlap
## with exonic regions, For plus-strand genes, retained introns are made from the end of exons. For
## minus-strand genes, retained introns are made from the start of exons. Events are not made for the
## last exons for a gene (genomic 3' last for plus-strand genes, genomic 5' first for minus-strand genes).
## Information for last exons are output to only the TSV file with the event ID exonID|no_event" for
## later reference if necessary. The intron_position column in the TSV is the start positon (plus strand)
## or stop position (minus strand) of the intron used in making the intron retention event.
##
## Author: Jeremy R. B. Newman
# Built-in packages
import argparse
# Add-on packages
import gffutils
def getOptions():
""" Function to pull in arguments """
parser = argparse.ArgumentParser(description="Import database and give exported file name")
parser.add_argument("--gff", dest="gffInput", action='store', required=True, help="Input GFF or GTF file")
parser.add_argument("--obed", dest="outBED", action='store', required=True, help="Output BED file name")
parser.add_argument("--otable", dest="outCSV", action='store', required=True, help="Output CSV file name")
parser.add_argument("--size", dest="eventSize", action='store', type=int, required=True, help="Size of intron retention event parts (event size will be twice this)")
args = parser.parse_args()
return args
def main():
# Get the database
db_fn=args.gffInput + '.db'
db=gffutils.FeatureDB(db_fn)
# Open output files for writing
bedOut = open(args.outBED, 'w')
csvOut = open(args.outCSV, 'w')
# Write column headers for TSV
csvHeader = ','.join(['gene_id','event_id','chr','strand','intron_position','exon_id','exon_cat','flag_lastexon']) + "\n"
csvOut.write(csvHeader)
# Make events
genes=db.features_of_type('gene')
for gene in genes:
if gene.strand == '+':
# If plus strand, order exons by ascending start position
exonList = list(db.children(gene, featuretype='exon', order_by='start'))
elif gene.strand == '-':
# If plus strand, order exons by descending stop position
exonList = list(db.children(gene, featuretype='exon', order_by='end', reverse=True))
exonList.append(exonList[len(exonList)-1])
index=1
exonIndex=0
lastExon=len(exonList)-1 # Check for last exon
exonNameList=[]
for exon in exonList:
if index == 1:
#Get ID start stop parents, etc.
exonStart=exon.start
exonStop=exon.end
exonID=''.join(exon.attributes['Name'])
currExonStart=exon.start
currExonStop=exon.end
currExonID=''.join(exon.attributes['Name'])
exonNameList.append(exonID)
index=index+1
exonIndex=exonIndex+1
else:
#Check with previous
exonStart=exon.start
exonStop=exon.end
exonID=''.join(exon.attributes['Name'])
# Overlapping exons check
if gene.strand == '+':
if (exonStart <= currExonStop): # If exon overlaps with current exon
# if exon overlaps check to see what has the bigger coordinates
if (exonStop >= currExonStop): # If exon ends later than current exon
currExonStart=exonStart # Make exon current exon
currExonStop=exonStop
currExonID=exonID
exonNameList.append(exonID) # Append exonID to list of exonIDs
exonIndex=exonIndex+1
else: # Otherwise if new exon write outputs for current exon
exonLength=currExonStop-currExonStart+1
if exonLength > args.eventSize: # Need to make this number customizable
donorStart=currExonStop - args.eventSize
else:
donorStart=currExonStop - exonLength
donorStop=currExonStop
acceptorStart=currExonStop+1
acceptorStop=currExonStop+args.eventSize
intronPos=acceptorStart
# Write output for BED file
eventChrom=gene.chrom
totalStart=donorStart
totalStop=acceptorStop
eventName=currExonID + "|intron"
eventScore="."
eventStrand=gene.strand
color="255,0,0"
blocks="1"
blockLength=str(totalStop-totalStart)
blockStarts="0"
exonListOut='|'.join(exonNameList)
exonEntry='\t'.join([str(eventChrom), str(totalStart), str(totalStop), eventName, eventScore, eventStrand, color, blocks, blockLength, blockStarts]) + "\n"
bedOut.write(exonEntry)
# Write output for TSV file
eventChrom=gene.chrom
eventGene=gene.id
eventName=currExonID + "|intron"
isLastExon='0'
eventStrand=gene.strand
exonListOut='|'.join(exonNameList)
csvEntry=','.join([eventGene, eventName, str(eventChrom), eventStrand, str(intronPos), currExonID, exonListOut, isLastExon]) + "\n"
csvOut.write(csvEntry)
# Get pieces for next exon
exonNameList=[]
exonID=''.join(exon.attributes['Name'])
currExonStart=exon.start
currExonStop=exon.end
currExonID=''.join(exon.attributes['Name'])
exonNameList.append(exonID)
index=index+1
exonIndex=exonIndex+1
if gene.strand == '-':
if (exonStop >= currExonStart): # If exon overlaps with current exon
# if exon overlaps check to see what has the bigger coordinates
if (exonStart <= currExonStart): # If exon start earlier than current exon
currExonStart=exonStart # Make exon current exon
currExonStop=exonStop
currExonID=exonID
exonNameList.append(exonID)
exonIndex=exonIndex+1
else: # Otherwise if new exon write outputs for current exon
exonLength=currExonStop-currExonStart+1
if exonLength > args.eventSize: # Need to make this number customizable
acceptorStop=currExonStart + args.eventSize
else:
acceptorStop=currExonStart + exonLength
acceptorStart=currExonStart
donorStart=currExonStart-args.eventSize
donorStop=currExonStart-1
intronPos=donorStop
# Write output for BED file
eventChrom=gene.chrom
totalStart=donorStart
totalStop=acceptorStop
eventName=currExonID + "|intron"
eventScore="."
eventStrand=gene.strand
color="255,0,0"
blocks="1"
blockLength=str(totalStop-totalStart)
blockStarts="0"
exonListOut='|'.join(exonNameList)
exonEntry='\t'.join([str(eventChrom), str(totalStart), str(totalStop), eventName, eventScore, eventStrand, color, blocks, blockLength, blockStarts]) + "\n"
bedOut.write(exonEntry)
# Write output for TSV file
eventChrom=gene.chrom
eventGene=gene.id
eventName=currExonID + "|intron"
isLastExon='0'
eventStrand=gene.strand
exonListOut='|'.join(exonNameList)
csvEntry=','.join([eventGene, eventName, str(eventChrom), eventStrand, str(intronPos), currExonID, exonListOut, isLastExon]) + "\n"
csvOut.write(csvEntry)
# Get pieces for next exon
exonNameList=[]
exonID=''.join(exon.attributes['Name'])
currExonStart=exon.start
currExonStop=exon.end
currExonID=''.join(exon.attributes['Name'])
exonNameList.append(exonID)
index=index+1
exonIndex=exonIndex+1
if exonIndex == lastExon:
#Get ID start stop parents, etc.
exonStart=exon.start
exonStop=exon.end
eventName=currExonID + "|no_event"
if gene.strand == '+':
intronPos=exonStop+1
elif gene.strand == '-':
intronPos=exonStart-1
exonID=''.join(exon.attributes['Name'])
currExonStart=exon.start
currExonStop=exon.end
currExonID=''.join(exon.attributes['Name'])
#exonNameList.append(exonID)
intronPos=-9999999
# Write output for TSV file
eventChrom=gene.chrom
eventGene=gene.id
eventName=currExonID + "|no_event"
isLastExon='1'
eventStrand=gene.strand
exonListOut='|'.join(exonNameList)
csvEntry=','.join([eventGene, eventName, str(eventChrom), eventStrand, str(intronPos), currExonID, exonListOut, isLastExon]) + "\n"
csvOut.write(csvEntry)
index=index+1
exonIndex=exonIndex+1
bedOut.close()
csvOut.close()
if __name__ == '__main__':
# Parse command line arguments
global args
args = getOptions()
main()