-
Notifications
You must be signed in to change notification settings - Fork 0
/
extractTranscriptJunctions.py
122 lines (90 loc) · 4.14 KB
/
extractTranscriptJunctions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env python
# DESCRIPTION: This program uses the transcript FASTA file and extracts the junctions made by each pair of exons.
# The output is a bed file with all of these junctions.
#
# AUTHOR: Chelsea Tymms
# Built-in packages
import argparse
import os
# Add-on packages
import gffutils
import itertools
def getOptions():
""" Function to pull in arguments """
parser = argparse.ArgumentParser(description="Import database and give exported file name")
parser.add_argument("--gff", dest="gffInput", action='store', required=True, help="Input GFF or GTF file")
parser.add_argument("--output", dest="outCSV", action='store', required=True, help="Output CSV file name")
args = parser.parse_args()
return args
def start_pos(x):
""" Little function to let me sort gffutils objects by start position """
return x.start
def createExonArray(exonList,mergedExons):
""" Group overlapping exons together as an list within a list """
exonArray = []
for mergedExon in mergedExons:
exonSubset = []
for exon in exonList:
if exon.start >= mergedExon.start and exon.stop <= mergedExon.stop:
exonSubset.append(exon)
exonArray.append(exonSubset)
return exonArray
def createJunctionArray(exonArray):
""" Using itertools create all possible combination between groups of exons """
iterProduct = []
s=len(exonArray)
for i in range(0,s-1):
# for j in range (i+1,len(exonArray)):
j=i+1
for k in itertools.product(exonArray[i],exonArray[j]):
iterProduct.append(k)
return iterProduct
def main():
# Get the database
db_fn=args.gffInput + '.db'
db=gffutils.FeatureDB(db_fn)
rnas=db.features_of_type(['mRNA','ncRNA'])
csvOut = open(args.outCSV, 'w')
# Write column headers for TSV
csvHeader = ','.join(['chrom','start','stop','strand','exon_id','transcript_id','gene_id']) + "\n"
csvOut.write(csvHeader)
# Open the output BED file
with open('aceview_hg19_transcript_junctions.tsv','wb') as outputFile:
for rna in rnas:
# Get all exons
exonList = list(db.children(rna,featuretype='exon',order_by=('start','end')))
# Skip genes that only have 1 exon, or genes that don't have an exon e.g. miRNA
if len(exonList) == 0 or len(exonList) == 1:
continue
# Create list of overlapping exons
# Note: mod(mdg4) has exons on both strands, so ignore_strand needs
# to be set for it
#mergedExons = list(db.merge_features(exonList,ignore_strand=True))
mergedExons = list(db.merge(exonList,ignore_strand=True))
# Sort merged exons by start postiion
mergedExons.sort(key=start_pos)
# Using the merged exons to identify overlapping exons create an array where
# overlapping exons are grouped together
exonArray = createExonArray(exonList,mergedExons)
# Now create all possible junctions
junctionArray = createJunctionArray(exonArray)
# Create BED file
for i in range(0,len(junctionArray)):
# Construct various parts of the junction BED file
junctionID=''.join(junctionArray[i][0].attributes['Name']) + '|' + ''.join(junctionArray[i][1].attributes['Name'])
xscriptGenes=list(db.parents(rna, featuretype='gene'))
xscriptGene=[]
for gene in xscriptGenes:
xscriptGene.append(gene.id)
transcriptID=''.join(rna.id)
geneID=''.join(xscriptGene)
juncCoord=str(junctionArray[i][0].chrom) + ':' + str(junctionArray[i][0].stop) + ':' + str(junctionArray[i][1].start) + ':' + str(junctionArray[i][0].strand)
csvArray=[junctionID, juncCoord, transcriptID, geneID]
# Output to BED file
csvOut.write(",".join(str(i) for i in csvArray)+"\n")
csvOut.close()
if __name__ == '__main__':
# Parse command line arguments
global args
args = getOptions()
main()