-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathClustersSeq_Corset.py
111 lines (91 loc) · 3.91 KB
/
ClustersSeq_Corset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#######################################################################################
#
# Get the nucleotide sequence of the largest transcript in each Corset cluster
#
# Usage: ClustersSeq_Corset.py -f <fasta> -l <input1> -c <input2> -o <output>
#
# Where:
#
# fasta = the fasta file containing all the transcripts used as reference for the realigment
# of the reads before running the Corset
# input1 = List of Cluster IDs to recover (one per line)
# input2 = Result table from Corset containing the clusters and transcripts IDs (***-clusters.txt)
# outputfile = File in fasta format where the output will be saved i.e. The largest transcript
# of each Cluster and its nucleotide sequence
#
#######################################################################################
#!/usr/bin/python
import sys, getopt
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
genes_size = dict()
genes_seq = dict()
cluster_dic = dict()
clusters = list()
Ids = list()
fasta = ""
input1 = ""
input2 = ""
outputfile = ""
# Check for the arguments and print useful help messages
try:
opts, args = getopt.getopt(sys.argv[1:],"hf:l:c:o:",["fasta=","ifile1=","ifile2=","output="])
except getopt.GetoptError:
print '\n', '#### Invalid use ####', '\n'
print 'Usage: ClustersSeq_Corset.py -f <fasta> -l <input1> -c <input2> -o <output>'
print 'For help use ClustersSeq_Corset.py -h'
sys.exit(99)
if len(args) != 1:
print '\n', '### Arguments are missing ###', '\n', '\n' 'Use -h option for help'
sys.exit(1)
for opt, arg in opts:
if opt == '-h':
print '\n', 'Get the nucleotide sequence of the largest transcript in each Corset cluster', '\n'
print 'Usage: ClustersSeq_Corset.py -f <fasta> -l <input1> -c <input2> -o <output>', '\n'
print 'Where: fasta = the fasta file containing all the transcripts used as reference for the realigment of the reads before running the Corset'
print 'input1 = List of Cluster IDs to recover (one per line)'
print 'input2 = Result table from Corset containing the clusters and transcripts IDs (***-clusters.txt)'
print 'outputfile = File in fasta format where the output will be saved i.e. The largest transcript of each Cluster and its nucleotide sequence'
sys.exit()
elif opt in ("-f", "--fasta"):
fasta = open(arg)
elif opt in ("-l", "--ifile1"):
input1 = open(arg)
elif opt in ("-c", "--ifile2"):
input2 = open(arg)
elif opt in ("-o", "--output"):
outname = arg + ".fasta"
outputfile = open(outname,"w")
else:
assert False, "unhandled option"
## Make a list of the cluster IDs
for id in input1:
clusters.append(id)
## Open the fasta file and save the information to be used later
for seq_record in SeqIO.parse(fasta, "fasta"):
gene_id = seq_record.id
seq_length = (len(seq_record))
bases = (seq_record.seq)
genes_size[gene_id]= seq_length
genes_seq[gene_id]= bases
## Find the transcript ID foreach clusterID in the list and check which transcripts are larger within the same cluster
for line in input2:
table_line = line.split("\t")
trans_ID = table_line[0]
cluster_ID = table_line[1]
if cluster_ID in clusters and cluster_ID not in cluster_dic:
cluster_dic[cluster_ID] = trans_ID
elif cluster_ID in cluster_dic:
old_trans = cluster_dic[cluster_ID]
new_trans = trans_ID
if genes_size[new_trans] > genes_size[old_trans]:
cluster_dic[cluster_ID] = trans_ID
## Save the Ids of the longest transcripts in a list
for k,v in cluster_dic.items():
Ids.append(v)
## Recover the nucleotide sequence of the longest transcripts and save it in the output fasta file
for k,v in genes_seq.items():
if k in Ids:
fasta_format_string = SeqRecord(v, id=k)
outputfile.write(fasta_format_string.format("fasta"))
outputfile.close()