-
Notifications
You must be signed in to change notification settings - Fork 0
/
rsem_per_gene_counts.py
executable file
·80 lines (67 loc) · 2.65 KB
/
rsem_per_gene_counts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/env python
"""Simple summation over isoforms - testing the math used by RSEM """
__author__ = "Yury Barbitoff"
__copyright__ = "Copyright 2015, Yury Barbitoff"
__license__ = "CC"
__version__ = "1.0"
__email__ = "[email protected]"
__status__ = "Release"
import sys
import getopt
def optRecognize(argv):
"""Processing given command line options"""
try:
opts, args = getopt.getopt(argv, "ha:", ["help", "annot="])
except:
print('\nInvalid options. To see usage info, type kallisto_pgc -h\n')
sys.exit()
annotationfile, quantfiles = None, None
for opt, arg in opts:
if opt in ('-h', '--help'):
print('''\n\nTo turn kallisto output into per-gene counts:
kallisto_pgc -a <file> [arguments]
-a (--annot) <file> - GTF/GFF file containing annotation of the genome
[arguments] - (multiple input files supported) kallisto quantification output in TSV format
=================================
-h (--help) To show the help message again\n''')
sys.exit()
elif opt in ("-a", "--annot"):
annotationfile = arg
quantfiles = args
return annotationfile, quantfiles
def transcriptsToGenes(annotationfile):
"""Creating transcript-gene map"""
tgr = {}
with open(annotationfile, 'r') as annotation:
for line in annotation:
content = line.split()
if len(content) < 15:
continue
if content[2] == 'transcript':
transcript_id = content[11][1:len(content[11])-2]
gene_id = content[9][1:len(content[9])-2]
tgr[transcript_id] = gene_id
return tgr
def writeGeneCounts(quantfiles, tgr):
"""Turning transcript-based counts into gene-based counts"""
for quantfile in quantfiles:
outfile = quantfile.replace('.tsv', '_per_gene.tsv')
gene_counts = {}
with open(quantfile, 'r') as abundances:
for line in abundances:
if line.startswith('target'):
continue
content = line.split()
gene_id = tgr[content[0]]
gene_counts[gene_id] = gene_counts.get(gene_id, 0) + float(content[1])
with open(outfile, 'w') as out:
out.write('target_id\test_counts\ttpm\n')
for gene_id, count in gene_counts.items():
out.write(gene_id + '\t' + str(count) + '\n')
# Get input options
if __name__ == "__main__":
annotationfile, quantfiles = optRecognize(sys.argv[1:])
print '\n...patient should you be, young padawan...'
tgr = transcriptsToGenes(annotationfile)
writeGeneCounts(quantfiles, tgr)
print '...done\n'