-
Notifications
You must be signed in to change notification settings - Fork 42
/
Copy pathSTAR_geneCnt_integrate.py
executable file
·147 lines (132 loc) · 5.01 KB
/
STAR_geneCnt_integrate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from __future__ import division, with_statement
'''
Copyright 2015, 陈同 ([email protected]).
===========================================================
'''
__author__ = 'chentong & ct586[9]'
__author_email__ = '[email protected]'
#=========================================================
desc = '''
Program description:
This is designed to summarize reads count output by `STAR`.
N_unmapped 397592 397592 397592
N_multimapping 2281519 2281519 2281519
N_noFeature 3391357 13297995 13358706
N_ambiguous 1200892 251064 223762
ENSG00000223972.5 0 0 0
ENSG00000227232.5 0 0 0
ENSG00000278267.1 0 0 0
ENSG00000243485.3 0 0 0
'''
import sys
import os
from json import dumps as json_dumps
from time import localtime, strftime
timeformat = "%Y-%m-%d %H:%M:%S"
from optparse import OptionParser as OP
import re
from tools import *
#from multiprocessing.dummy import Pool as ThreadPool
import pandas as pd
#from bs4 import BeautifulSoup
# Python 2 only: site.py deletes sys.setdefaultencoding at start-up, so the
# module has to be reloaded to get it back before switching the implicit
# str <-> unicode codec to UTF-8.  Python 3 has no `reload` builtin (calling it
# raises NameError) and its str type is already Unicode, so the hack is
# skipped there.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding('utf8')
# Module-wide debug flag; main() overwrites it from the -D/--debug option.
debug = 0
def fprint(content):
    """Pretty-print ``content`` to standard output as JSON.

    Args:
        content: Any object accepted by ``json.dumps`` (dict, list, str,
            number, ...).

    Returns:
        None. The JSON text, indented by one space per nesting level, is
        written to stdout followed by a newline.

    Raises:
        TypeError: propagated from ``json.dumps`` when ``content`` is not
            JSON-serializable.
    """
    # The parenthesized single-argument form is a valid print statement on
    # Python 2 and a valid print() call on Python 3.
    print(json_dumps(content, indent=1))
def cmdparameter(argv):
    """Parse the command line of the STAR gene-count integration script.

    Args:
        argv (list): Full argument vector with the program name first
            (normally ``sys.argv``).

    Returns:
        tuple: ``(options, args)`` exactly as returned by
        ``optparse.OptionParser.parse_args``.

    Raises:
        SystemExit: with status 1 when no option is given (the module-level
            description ``desc`` is written to stderr and the option help
            to stdout first), or with status 2 through ``parser.error``
            when ``-f`` is missing.
    """
    usages = "%prog -f file"
    # Pin %prog to argv[0] so the help text does not depend on how the
    # function was reached.
    prog = os.path.basename(argv[0]) if argv else None
    parser = OP(usage=usages, prog=prog)
    # The input is the per-gene count table shown in `desc`
    # (STAR --quantMode GeneCounts), not the mapping log.
    parser.add_option("-f", "--files", dest="filein",
        metavar="FILEIN", help="`,` or ` ` separated a list of files. *.ReadsPerGene.out.tab generated by `STAR` during mapping")
    parser.add_option("-l", "--labels", dest="label",
        metavar="LABEL", help="`,` or ` ` separated a list of labels to label each file. It must have same order as files.")
    parser.add_option("-o", "--output-file", dest="out_file",
        help="The name of output file.")
    parser.add_option("-H", "--header", dest="header",
        default=0, type='int', help="Given <1> here to indicate the first line as header line/ Default <0> meaning no header line.")
    parser.add_option("-s", "--skip-lines", dest="skip_line",
        default=4, type='int',
        help="Given a number to skip the first n lines. Default n=4.")
    parser.add_option("-c", "--extract-cols", dest="col_number",
        default=2, type='int',
        help="Given a number to specify which column to extract. The "
             "first column will be used as index. Default <2> meaning the 2nd column.")
    parser.add_option("-v", "--verbose", dest="verbose",
        action="store_true", help="Show process information")
    parser.add_option("-D", "--debug", dest="debug",
        default=False, action="store_true", help="Debug the program")

    if len(argv) == 1:
        # Print the help in-process instead of re-running the script through
        # a shell: that broke on paths containing spaces and required a
        # `python` executable on PATH.
        sys.stderr.write(desc + "\n")
        parser.print_help()
        sys.exit(1)

    (options, args) = parser.parse_args(argv[1:])
    # An explicit check survives `python -O`, unlike `assert`.
    if options.filein is None:
        parser.error("A filename needed for -f")
    return (options, args)
#--------------------------------------------------------------------
def readAndMergeMatrix(fileL, labelL, header, skip_line, col_number):
    """Read one column from each table and merge the columns side by side.

    Args:
        fileL (list): Paths of the tab-separated tables to read.
        labelL (list): Column label for each file, in the same order as
            ``fileL``.
        header: Truthy when the first line left after skipping is a header
            row; falsy when the tables have no header.
        skip_line (int): Number of leading lines to skip in every file.
        col_number (int): Zero-based position of the column to extract.
            Column 0 is always read as well and becomes the row index.

    Returns:
        pandas.DataFrame: One column per input file, named by its label,
        with the index named ``"ENSG"``. ``pd.concat`` with ``axis=1``
        aligns rows on the index, so a row absent from a file is NaN in that
        file's column.

    Raises:
        ValueError: if ``fileL`` and ``labelL`` differ in length.
    """
    # zip() would silently drop the surplus files or labels.
    if len(fileL) != len(labelL):
        raise ValueError(
            "Got %d files but %d labels; exactly one label per file is required"
            % (len(fileL), len(labelL)))

    # pandas expects a row number (0) or None, not a boolean-like flag.
    header_row = 0 if header else None

    columns = []
    for path, label in zip(fileL, labelL):
        table = pd.read_table(path, header=header_row, index_col=0,
                              skiprows=skip_line, usecols=[0, col_number])
        table.columns = [label]
        table.index.name = "ENSG"
        columns.append(table)
    return pd.concat(columns, axis=1)
#----------------------------------------------------------
def main():
    """Merge per-sample count tables into one matrix and write it out.

    Reads the options from ``sys.argv`` via ``cmdparameter``, extracts the
    requested column from every input file with ``readAndMergeMatrix``,
    replaces missing values with 0, drops rows that have no value greater
    than 0 in any sample, and writes the result as a tab-separated table to
    the ``-o`` file (or to stdout when ``-o`` is not given).

    Raises:
        SystemExit: with a message when ``-l/--labels`` is missing.
    """
    global debug
    options, _ = cmdparameter(sys.argv)
    debug = options.debug
    #-----------------------------------
    # '+' rather than '*': a pattern that can match the empty string makes
    # re.split cut between every character on Python 3.7+.  Empty fields
    # (e.g. from a trailing comma) are dropped.
    separator = re.compile(r'[, ]+')
    fileL = [name for name in separator.split(options.filein.strip()) if name]
    if options.label is None:
        sys.exit("Labels are required: give one label per file with -l/--labels")
    labelL = [name for name in separator.split(options.label.strip()) if name]

    header = options.header
    skip_line = options.skip_line
    # The command line counts columns from 1; pandas counts from 0.
    col_number = options.col_number - 1
    #-----------------------------------
    mergeM = readAndMergeMatrix(fileL, labelL, header, skip_line, col_number)
    mergeM = mergeM.fillna(0)
    # Keep only rows with a positive value in at least one sample.
    mergeM = mergeM.loc[(mergeM > 0).any(axis=1)]
    # to_csv(None) would build the text and throw it away, so fall back to
    # stdout.  str("\t") is a native string on both Python 2 (where
    # unicode_literals is active) and Python 3; a bytes separator is
    # rejected on Python 3.
    destination = options.out_file if options.out_file is not None else sys.stdout
    mergeM.to_csv(destination, sep=str("\t"))
#--------------------------------------------
if __name__ == '__main__':
    # Time the run and append one record (command line plus start/end
    # timestamps) to python.log in the current working directory.
    startTime = strftime(timeformat, localtime())
    main()
    endTime = strftime(timeformat, localtime())
    # `with` closes the log even if the write fails; write() replaces the
    # Python-2-only `print >>fh` and adds the newline print used to append.
    with open('python.log', 'a') as fh:
        fh.write("%s\n\tRun time : %s - %s \n" %
                 (' '.join(sys.argv), startTime, endTime))
###---------profile the program---------
#import profile
#profile_output = sys.argv[0]+".prof.txt"
#profile.run("main()", profile_output)
#import pstats
#p = pstats.Stats(profile_output)
#p.sort_stats("time").print_stats()
###---------profile the program---------