-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHTSeqQA_reducer.py
executable file
·198 lines (163 loc) · 6.3 KB
/
HTSeqQA_reducer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
#!/usr/bin/env python
#
#
# """HTSeqQA_reducer.py"""
#
#
__author__ ="Alexey Siretskiy"
__email__ ="[email protected]"
__date__ ="2014"
__doc__ ="""This script is a part of the HTSeq-Hadoo package for analyzing high-throughput sequencing reads or ``SAM`` files. The script reads the data from ``STDIN`` and producing plots showing the distribution of called bases and base-call quality scores by position within the reads. The plots are sent to ``STDOUT`` in the ``SVG`` format. Should be used with :mod:`HTSeq_Hadoop.HTSeqQA_mapper` script."""
#%s, IT dept, Uppsala Univervity, Sweden, %s, %s""" %(__author__ ,__email__, __date__)
import sys, time, os.path, argparse
from itertools import *
import numpy as np
import HTSeq
from operator import itemgetter
def read_mapper_output(file, separator='\t'):
"""
reads the STDIN lazily and ouputs the **<key,value>** separates by TAB
"""
for line in file:
yield line.rstrip().split(separator, 1)
try:
import matplotlib
except ImportError:
sys.stderr.write("This script needs the 'matplotlib' library, which ")
sys.stderr.write("was not found. Please install it." )
matplotlib.use('svg')
from matplotlib import pyplot
# **** Normalize result ****
def norm_by_pos( arr ):
"""
normalizes array by dividing each element in the row by the sum of the row
:param arr: 2D array
:type arr: `numpy.int`
:return arr: normalized array
:rtype: `numpy.float`
"""
arr = np.array( arr, np.float )
arr_n = ( arr.T / arr.sum( 1 ) ).T
arr_n[ arr == 0 ] = 0
return arr_n
def norm_by_start( arr ):
"""
normalizes array by the first element of the array, resulted after summing up elements in the rows
:param arr: 2D array
:type arr: `numpy.int`
:return arr: arr -- normalized array
:rtype: `numpy.float`
"""
arr = np.array( arr, np.float )
arr_n = ( arr.T / arr.sum( 1 )[ 0 ] ).T
arr_n[ arr == 0 ] = 0
return arr_n
def plot_bases( arr ):
"""
prepares the Matplotlib canvas
"""
readlen = len(arr)
print readlen
xg = np.arange( readlen )
pyplot.plot( xg, arr[ : , 0 ], marker='.', color='red')
pyplot.plot( xg, arr[ : , 1 ], marker='.', color='darkgreen')
pyplot.plot( xg, arr[ : , 2 ], marker='.',color='lightgreen')
pyplot.plot( xg, arr[ : , 3 ], marker='.',color='orange')
pyplot.plot( xg, arr[ : , 4 ], marker='.',color='grey')
pyplot.axis( (0, readlen-1, 0, 1 ) )
pyplot.text( readlen*.70, .9, "A", color="red" )
pyplot.text( readlen*.75, .9, "C", color="darkgreen" )
pyplot.text( readlen*.80, .9, "G", color="lightgreen" )
pyplot.text( readlen*.85, .9, "T", color="orange" )
pyplot.text( readlen*.90, .9, "N", color="grey" )
def main(separator='\t'):
# **** Parse command line ****
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument( "-g", "--gamma", type=float, dest="gamma", default = 0.3,\
help="the gamma factor for the contrast adjustment of the quality score plot, default 0.3" )
opts = parser.parse_args()
gamma = opts.gamma
twoColumns = True
# input comes from STDIN (standard input)
data = read_mapper_output(sys.stdin, separator=separator)
# groupby groups multiple word-count pairs by word,
# and creates an iterator that returns consecutive keys and their group:
# current_word - string containing a word (the key)
# group - iterator yielding all ["<current_word>","<count>"] items
former_element = None
array_types = {}
for current_element, group in groupby(data, itemgetter(0)):
try:
total_count = sum(np.fromstring(line, sep=' ', dtype=int) for current_element, line in group)
# print "%s%s%s" % (current_element, separator, ' '.join(str(x) for x in total_count))
# print current_element, total_count
if not former_element:
arr = total_count
former_element = current_element
else:
if current_element.split('_')[:-1] == former_element.split('_')[:-1]:
arr = np.vstack([arr,total_count])
else:
#print 'change array', former_element, current_element
array_types['_'.join(former_element.split('_')[:-1])] = arr
former_element = current_element
arr = total_count
except ValueError:
# count was not a number, so silently discard this item
pass
array_types['_'.join(former_element.split('_')[:-1])] = arr
base_arr_U_n = norm_by_pos( array_types['base_arr_U'] )
qual_arr_U_n = norm_by_start( array_types['qual_arr_U'] )
nreads_U = array_types['nreads_U'] # base_arr_U[0,:].sum()
if twoColumns:
base_arr_A_n = norm_by_pos( array_types['base_arr_A'] )
qual_arr_A_n = norm_by_start( array_types['qual_arr_A'] )
nreads_A = array_types['nreads_A'] #base_arr_A[0,:].sum()
# get the read length from the data
readlen = len(base_arr_A_n)
#get the max qual from the data
max_qual = len(qual_arr_U_n[0,:])
# print readlen, max_qual, len(base_arr_U_n)
# sys.exit()
# print "number of aligned reads", base_arr_A[50,:].sum()
# print "number of unaligned reads", base_arr_U[60,:].sum()
#
#
## **** Make plot ****
#
pyplot.figure()
pyplot.subplots_adjust( top=.85 )
pyplot.suptitle( os.path.basename("readfilename"), fontweight='bold' )
if twoColumns:
pyplot.subplot( 221 )
plot_bases( base_arr_U_n )
pyplot.ylabel( "proportion of base" )
pyplot.title( "non-aligned reads\n%.0f%% (%.3f million)" %( 100. * nreads_U / (nreads_U+nreads_A), nreads_U / 1e6 ) )
pyplot.subplot( 222 )
plot_bases( base_arr_A_n )
pyplot.title( "aligned reads\n%.0f%% (%.3f million)" %( 100. * nreads_A / (nreads_U+nreads_A), nreads_A / 1e6 ) )
pyplot.subplot( 223 )
pyplot.pcolor( qual_arr_U_n.T ** gamma, cmap=pyplot.cm.Greens,
norm=pyplot.normalize( 0, 1 ) )
pyplot.axis( (0, readlen-1, 0, max_qual+1 ) )
pyplot.xlabel( "position in read" )
pyplot.ylabel( "base-call quality score" )
pyplot.subplot( 224 )
pyplot.pcolor( qual_arr_A_n.T ** gamma, cmap=pyplot.cm.Greens,
norm=pyplot.normalize( 0, 1 ) )
pyplot.axis( (0, readlen-1, 0, max_qual+1 ) )
pyplot.xlabel( "position in read" )
else:
pyplot.subplot( 211 )
plot_bases( base_arr_U_n )
pyplot.ylabel( "proportion of base" )
pyplot.title( "%.3f million reads" % ( nreads_U / 1e6 ) )
pyplot.subplot( 212 )
pyplot.pcolor( qual_arr_U_n.T ** gamma, cmap=pyplot.cm.Greens,
norm=pyplot.normalize( 0, 1 ) )
pyplot.axis( (0, readlen-1, 0, max_qual+1 ) )
pyplot.xlabel( "position in read" )
pyplot.ylabel( "base-call quality score" )
pyplot.savefig( sys.stdout )
if __name__ == "__main__":
main()