forked from ding-lab/neoscan
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathguess-encoded.py
66 lines (55 loc) · 2 KB
/
guess-encoded.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
"""
awk 'NR % 4 == 0' your.fastq | python %prog [options]
guess the encoding of a stream of qual lines.
"""
import sys
import optparse
# Known quality encodings, keyed by name.  Each value is the inclusive
# (lowest, highest) character code -- as produced by ord() -- that a quality
# character is allowed to take under that encoding.
RANGES = dict([
    ('Sanger', (33, 73)),
    ('Solexa', (59, 104)),
    ('Illumina-1.3', (64, 104)),
    ('Illumina-1.5', (67, 104)),
])
def get_qual_range(qual_str):
    """
    Return the lowest and highest character codes found in a quality string.

    qual_str is a string of quality characters; each character is converted
    with ord() and the result is the tuple (smallest code, largest code).

    Raises ValueError when qual_str contains no characters, because no
    range can be derived from it.

    >>> get_qual_range("DLXYXXRXWYYTPMLUUQWTXTRSXSWMDMTRNDNSMJFJFFRMV")
    (68, 89)
    """
    vals = [ord(c) for c in qual_str]
    if not vals:
        # min()/max() would raise the same exception type, but with a message
        # that says nothing about the quality line that caused it.
        raise ValueError("cannot determine a quality range from an empty "
                         "quality string")
    return min(vals), max(vals)
def get_encodings_in_range(rmin, rmax, ranges=RANGES):
    """
    Return the names of every encoding whose span contains [rmin, rmax].

    ranges maps an encoding name to an inclusive (lowest, highest) pair of
    character codes.  An encoding is kept when rmin is not below its lowest
    code and rmax is not above its highest code.  Names come back in the
    iteration order of ranges; the list is empty when nothing fits.
    """
    return [name for name, (lowest, highest) in ranges.items()
            if lowest <= rmin and rmax <= highest]
def _format_result(encodings, qmin, qmax):
    """Build one output line: tab-joined encoding names, a tab, the range."""
    return "\t".join(encodings) + "\t" + str((qmin, qmax)) + "\n"


def main():
    """
    Read quality lines from stdin and print the encodings that fit them.

    The smallest and largest character codes seen so far are tracked across
    lines.  Whenever that span widens, the candidate encodings are
    recomputed with get_encodings_in_range().  The run ends when:

    * no encoding fits the span: a message goes to stderr and the process
      exits;
    * exactly one encoding fits and -n was left at -1: the result is
      printed and the process exits early;
    * -n is positive and that many non-blank lines have been tested: the
      current candidates are printed and the process exits;
    * stdin is exhausted: the current candidates are printed.

    Output on stdout is a single line holding the tab-separated encoding
    names, a tab, and the (min, max) tuple.  Blank lines are skipped and
    are not counted towards -n.  If no quality line was read at all, a
    message is written to stderr and nothing is printed on stdout.
    """
    p = optparse.OptionParser(__doc__)
    p.add_option("-n", dest="n", help="number of qual lines to test default:-1"
                 " means test until end of file or until it is possible to"
                 " determine a single file-type",
                 type='int', default=-1)
    opts, args = p.parse_args()

    # sys.std*.write is used instead of the print statement so the script
    # runs unchanged under both Python 2 and Python 3.
    sys.stderr.write("# reading qualities from stdin\n")

    # None means "nothing seen yet"; a numeric sentinel would be reported as
    # a bound whenever every observed character lies beyond it.
    gmin = gmax = None
    valid = []
    tested = 0
    for line in sys.stdin:
        qual = line.rstrip()
        if not qual:
            # A blank line carries no quality characters to measure.
            continue
        tested += 1

        lmin, lmax = get_qual_range(qual)
        if gmin is None or lmin < gmin or lmax > gmax:
            gmin = lmin if gmin is None else min(lmin, gmin)
            gmax = lmax if gmax is None else max(lmax, gmax)
            valid = get_encodings_in_range(gmin, gmax)
            if not valid:
                sys.stderr.write("no encodings for range: %s\n"
                                 % str((gmin, gmax)))
                # NOTE(review): exits with status 0 even though detection
                # failed; kept as-is so existing pipelines behave the same.
                sys.exit()
            if len(valid) == 1 and opts.n == -1:
                sys.stdout.write(_format_result(valid, gmin, gmax))
                sys.exit()

        # Stop once exactly opts.n lines have been tested.
        if opts.n > 0 and tested >= opts.n:
            sys.stdout.write(_format_result(valid, gmin, gmax))
            sys.exit()

    if gmin is None:
        sys.stderr.write("no quality lines read from stdin\n")
        return
    sys.stdout.write(_format_result(valid, gmin, gmax))
# Script entry point: the doctests embedded in this module are executed on
# every run, and main() is started only when none of them failed.
if __name__ == "__main__":
    import doctest
    if not doctest.testmod(
            optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE).failed:
        main()