-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_blat.py
executable file
·65 lines (44 loc) · 2.57 KB
/
parse_blat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import sys
import collections
from optparse import OptionParser
def cb(option, opt_str, value, parser):
args=[]
for arg in parser.rargs:
if arg[0] != "-":
args.append(arg)
else:
del parser.rargs[:len(args)]
break
if getattr(parser.values, option.dest):
args.extend(getattr(parser.values, option.dest))
setattr(parser.values, option.dest, args)
parser = OptionParser()
parser.add_option("--blat_files", action="callback", callback=cb, dest="blat_files", help="blat files to parse")
parser.add_option("--gap_size", dest="gap_size", type = 'int', help="only alignments with gap size or less gaps will be selected")
parser.add_option("--precent_alignment", '-p', dest="percent_aligned", type = 'float', help = "pairs of alignments higher than this will be selected",)
parser.add_option("--length", "-l", dest="length", type = 'int', help = "Only blatted sequences large than length will be selected")
parser.add_option("--padding", dest="padding", type = 'int', help = "Padding for the outputed bed file (so we can find an ORF)")
(options, args) = parser.parse_args()
psl = collections.namedtuple('psl', 'matches, misMatches, repMatches, nCount, qNumInsert, qBaseInsert, tNumInsert, tBaseInsert, strand, qName, qSize, qStart, qEnd, tName, tSize, tStart, tEnd, blockCount,blockSizes, qStarts, tStarts, qSeqs, tSeqs')
#parses out header
pad = options.padding
#print blat_files
for blat_file in options.blat_files:
print >> sys.stderr, blat_file
chr = blat_file.strip().split(".")[1]
blat_file = open(blat_file)
blat_file.next()
blat_file.next().strip().split("\t")
blat_file.next().strip().split("\t")
blat_file.next().strip().split("\t")
blat_file.next().strip().split("\t")
for line in blat_file:
psl_result = psl._make(line.strip().split("\t"))
#print only if there are no gaps and more than 90% of the target and query match
start, stop = [int(x) for x in psl_result.qName.split(",")[1].split("_")[0].split("-")]
len = (stop - start) / 3 #convert to NT lengths
aligned = int(psl_result.matches) + int(psl_result.misMatches) * 1.0
percent_aligned = aligned / len
if percent_aligned > options.percent_aligned and len > options.length and int(psl_result.tBaseInsert) < options.gap_size:
start = int(psl_result.tStart) + 1
print "\t".join([chr, str(start - pad), str(int(psl_result.tEnd) + pad), psl_result.qName, "0", psl_result.strand[1]])