-
Notifications
You must be signed in to change notification settings - Fork 0
/
join_lobstr.py
executable file
·100 lines (79 loc) · 3.25 KB
/
join_lobstr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/usr/bin/env python
"""Try to join candidates from Tae-min's pipeline to lobSTR's calls.
I'm not sure yet whether a missing call from lobSTR should lower our
confidence."""
import sys
from bx.intervals.intersection import IntervalTree, Interval
from collections import defaultdict
def in_tree(tree, start, end):
"""Return true if (`start`, `end`) overlaps any interval in `tree`."""
return len(tree.find(int(start), int(end))) > 0
def build_lobstr_tree(f):
"""f is an open file handle to a lobSTR genotypes.tab file"""
# Each sample has a dictionary of trees, one per chromosome
colnames = []
#chrs = defaultdict(lambda: IntervalTree())
chrs = {}
for line in f:
# Skip comments
if line.strip()[0] == '#':
continue
fields = map(str.strip, line.split('\t'))
# Skip the header
if fields[0] == 'chr':
colnames = fields
continue
# When we want to add lobSTR data to the tree, use other= to add
# the whole line.
if fields[0] not in chrs.keys():
chrs[fields[0]] = IntervalTree()
chrs[fields[0]].insert(int(fields[1]), int(fields[2]), zip(colnames, fields))
return chrs
def in_lobstr(lobstr_trees, sample, chrom, start, end):
"""Return True if the (chrom, start, end) interval intersects with a
lobSTR interval for the specified `sample`."""
try:
return in_tree(lobstr_trees[sample][chrom], start, end)
except KeyError:
warnings.warn("no tree for sample=%s, chrom=%s" % (sample, chrom),
RuntimeWarning)
return False
# For each sample, read in lobSTR's calls and build an interval tree for
# quick lookups.
def read_lobstr(samples):
trees = {}
for s in samples:
with open('lobstr/%s.genotypes.tab' % s, 'r') as f:
sys.stderr.write('building tree for ' + s + '\n')
trees[s] = build_lobstr_tree(f)
return trees
# headers should be in the table, but the format should be roughly:
# chr start end (zygosity, support)*N
# where there are N pairs of zygosity and support columns. Zygosity
# can be one of NA, hom, het or nocall. Support is NA or a list of
# x:y values, where x is the repeat unit length and y is the number
# of reads supporting that repeat unit length.
colnames = []
if __name__ == '__main__':
if len(sys.argv) != 2:
print('usage: %s tab_separated_table')
exit(1)
#sys.setrecursionlimit(10000) # For IntervalNode
samples = [ 'heart_bulk', 'neuron_2', 'neuron_3', 'neuron_51', 'neuron_6']
trees = read_lobstr(samples)
# read in the results from Tae-min's sputnik pipeline
tsv_f = open(sys.argv[1].strip(), 'r')
first_line = True
for line in tsv_f:
fields = map(str.strip, line.split('\t'))
# Handle the header: determine which column indexes contain the data
# we care about.
if first_line:
colnames = fields + [ 'in.lobstr.' + s for s in samples ]
print('\t'.join(colnames))
first_line = False
continue
d = dict(zip(colnames, fields))
found = [ str(in_lobstr(trees, s, 'chr'+d['chr'], d['start'], d['end']))
for s in samples ]
print('\t'.join(fields + found))