-
Notifications
You must be signed in to change notification settings - Fork 29
/
filter_gap_SVs.py
136 lines (105 loc) · 4.89 KB
/
filter_gap_SVs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env python
import re
from intervaltree import IntervalTree
from ragoo_utilities.SeqReader import SeqReader
class BaseSequence(object):
def __init__(self, in_sequence):
if not isinstance(in_sequence, str):
raise AttributeError('Only a string can be used to instantiate this class.')
self.sequence = in_sequence.upper()
class GapSequence(BaseSequence):
def get_gap_coords(self):
""" Find all of the gap string indices for this sequence. """
return re.finditer(r'N+', self.sequence)
def make_gaps_tree(in_file):
# A dictionary to store an interval tree for each chromosome header.
all_trees = dict()
x = SeqReader(in_file)
if in_file.endswith(".gz"):
for header, sequence in x.parse_gzip_fasta():
# Remove the greater than sign and only get first token if delimited by spaces
header = header[1:].split(' ')[0]
all_trees[header] = IntervalTree()
gap_sequence = GapSequence(sequence)
all_coordinates = [(m.start(0), m.end(0)) for m in gap_sequence.get_gap_coords()]
for i in all_coordinates:
all_trees[header][i[0]:i[1]] = i
else:
for header, sequence in x.parse_fasta():
# Remove the greater than sign and only get first token if delimited by spaces
header = header[1:].split(' ')[0]
all_trees[header] = IntervalTree()
gap_sequence = GapSequence(sequence)
all_coordinates = [(m.start(0), m.end(0)) for m in gap_sequence.get_gap_coords()]
for i in all_coordinates:
all_trees[header][i[0]:i[1]] = i
return all_trees
def get_query_bed_coords(in_line):
c_loc = [m.start() for m in re.finditer(':', in_line)]
c1, c2 = c_loc[-1], c_loc[-2]
header = in_line[:c2]
L1 = in_line[c2+1:c1].split('-')
start, stop = int(L1[0]), int(L1[1])
return header, start, stop
def make_svs_bed(in_trees_q, in_trees_r):
final_lines = []
with open('assemblytics_out.Assemblytics_structural_variants.bed', 'r') as f:
# Make a new header column for the field I am about to create
header = f.readline().rstrip() + '\tquery_gap_ovlp'
final_lines.append(header)
# Calculate the percentage overlap for each structural variant.
for line in f:
pct_ovlp = 0
ovlp = 0
L1 = line.rstrip().split('\t')
head, start, end = get_query_bed_coords(L1[9])
query = in_trees_q[head][start:end]
if query:
for interval in query:
gap_start, gap_end= interval.begin, interval.end
# Calculate the amount these to intervals overlap
ovlp += max(0, min(end, gap_end) - max(start, gap_start))
pct_ovlp = ovlp/(end-start)
# Add the new value to the original line
L1.append(pct_ovlp)
L1 = [str(i) for i in L1]
final_lines.append('\t'.join(L1))
with open('assemblytics_out.Assemblytics_structural_variants.bed', 'w') as out_file:
out_file.write('\n'.join(final_lines))
# Now repeat but with respect to the reference
final_lines = []
with open('assemblytics_out.Assemblytics_structural_variants.bed', 'r') as f:
# Make a new header column for the field I am about to create
header = f.readline().rstrip() + '\tref_gap_ovlp'
final_lines.append(header)
# Calculate the percentage overlap for each structural variant.
for line in f:
pct_ovlp = 0
ovlp = 0
L1 = line.rstrip().split('\t')
head, start, end = L1[0], int(L1[1]), int(L1[2])
query = in_trees_r[head][start:end]
if query:
for interval in query:
gap_start, gap_end = interval.begin, interval.end
# Calculate the amount these to intervals overlap
ovlp += max(0, min(end, gap_end) - max(start, gap_start))
pct_ovlp = ovlp / (end - start)
# Add the new value to the original line
L1.append(pct_ovlp)
L1 = [str(i) for i in L1]
final_lines.append('\t'.join(L1))
with open('assemblytics_out.Assemblytics_structural_variants.bed', 'w') as out_file:
out_file.write('\n'.join(final_lines))
def main():
import argparse
parser = argparse.ArgumentParser(description='Annotate the SV bed file with the amount degree in which the SV overlaps with a gap')
parser.add_argument("ref_file", metavar="<ref.fasta>", type=str,help="reference fasta file")
args = parser.parse_args()
ref_file = args.ref_file
# Make a new bed file w.r.t the query
all_trees_q = make_gaps_tree('../ragoo.fasta')
all_trees_r = make_gaps_tree(ref_file)
make_svs_bed(all_trees_q, all_trees_r)
if __name__ == "__main__":
main()