-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmakeGraphFromOverlap.py
executable file
·136 lines (107 loc) · 4.32 KB
/
makeGraphFromOverlap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env python3
'''
Given the ground truth and overlap files, construct a graph-tool directed
graph where the vertices are reads. Two reads are connected by a weighted
edge if they share some seeds where the weight is the number of shared seeds.
The direction of the edge i->j means tail of read i overlap with head of read j.
Additional edges may be added according to the ground truth with weight 0.
To ignore such edges, apply an edge filter with edge property 'type' (see below).
Vertices have an int property map 'id' for their corresponding read id.
Edges have two property maps, {'weight', int} and {'type', 'int8_t'},
where type=0 means this edge is a false positive overlapping pair;
type=1 means the two reads indeed overlap; type=2 means the overlapping pair
(i,j) is irreducible (i.e., there is no read k such that tail of i overlap
with head of k, and tail of k overlap with head of j).
'''
import sys
import os.path
import itertools
import numpy as np
from graph_tool.all import *
def main(argc, argv):
dir = "sample-reads"
header_ext = "header-sorted"
irr_ext = "irr-edges-directed"
true_ext = "truepairs-directed"
found_ext = "all-pair"
if argc < 3 or argc > 4:
example = "sim-e-coli-pb-le20k-nosub"
print("Usage: makeGraphFromOverlap.py <sample-basename> <overlap-basename> [overlap-extension]")
print(f'Example: makeGraphFromOverlap.py {example} {example}-n60-k42-t0/unionPos 10')
print(f'Will use files {dir}/{example}.{{{header_ext},{true_ext},{irr_ext}}} and {dir}/{example}-n60-k42-t0/unionPos.{found_ext}10')
exit(1)
output_filename = f'{dir}/{argv[2]}.graphml'
if argc == 4:
found_ext += argv[3]
output_filename += argv[3]
#check all needed files exist
header_name = f'{dir}/{argv[1]}.{header_ext}'
if not os.path.isfile(header_name):
print(f'Cannot read node file: {header_name}')
exit(1)
true_name = f'{dir}/{argv[1]}.{true_ext}'
if not os.path.isfile(true_name):
print(f'Cannot read true pair file: {true_name}')
exit(1)
irr_name = f'{dir}/{argv[1]}.{irr_ext}'
if not os.path.isfile(irr_name):
print(f'Cannot read irr pair file: {irr_name}')
exit(1)
found_name = f'{dir}/{argv[2]}.{found_ext}'
if not os.path.isfile(found_name):
print(f'Cannot read found all file: {found_name}')
exit(1)
#generate graph-tool graph
g = Graph()
g.set_fast_edge_removal(True) #add edge becomes O(1) with O(E) additional space
#create nodes from the sorted header file
try:
with open(header_name) as f:
vmap = {int(read):int(idx) for read,idx in
zip([line.split()[0][1:] for line in f.readlines()],
itertools.count(0,1))}
except OSError as e:
print(e.strerror)
exit(1)
g.add_vertex(len(vmap))
g.vertex_properties["id"] = g.new_vertex_property("int")
# dict preserves input order since 3.7, otherwise use
# g.vp.id.a = list(map(lambda kv: kv[0], sorted(vmap.items(), key=lambda kv: kv[1])))
g.vp.id.a = list(vmap.keys())
#create all edges from the found_all file
try:
with open(found_name) as f:
g.add_edge_list(list(map(lambda x:(vmap[int(x[0])],
vmap[int(x[1])],
int(x[2])),
(line.split() for line in f.readlines()))),
eprops=[("weight", "int")])
except OSError as e:
print(e.strerror)
exit(1)
# 0 - wrong, 1 - correct, 2 - irreducible
g.ep["type"] = g.new_edge_property("int8_t")
try:
with open(true_name) as f:
for line in f.readlines():
l = line.split()
s = vmap[int(l[0])]
t = vmap[int(l[1])]
e = g.edge(s, t)
if e is None:
e = g.add_edge(s, t)
g.ep.type[e] = 1
with open(irr_name) as f:
for line in f.readlines():
l = line.split()
s = vmap[int(l[0])]
t = vmap[int(l[1])]
e = g.edge(s, t) # must exist as all true edges have been added
g.ep.type[e] = 2
except OSError as e:
print(e.strerror)
exit(1)
g.save(output_filename, fmt='graphml')
# end of main
if __name__ == "__main__":
sys.exit(main(len(sys.argv), sys.argv))