-
Notifications
You must be signed in to change notification settings - Fork 0
/
HA.py
107 lines (88 loc) · 2.72 KB
/
HA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import argparse
from src import AlignedDB
from src import AlignedDBPreprocessor
from src import ILPInputPreprocessor
from src import ILPMinimizer
from src import Util
parser = argparse.ArgumentParser(
description="Simple maximum parsimony Haplotype Assembler")
parser.add_argument(
"--ref",
help="path to reference genome",
type=str
)
parser.add_argument(
"--reads",
nargs="+",
help="path to origin read",
type=str
)
parser.add_argument(
"--format",
help="reads file format",
type=str,
default="fasta"
)
parser.add_argument(
"--k",
help="length of k-mer for De Bruijn graph",
default=61,
type=int
)
parser.add_argument(
"--o",
help="path to my_output file",
type=str,
default="./haplotypes"
)
if __name__ == '__main__':
args = vars(parser.parse_args())
path_to_reference = args["ref"]
paths_to_reads = args["reads"]
file_extension = args["format"]
k = args['k']
out_path = args['o']
# Build aligned De Bruijn graph by reference genome and reads
aligned_db = AlignedDB.AlignedDB(
path_to_reference=path_to_reference,
path_to_reads=paths_to_reads,
read_format=file_extension,
k_mer_len=k
)
aligned_db.build()
# Make normalization of aligned De Bruijn graph
prep = AlignedDBPreprocessor.AlignedDBPreprocessor(aligned_db, 1 - 10**-4)
prep.normalize_parallel()
prep.mean_by_path_parallel()
prep.eriksson_clear()
print(prep.aligned_db.number_of_edges())
aligned_db = prep.aligned_db
# Find optimized system
ilp_prep = ILPInputPreprocessor.DataPreprocessor(aligned_db)
haplotypes, _ = ilp_prep.find_haplotypes()
print(len(ilp_prep.haplotypes_edges))
# Find optimal frequencies
minimizer = ILPMinimizer.ILPMinimizer(
aligned_db, ilp_prep.haplotypes_edges)
print(prep.eriksson_threshold)
minimizer.find_alpha(prep.eriksson_threshold)
_, result = minimizer.find_frequencies()
indexes = dict()
for k, v in ilp_prep.haplotypes_edges.items():
indexes[k] = (v[0][0].pos, v[-1][-1].pos + aligned_db.k)
assembled = []
for h, f in result.items():
if f > prep.eriksson_threshold / 10:
left, right = indexes[h][0], indexes[h][1]
h = aligned_db.ref[:left] + h + aligned_db.ref[right:]
assembled.append((h, f))
result = Util.get_normalize_pair_list(assembled)
with open('{}.fa'.format(out_path), 'w') as haplotypes_file:
for idx, (haplotype, frequency) in enumerate(result):
haplotypes_file.write(
'>{} freq={}\n{}\n'.format(
str(idx),
frequency,
haplotype
)
)