-
Notifications
You must be signed in to change notification settings - Fork 120
/
aligner_driver.h
247 lines (224 loc) · 8.24 KB
/
aligner_driver.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
/*
* Copyright 2012, Ben Langmead <[email protected]>
*
* This file is part of Bowtie 2.
*
* Bowtie 2 is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Bowtie 2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* aligner_driver.h
*
* REDUNDANT SEED HITS
*
* We say that two seed hits are redundant if they trigger identical
* seed-extend dynamic programming problems. Put another way, they both lie on
* the same diagonal of the overall read/reference dynamic programming matrix.
* Detecting redundant seed hits is simple when the seed hits are ungapped. We
* do this after offset resolution but before the offset is converted to genome
* coordinates (see uses of the seenDiags1_/seenDiags2_ fields for examples).
*
* REDUNDANT ALIGNMENTS
*
* In an unpaired context, we say that two alignments are redundant if they
* share any cells in the global DP table. Roughly speaking, this is like
* saying that two alignments are redundant if any read character aligns to the
* same reference character (same reference sequence, same strand, same offset)
* in both alignments.
*
* In a paired-end context, we say that two paired-end alignments are redundant
* if the mate #1s are redundant and the mate #2s are redundant.
*
* How do we enforce this? In the unpaired context, this is relatively simple:
* the cells from each alignment are checked against a set containing all cells
* from all previous alignments. Given a new alignment, for each cell in the
* new alignment we check whether it is in the set. If there is any overlap,
* the new alignment is rejected as redundant. Otherwise, the new alignment is
* accepted and its cells are added to the set.
*
* Enforcement in a paired context is a little trickier. Consider the
* following approaches:
*
* 1. Skip anchors that are redundant with any previous anchor or opposite
* alignment. This is sufficient to ensure no two concordant alignments
* found are redundant.
*
* 2. Same as scheme 1, but with a "transitive closure" scheme for finding all
* concordant pairs in the vicinity of an anchor. Consider the AB/AC
* scenario from the previous paragraph. If B is the anchor alignment, we
* will find AB but not AC. But under this scheme, once we find AB we then
* let B be a new anchor and immediately look for its opposites. Likewise,
* if we find any opposite, we make them anchors and continue searching. We
* don't stop searching until every opposite is used as an anchor.
*
* 3. Skip anchors that are redundant with any previous anchor alignment (but
* allow anchors that are redundant with previous opposite alignments).
* This isn't sufficient to avoid redundant concordant alignments. To avoid
* redundant concordants, we need an additional procedure that checks each
* new concordant alignment one-by-one against a list of previous concordant
* alignments to see if it is redundant.
*
* We take approach 1.
*/
#ifndef ALIGNER_DRIVER_H_
#define ALIGNER_DRIVER_H_
#include "aligner_seed2.h"
#include "simple_func.h"
#include "aln_sink.h"
/**
* Concrete subclass of DescentRootSelector. Puts a root every 'ival' chars,
* where 'ival' is determined by user-specified parameters. A root is filtered
* out if the end of the read is less than 'landing' positions away, in the
* direction of the search.
*/
class AlignerDriverRootSelector : public DescentRootSelector {
public:
AlignerDriverRootSelector(
double consExp,
const SimpleFunc& rootIval,
size_t landing)
{
consExp_ = consExp;
rootIval_ = rootIval;
landing_ = landing;
}
virtual ~AlignerDriverRootSelector() { }
virtual void select(
const Read& q, // read that we're selecting roots for
const Read* qo, // opposite mate, if applicable
bool nofw, // don't add roots for fw read
bool norc, // don't add roots for rc read
EList<DescentConfig>& confs, // put DescentConfigs here
EList<DescentRoot>& roots); // put DescentRoot here
protected:
double consExp_;
SimpleFunc rootIval_;
size_t landing_;
};
/**
* Return values from extendSeeds and extendSeedsPaired.
*/
enum {
// Candidates were examined exhaustively
ALDRIVER_EXHAUSTED_CANDIDATES = 1,
// The policy does not need us to look any further
ALDRIVER_POLICY_FULFILLED,
// We stopped because we ran up against a limit on how much work we should
// do for one set of seed ranges, e.g. the limit on number of consecutive
// unproductive DP extensions
ALDRIVER_EXCEEDED_LIMIT
};
/**
* This class is the glue between a DescentDriver and the dynamic programming
* implementations in Bowtie 2. The DescentDriver is used to find some very
* high-scoring alignments, but is additionally used to rank partial alignments
* so that they can be extended using dynamic programming.
*/
template <typename index_t>
class AlignerDriver {
public:
AlignerDriver(
double consExp,
const SimpleFunc& rootIval,
size_t landing,
bool veryVerbose,
const SimpleFunc& totsz,
const SimpleFunc& totfmops) :
sel_(consExp, rootIval, landing),
alsel_(),
dr1_(veryVerbose),
dr2_(veryVerbose)
{
totsz_ = totsz;
totfmops_ = totfmops;
}
/**
* Initialize driver with respect to a new read or pair.
*/
void initRead(
const Read& q1,
bool nofw,
bool norc,
TAlScore minsc,
TAlScore maxpen,
const Read* q2)
{
dr1_.initRead(q1, nofw, norc, minsc, maxpen, q2, &sel_);
red1_.init(q1.length());
paired_ = false;
if(q2 != NULL) {
dr2_.initRead(*q2, nofw, norc, minsc, maxpen, &q1, &sel_);
red2_.init(q2->length());
paired_ = true;
} else {
dr2_.reset();
}
size_t totsz = totsz_.f<size_t>(q1.length());
size_t totfmops = totfmops_.f<size_t>(q1.length());
stop_.init(
totsz,
0,
true,
totfmops);
}
/**
* Start the driver. The driver will begin by conducting a best-first,
* index-assisted search through the space of possible full and partial
* alignments. This search may be followed up with a dynamic programming
* extension step, taking a prioritized set of partial SA ranges found
* during the search and extending each with DP. The process might also be
* iterated, with the search being occasioanally halted so that DPs can be
* tried, then restarted, etc.
*/
int go(
const Scoring& sc,
const GFM<index_t>& gfmFw,
const GFM<index_t>& gfmBw,
const BitPairReference& ref,
DescentMetrics& met,
WalkMetrics& wlm,
PerReadMetrics& prm,
RandomSource& rnd,
AlnSinkWrap<index_t>& sink);
/**
* Reset state of all DescentDrivers.
*/
void reset() {
dr1_.reset();
dr2_.reset();
red1_.reset();
red2_.reset();
}
protected:
AlignerDriverRootSelector sel_; // selects where roots should go
DescentAlignmentSelector<index_t> alsel_; // one selector can deal with >1 drivers
DescentDriver<index_t> dr1_; // driver for mate 1/unpaired reads
DescentDriver<index_t> dr2_; // driver for paired-end reads
DescentStoppingConditions stop_; // when to pause index-assisted BFS
bool paired_; // current read is paired?
SimpleFunc totsz_; // memory limit on best-first search data
SimpleFunc totfmops_; // max # FM ops for best-first search
// For detecting redundant alignments
RedundantAlns red1_; // database of cells used for mate 1 alignments
RedundantAlns red2_; // database of cells used for mate 2 alignments
// For AlnRes::matchesRef
ASSERT_ONLY(SStringExpandable<char> raw_refbuf_);
ASSERT_ONLY(SStringExpandable<uint32_t> raw_destU32_);
ASSERT_ONLY(EList<bool> raw_matches_);
ASSERT_ONLY(BTDnaString tmp_rf_);
ASSERT_ONLY(BTDnaString tmp_rdseq_);
ASSERT_ONLY(BTString tmp_qseq_);
ASSERT_ONLY(EList<index_t> tmp_reflens_);
ASSERT_ONLY(EList<index_t> tmp_refoffs_);
};
#endif /* defined(ALIGNER_DRIVER_H_) */