Skip to content

Commit

Permalink
fix a bug on merging; add the alignment algorithm used by AGE
Browse files Browse the repository at this point in the history
  • Loading branch information
zhangzhen committed Dec 16, 2015
1 parent 37b8632 commit aed7b2b
Show file tree
Hide file tree
Showing 5 changed files with 200 additions and 8 deletions.
20 changes: 20 additions & 0 deletions Helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,26 @@ void cluster(const std::vector<T>& orig, std::vector<std::vector<T> >& clusters,
if (!buffer.empty()) clusters.push_back(buffer);
}

template<class T, class Compare>
void merge(const std::vector<T>& orig, std::vector<T>& results, Compare comp) {
std::vector<bool> removed(orig.size(), false);

for (size_t i = 0; i < orig.size() - 1; ++i) {
for (size_t j = i + 1; j < orig.size(); ++j) {
if (!removed[j] && comp(orig[i], orig[j])) {
removed[j] = true;
}
}
}

for (size_t i = 0; i < removed.size(); ++i) {
if (!removed[i]) {
results.push_back(orig[i]);
}
}

}

namespace Helper {
std::string getReferenceName(BamTools::BamReader& reader, int referenceId);
const int SVLEN_THRESHOLD = -50;
Expand Down
136 changes: 136 additions & 0 deletions Thirdparty/overlapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,15 @@ bool SequenceOverlap::isValid() const
return !cigar.empty() && match[0].isValid() && match[1].isValid();
}

bool SequenceOverlap::isQualified(int minOverlap, double minIdentity) const
{
if (getOverlapLength() >= minOverlap &&
getPercentIdentity() >= minIdentity * 100) {
return true;
}
return false;
}

//
double SequenceOverlap::getPercentIdentity() const
{
Expand Down Expand Up @@ -1178,3 +1187,130 @@ std::string Overlapper::compactCigar(const std::string& ecigar)
compact_cigar << curr_run << curr_symbol;
return compact_cigar.str();
}


SequenceOverlap Overlapper::ageAlign(const std::string &s1, const std::string &s2, const ScoreParam &score_param)
{
SequenceOverlap output;

const int NONE = 0;
const int DIAGONAL = 1;
const int VERTICAL = 2;
const int HORIZONTAL = 3;

int orientation_table[] = {NONE, DIAGONAL, VERTICAL, HORIZONTAL};
int orientation_table_m[] = {NONE, HORIZONTAL, VERTICAL};

output.length[0] = s1.size();
output.length[1] = s2.size();

size_t num_columns = s1.size() + 1;
size_t num_rows = s2.size() + 1;

DPMatrix S(num_rows, DPCells(num_columns));
DPMatrix S_backtrace(num_rows, DPCells(num_columns));
DPMatrix S_lower(num_rows, DPCells(num_columns));
DPMatrix S_upper(num_rows, DPCells(num_columns));

// calculate score matrix
for (size_t i = 1; i < num_rows; ++i) {
for (size_t j = 1; j < num_columns; ++j) {
S_lower[i][j] = std::max(S_lower[i-1][j] - score_param.gap, S[i-1][j] - score_param.gap_start);
S_upper[i][j] = std::max(S_upper[i][j-1] - score_param.gap, S[i][j-1] - score_param.gap_start);
int middle_scores[] = {0, S[i-1][j-1] + score_param.matchChar(s1[j-1], s2[i-1]), S_lower[i][j], S_upper[i][j]};
const int N = sizeof(middle_scores) / sizeof(int);
auto max_it = std::max_element(middle_scores, middle_scores + N);
S[i][j] = *max_it;
S_backtrace[i][j] = orientation_table[std::distance(middle_scores, max_it)];
}
}

DPMatrix M_backtrace(num_rows, DPCells(num_columns));

for (size_t i = 1; i < num_rows; ++i) {
M_backtrace[i][0] = VERTICAL;
}

for (size_t j = 1; j < num_columns; ++j) {
M_backtrace[0][j] = HORIZONTAL;
}

// calculate maximum matrix
for (size_t i = 1; i < num_rows; ++i) {
for (size_t j = 1; j < num_columns; ++j) {
int scores[] = {S[i][j], S[i][j-1], S[i-1][j]};
const int N = sizeof(scores) / sizeof(int);
auto max_it = std::max_element(scores, scores + N);
S[i][j] = *max_it;
M_backtrace[i][j] = orientation_table_m[std::distance(scores, max_it)];
}
}

/*
for (size_t i = 0; i < num_rows; ++i) {
for (size_t j = 0; j < num_columns; ++j) {
std::cout << M_backtrace[i][j] << ' ';
}
std::cout << std::endl;
}
*/

int max_score = S[num_rows-1][num_columns-1];
int max_row_index = 0;
int max_column_index = 0;

for (size_t i = 0; i < num_rows; ++i) {
for (size_t j = 0; j < num_columns; ++j) {
if (M_backtrace[i][j] == NONE && S[i][j] == max_score) {
max_row_index = i;
max_column_index = j;
goto theEnd;
}
}
}

theEnd:
output.score = max_score;
output.match[0].end = max_column_index - 1;
output.match[1].end = max_row_index - 1;

#ifdef DEBUG_OVERLAPPER
printf("Endpoints selected: (%d %d) with score %d\n", output.match[0].end, output.match[1].end, output.score);
#endif

output.edit_distance = 0;
output.total_columns = 0;

int i = max_row_index;
int j = max_column_index;
std::string cigar;

while (S_backtrace[i][j] != NONE && i*j !=0) {
if (S_backtrace[i][j] == VERTICAL) {
cigar.push_back('I');
output.edit_distance += 1;
i--;
} else if(S_backtrace[i][j] == HORIZONTAL) {
cigar.push_back('D');
output.edit_distance += 1;
j--;
} else {
if (s1[j-1] != s2[i-1]) {
output.edit_distance += 1;
}
cigar.push_back('M');
i--;
j--;
}
output.total_columns += 1;
}

output.match[0].start = j;
output.match[1].start = i;

std::reverse(cigar.begin(), cigar.end());
assert(!cigar.empty());
output.cigar = compactCigar(cigar);

return output;
}
25 changes: 25 additions & 0 deletions Thirdparty/overlapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ struct SequenceOverlap
// Check that the record is properly formed
bool isValid() const;

// added by Zhen Zhang
bool isQualified(int minOverlap, double minIdentity) const;

// Return padded versions of the matching portions of the strings
void makePaddedMatches(const std::string& s1, const std::string& s2,
std::string* p1, std::string* p2) const;
Expand Down Expand Up @@ -134,6 +137,26 @@ struct OverlapperParams
int mismatch_penalty;
};

struct ScoreParam
{

ScoreParam(int match, int mismatch, int gap, int gap_start = 0) :
match(match), mismatch(mismatch), gap(gap), gap_start(gap_start) {

}

int matchChar(char a, char b) const {
if (a == b) return match;
return mismatch;
}

int match;
int mismatch;
int gap;
int gap_start;

};

// Global variables
extern OverlapperParams default_params; // { 2, -5, -3 };
extern OverlapperParams ungapped_params; // { 2, -10000, -3 };
Expand All @@ -151,6 +174,8 @@ SequenceOverlap computeOverlapSG(const std::string& s1, const std::string& s2, c
SequenceOverlap computeOverlapSW(const std::string& s1, const std::string& s2, int minOverlap, double minIdentity, const OverlapperParams params = default_params);
SequenceOverlap computeOverlapSW2(const std::string& s1, const std::string& s2, int minOverlap, double minIdentity, const OverlapperParams params = default_params);

SequenceOverlap ageAlign(const std::string& s1, const std::string& s2, const ScoreParam& score_param);

// Extend a match between s1 and s2 into a full overlap using banded dynamic programming.
// start_1/start_2 give the starting positions of the current partial alignment. These coordinates
// are used to estimate where the overlap begins. The estimated alignment is refined by calculating
Expand Down
4 changes: 4 additions & 0 deletions clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,10 @@ Deletion ForwardBClip::call(FaidxWrapper &faidx, const std::vector<TargetRegion>
int start2 = delta > 0 ? leftBp + delta : leftBp;
int end1 = delta > 0 ? rightBp : rightBp + delta;
int end2 = delta > 0 ? rightBp + delta : rightBp;
// if (start2 == 27688481) {
// cout << sequence << endl;
// cout << (*it).sequence(faidx) << endl;
// }
if (len > Helper::SVLEN_THRESHOLD) continue;
return Deletion((*it).referenceName, start1, start2, end1, end2, len, getType());
}
Expand Down
23 changes: 15 additions & 8 deletions main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,12 @@ _INITIALIZE_EASYLOGGINGPP
// Main
//
int main(int argc, char *argv[]) {
LINFO << "This is my first log";

/*
SequenceOverlap result = Overlapper::ageAlign("ACGGGGACT", "ACGTTACT", ScoreParam(1, -1, 2, 4));
LINFO << result;
return 0;
*/

parseOptions(argc, argv);

Expand Down Expand Up @@ -207,15 +212,17 @@ int main(int argc, char *argv[]) {
std::sort(deletions.begin(), deletions.end());
deletions.erase(std::unique(deletions.begin(), deletions.end()), deletions.end());

std::vector<std::vector<Deletion> > delClusters;
// std::vector<std::vector<Deletion> > delClusters;

cluster(deletions, delClusters,
[](const Deletion& d1, const Deletion& d2){ return d1.overlaps(d2); });
// cluster(deletions, delClusters,
// [](const Deletion& d1, const Deletion& d2){ return d1.overlaps(d2); });

std::vector<Deletion> finalDels;
finalDels.reserve(delClusters.size());
for (auto &clu: delClusters) {
finalDels.push_back(clu[0]);
merge(deletions, finalDels,
[](const Deletion& d1, const Deletion& d2){ return d1.overlaps(d2); });
// finalDels.reserve(delClusters.size());
// for (auto &clu: delClusters) {
// finalDels.push_back(clu[0]);
/*
if (clu.size() == 1) finalDels.push_back(clu[0]);
else {
Expand All @@ -229,7 +236,7 @@ int main(int argc, char *argv[]) {
finalDels.push_back(d);
}
*/
}
// }
delete pTimer;

output(opt::outFile, finalDels);
Expand Down

0 comments on commit aed7b2b

Please sign in to comment.