Skip to content

Commit

Permalink
Avoid calling events in certain N stretches
Browse files Browse the repository at this point in the history
  • Loading branch information
fritzsedlazeck committed Aug 3, 2017
1 parent e1a8248 commit bc31857
Show file tree
Hide file tree
Showing 8 changed files with 136 additions and 51 deletions.
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ project(Sniffles)
set( SNIF_VERSION_MINOR 0 )
IF(CMAKE_BUILD_TYPE STREQUAL "Debug")
message(STATUS "Building in debug mode!")
set( SNIF_VERSION_BUILD 5-debug )
set( SNIF_VERSION_BUILD 6-debug )
else()
set( SNIF_VERSION_BUILD 5 )
set( SNIF_VERSION_BUILD 6 )
ENDIF()


Expand Down
1 change: 0 additions & 1 deletion lib/bamtools
Submodule bamtools deleted from 2d7685
102 changes: 79 additions & 23 deletions src/Alignment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -470,7 +470,8 @@ void Alignment::get_coords(aln_str tmp, int & start, int &stop) {
}
void Alignment::check_entries(vector<aln_str> &entries) {

bool flag = (strcmp(this->getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0);
bool flag =(strcmp(this->getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0);

if (flag) {
std::cout << "Nested? " << std::endl;
for (size_t i = 0; i < entries.size(); i++) {
Expand All @@ -480,16 +481,34 @@ void Alignment::check_entries(vector<aln_str> &entries) {
} else {
std::cout << "- ";
}
//sort_insert_ref(entries[i], new_entries);
}
std::cout << std::endl;

}

int chr = entries[0].RefID;
bool strand = entries[0].strand;
int strands = 1;
int valid = 1;
double read_gaps = 0;
double ref_gaps = 0;

int ref_size = 0;
int read_size = 0;

for (size_t i = 1; i < entries.size(); i++) {
if (entries[i].read_pos_stop - entries[i].read_pos_start > 200) { //only consider segments > 200bp.
valid++;
ref_size = min((int) abs((entries[i - 1].pos + entries[i - 1].length) - entries[i].pos), (int) abs(entries[i - 1].pos - (entries[i].pos + entries[i].length)));

read_size = abs(entries[i - 1].read_pos_stop - entries[i].read_pos_start);
if (abs(ref_size - read_size) > Parameter::Instance()->min_length) {
valid++;
}
if(flag){
cout<<"Read: "<<read_size << " Ref: "<<ref_size<< " "<<this->getName()<<std::endl;
}

if (chr != entries[i].RefID) {
return;
}
Expand All @@ -499,10 +518,14 @@ void Alignment::check_entries(vector<aln_str> &entries) {
}
}
}

if (flag) {
std::cout << "summary: " << strands << " " << valid << std::endl;
std::cout << "summary: " << strands << " " << valid << " " << std::endl;
}
if (strands < 3 || valid < 3) {
if (strands < 3 || valid < 2 ) { //check!
if (flag) {
std::cout << "Return" << std::endl;
}
return;
}

Expand All @@ -517,6 +540,9 @@ void Alignment::check_entries(vector<aln_str> &entries) {
read_dist = abs(entries[i - 1].read_pos_stop - entries[i].read_pos_start);
}

if (flag) {
std::cout << "REF DIST: " << ref_dist << " READ DIST: " << read_dist << std::endl;
}
if (abs(entries[i - 1].pos - entries[i].pos) < 100) { //inv dup:
aln_str tmp;
tmp.RefID = entries[i].RefID;
Expand Down Expand Up @@ -665,19 +691,21 @@ void Alignment::check_entries(vector<aln_str> &entries) {
entries = new_entries;
}
}*/

void Alignment::sort_insert_ref(aln_str tmp, vector<aln_str> &entries) {

for (vector<aln_str>::iterator i = entries.begin(); i != entries.end(); i++) {
if ((tmp.pos < (*i).pos)) { //insert before
entries.insert(i, tmp);
return;
}
}
entries.push_back(tmp);
}

void Alignment::sort_insert(aln_str tmp, vector<aln_str> &entries) {

for (vector<aln_str>::iterator i = entries.begin(); i != entries.end(); i++) {
/* if ((tmp.read_pos_start == (*i).read_pos_start) && (tmp.pos == (*i).pos) && (tmp.strand == (*i).strand)) { //is the same! should not happen
return;
}
if (!tmp.cigar.empty() && ((*i).read_pos_start <= tmp.read_pos_start && (*i).read_pos_stop >= tmp.read_pos_stop)) { //check for the additional introducded entries
return;
}
if (!tmp.cigar.empty() && ((*i).read_pos_start >= tmp.read_pos_start && (*i).read_pos_stop <= tmp.read_pos_stop)) { //check for the additional introducded entries
(*i) = tmp;
return;
}*/
if ((tmp.read_pos_start < (*i).read_pos_start)) { //insert before
entries.insert(i, tmp);
return;
Expand Down Expand Up @@ -708,6 +736,9 @@ vector<aln_str> Alignment::getSA(RefVector ref) {
tmp.mq = this->getMappingQual();
tmp.pos = (long) this->getPosition(); //+get_ref_lengths(tmp.RefID, ref);
tmp.strand = getStrand();
uint32_t sv;
al->GetTag("SV", sv);
tmp.cross_N = ((sv & Ns_CLIPPED));
bool flag = strcmp(getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0;

get_coords(tmp, tmp.read_pos_start, tmp.read_pos_stop);
Expand All @@ -721,10 +752,6 @@ vector<aln_str> Alignment::getSA(RefVector ref) {
std::string cigar;
std::string chr;
bool nested = true;

uint32_t sv;
al->GetTag("SV", sv);

while (i < sa.size()) {
if (count == 0 && sa[i] != ',') {
chr += sa[i];
Expand Down Expand Up @@ -769,6 +796,9 @@ vector<aln_str> Alignment::getSA(RefVector ref) {
//insert sorted:
includes_SV = true;
sort_insert(tmp, entries);

//al->GetTag("SV", sv); <-get that involved

} else if (tmp.mq < Parameter::Instance()->min_mq) {
nested = false;
} else { //Ignore read due to too many splits
Expand Down Expand Up @@ -1050,7 +1080,7 @@ vector<str_event> Alignment::get_events_MD(int min_mis) {
return events;
}

vector<int> Alignment::get_avg_diff(double & dist) {
vector<int> Alignment::get_avg_diff(double & dist, double & avg_del, double & avg_ins) {

//computeAlignment();
//cout<<alignment.first<<endl;
Expand All @@ -1065,10 +1095,15 @@ vector<int> Alignment::get_avg_diff(double & dist) {
PlaneSweep_slim * plane = new PlaneSweep_slim();
int min_tresh = 5; //reflects a 10% error rate.
//compute the profile of differences:
double del = 0;
double ins = 0;
double mis = 0;
double length = event_aln[event_aln.size() - 1].position - event_aln[0].position;
for (size_t i = 0; i < event_aln.size(); i++) {
if (i != 0) {
dist += event_aln[i].position - event_aln[i - 1].position;
}

pair_str tmp;
tmp.position = -1;
if (event_aln[i].type == 0) {
Expand All @@ -1079,7 +1114,16 @@ vector<int> Alignment::get_avg_diff(double & dist) {
if (tmp.position != -1) { //check that its not the prev event!
mis_per_window.push_back(tmp.coverage); //store #mismatch per window each time it exceeds. (which might be every event position!)
}
if (event_aln[i].type > 0) {
avg_del += event_aln[i].type;
} else if (event_aln[i].type < 0) {
avg_ins += event_aln[i].type * -1;
}
}

avg_ins = avg_ins / length;
avg_del = avg_del / length;

dist = dist / (double) event_aln.size();
plane->finalyze();
return mis_per_window; //total_num /num;
Expand All @@ -1101,6 +1145,12 @@ vector<str_event> Alignment::get_events_Aln() {
vector<pair_str> profile;
// comp_aln = clock();

bool is_N_region = false;
uint32_t sv;
if (al->GetTag("SV", sv) && (!(sv & Ns_CLIPPED) && !(sv & FULLY_EXPLAINED))) {
is_N_region = true;
}

int noise_events = 0;
//compute the profile of differences:
for (size_t i = 0; i < event_aln.size(); i++) {
Expand All @@ -1122,7 +1172,6 @@ vector<str_event> Alignment::get_events_Aln() {
}
}


//comp_aln = clock();
int stop = 0;
size_t start = 0;
Expand Down Expand Up @@ -1197,9 +1246,6 @@ vector<str_event> Alignment::get_events_Aln() {
std::cout << "check total len: " << start << " " << stop << std::endl;
}
for (size_t k = start; k <= stop; k++) {
// if (flag) {
// cout << event_aln[k].position << " " << event_aln[k].type << endl;
// }
if (event_aln[k].type == 0) {
mismatch++;
} else if (event_aln[k].type > 0) {
Expand Down Expand Up @@ -1237,13 +1283,19 @@ vector<str_event> Alignment::get_events_Aln() {
tmp.pos = insert_max_pos;
tmp.type |= INS;
tmp.is_noise = false;
if (is_N_region && insert_max * Parameter::Instance()->avg_ins < Parameter::Instance()->min_length) {
tmp.type = 0;
}
} else if (del_max > Parameter::Instance()->min_length && (insert + insert) < del) { //deletion
if (flag) {
cout << "store DEL " << this->getName() << endl;
}
tmp.length = del_max;
tmp.type |= DEL;
tmp.is_noise = false;
if (is_N_region && del_max * Parameter::Instance()->avg_del < Parameter::Instance()->min_length) {
tmp.type = 0;
}
} else if ((mismatch + del + insert) / 2 > Parameter::Instance()->min_length) { //TODO
if (flag) {
cout << "store Noise" << endl;
Expand All @@ -1252,8 +1304,12 @@ vector<str_event> Alignment::get_events_Aln() {
tmp.type |= DEL;
tmp.type |= INV;
tmp.is_noise = true;
if (is_N_region || ((del_max> Parameter::Instance()->min_length && insert_max > Parameter::Instance()->min_length) && (del_max/insert_max)<Parameter::Instance()->min_length)) {
tmp.type = 0;
}
} else {
// std::cout<<"ERROR we did not set the type!"<<std::endl;
tmp.type = 0;
}

if (flag) {
Expand Down
5 changes: 4 additions & 1 deletion src/Alignment.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ struct aln_str{
long length;
int read_pos_start;
int read_pos_stop;
bool cross_N;
};

class Alignment {
Expand All @@ -75,6 +76,8 @@ class Alignment {
int get_id(RefVector ref, std::string chr);
vector<differences_str> summarizeAlignment();
void sort_insert(aln_str tmp, vector<aln_str> &entries);

void sort_insert_ref(aln_str tmp, vector<aln_str> &entries);
void check_entries(vector<aln_str> &entries);
bool overlapping_segments(vector<aln_str> entries);
public:
Expand Down Expand Up @@ -133,7 +136,7 @@ class Alignment {
double get_scrore_ratio();
std::string get_md();
double get_avg_indel_length_Cigar();
vector<int> get_avg_diff(double & dist);
vector<int> get_avg_diff(double & dist,double & avg_del, double & avg_len);

};

Expand Down
4 changes: 3 additions & 1 deletion src/Paramer.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class Parameter {
private:
Parameter() {
window_thresh=10;//TODO check!
version="1.0.3";
version="1.0.6";
huge_ins = 2000;//TODO check??
}
~Parameter() {
Expand All @@ -49,6 +49,8 @@ class Parameter {
double error_rate;
double score_treshold;
double min_allelel_frequency;
double avg_del;
double avg_ins;
//double min_num_mismatches;

int window_thresh;
Expand Down
8 changes: 6 additions & 2 deletions src/Sniffles.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@

//TODO:
// strand bias??
// I think you could make your performance on PacBio reads even better with a few modifications:
//a. The "snake" read that you show in Supplementary Figure 2.5 which limits calling of small inverted duplications likely results from an improperly constructed SMRTbell (see attached) that has adapters missing on one end. The PacBio library prep reordered a few steps (years ago now) to prevent formation of these types of molecules. So, hopefully it is no longer an issue. Even with old data, you should be able to call small inverted duplications by counting support of ZMWs not just subreads. It is improbable the same ligation error happens in two separate molecules. The ZMW vs subread counting is much like the PCR duplicate marking used in processing short read data.
//b. In pbsv, I use a simply mononucleotide consistency check to determine whether to cluster insertions from different reads as supporting the "same" events. In addition to looking at the similarity of length and breakpoints, you could measure [min(Act)+min(Cct)+min(Gct)+min(Tct) / max(Act)+max(Cct)+max(Gct)+max(Tct)] Even a lax criterion (>0.25) can avoid clustering phantom insertions (where one is say all A and the another is G+T).


Parameter* Parameter::m_pInstance = NULL;

Expand All @@ -48,7 +52,7 @@ void read_parameters(int argc, char *argv[]) {
TCLAP::ValueArg<std::string> arg_tmp_file("", "tmp_file", "path to temporary file otherwise Sniffles will use the current directory.", false, "", "string");
TCLAP::SwitchArg arg_genotype("", "genotype", "Enables Sniffles to compute the genotypes.", cmd, false);
TCLAP::SwitchArg arg_cluster("", "cluster", "Enables Sniffles to phase SVs that occur on the same reads", cmd, false);
TCLAP::SwitchArg arg_std("", "ignore_std", "Ignores the std based filtering. (default: false)", cmd, false);
TCLAP::SwitchArg arg_std("", "ignore_sd", "Ignores the sd based filtering. (default: false)", cmd, false);
TCLAP::SwitchArg arg_bnd("", "report_BND", "Report BND instead of Tra in vcf output. (default: false)", cmd, false);
TCLAP::ValueArg<int> arg_cluster_supp("", "cluster_support", "Minimum number of reads supporting clustering of SV. Default: 1", false, 1, "int");
TCLAP::ValueArg<float> arg_allelefreq("f", "allelefreq", "Threshold on allele frequency (0-1).", false, 0.0, "float");
Expand All @@ -73,7 +77,7 @@ void read_parameters(int argc, char *argv[]) {

Parameter::Instance()->debug = true;
Parameter::Instance()->score_treshold = 10;
Parameter::Instance()->read_name = "m151102_061230_42286_c100922632550000001823194205121664_s1_p0/86232/0_9276";//m151102_123142_42286_c100922632550000001823194205121665_s1_p0/80643/0_20394"; //"22_36746138"; //just for debuging reasons!
Parameter::Instance()->read_name = "21_43213620_-";//21_16296949_+";//21_40181680_-";//m151102_123142_42286_c100922632550000001823194205121665_s1_p0/80643/0_20394"; //"22_36746138"; //just for debuging reasons!
Parameter::Instance()->bam_files.push_back(arg_bamfile.getValue());
Parameter::Instance()->min_mq = arg_mq.getValue();
Parameter::Instance()->output_vcf = arg_vcf.getValue();
Expand Down
6 changes: 3 additions & 3 deletions src/Version.h
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
#ifndef VERSION_H
#define VERSION_H

#define VERSION_MAJOR ""
#define VERSION_MINOR ""
#define VERSION_BUILD ""
#define VERSION_MAJOR "1"
#define VERSION_MINOR "0"
#define VERSION_BUILD "6"

#endif // VERSION_H

Loading

0 comments on commit bc31857

Please sign in to comment.