diff --git a/bin/diamond b/bin/diamond index 79c3c03e..0ae20698 100755 Binary files a/bin/diamond and b/bin/diamond differ diff --git a/src/ChangeLog b/src/ChangeLog index c58ddbdf..fae77fe0 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,9 @@ +[0.7.8] +- fixed a bug that could produce an incorrect sort order of HSPs + +[0.7.7] +- fixed a number formatting error in the SAM output format + [0.7.6] - fixed a formatting error in CIGAR strings diff --git a/src/align/align_queries.h b/src/align/align_queries.h index f28cb1da..702b113f 100644 --- a/src/align/align_queries.h +++ b/src/align/align_queries.h @@ -107,9 +107,9 @@ void align_queries(const Trace_pt_buffer<_locr,_locl> &trace_pts, Output_stream* log_stream << "Processing query bin " << bin+1 << '/' << trace_pts.bins() << '\n'; task_timer timer ("Loading trace points", false); trace_pts.load(v, bin); - v.init(); timer.go("Sorting trace points"); merge_sort(v.begin(), v.end(), program_options::threads()); + v.init(); timer.go("Computing alignments"); if(ref_header.n_blocks > 1) { Align_context<_val,_locr,_locl,Temp_output_buffer<_val> > context (v, output_file); diff --git a/src/basic/const.h b/src/basic/const.h index b698df36..941773e0 100644 --- a/src/basic/const.h +++ b/src/basic/const.h @@ -25,7 +25,7 @@ struct Const { enum { - build_version = 56, + build_version = 57, build_compatibility = 52, daa_version = 0, seedp_bits = 10, @@ -46,7 +46,7 @@ struct Const }; -const char* Const::version_string = "0.7.7"; +const char* Const::version_string = "0.7.8"; const char* Const::program_name = "diamond"; const char* Const::id_delimiters = " \a\b\f\n\r\t\v"; diff --git a/src/basic/options.cpp b/src/basic/options.cpp index 96fa122a..e8d0e6cc 100644 --- a/src/basic/options.cpp +++ b/src/basic/options.cpp @@ -82,6 +82,7 @@ double toppercent; string daa_file; string output_format; bool forwardonly; +unsigned fetch_size; Aligner_mode aligner_mode; Command command; diff --git a/src/basic/options.h b/src/basic/options.h index 49a168d7..71f3273c 100644 --- a/src/basic/options.h +++ b/src/basic/options.h @@ -83,6 +83,7 @@ namespace program_options extern string output_format; extern string output_file; extern bool forwardonly; + extern unsigned fetch_size; typedef enum { fast=0, sensitive=1, very_sensitive=2 } Aligner_mode; extern Aligner_mode aligner_mode; diff --git a/src/basic/setup.h b/src/basic/setup.h index d105387d..bcd1e1fd 100644 --- a/src/basic/setup.h +++ b/src/basic/setup.h @@ -129,15 +129,10 @@ void setup_search_params(pair query_len_bounds, size_t chunk_db_l const double b = po::min_bit_score == 0 ? score_matrix::get().bitscore(po::max_evalue, ref_header.letters, query_len_bounds.first) : po::min_bit_score; - if(query_len_bounds.second <= 40) { - po::set_option(po::min_identities, 10u); - po::set_option(po::min_ungapped_raw_score, score_matrix::get().rawscore(std::min(27.0, b))); - } else { - po::set_option(po::min_identities, 9u); - po::set_option(po::min_ungapped_raw_score, score_matrix::get().rawscore(std::min(23.0, b))); - } + po::set_option(po::min_identities, 18u); + po::min_ungapped_raw_score = score_matrix::get().rawscore(std::min(po::min_ungapped_raw_score == 0 ? 19.0 : po::min_ungapped_raw_score, b)); - if(query_len_bounds.second <= 80) { + if(query_len_bounds.second <= 128) { const int band = po::read_padding<_val>(query_len_bounds.second); po::set_option(po::window, (unsigned)(query_len_bounds.second + band)); po::set_option(po::hit_band, band); @@ -145,9 +140,10 @@ void setup_search_params(pair query_len_bounds, size_t chunk_db_l } else { po::set_option(po::window, 40u); po::set_option(po::hit_band, 5); - po::set_option(po::min_hit_score, score_matrix::get().rawscore(std::min(29.0, b))); + po::min_hit_score = score_matrix::get().rawscore(std::min(po::min_hit_score == 0 ? 19.0 : po::min_hit_score, b)); } log_stream << "Query len bounds " << query_len_bounds.first << ' ' << query_len_bounds.second << endl; + log_stream << "Minimum bit score = " << b << endl; log_stream << "Search parameters " << po::min_ungapped_raw_score << ' ' << po::min_hit_score << ' ' << po::hit_cap << endl; } diff --git a/src/basic/shape.h b/src/basic/shape.h index b8f2cce8..3ea3bff6 100644 --- a/src/basic/shape.h +++ b/src/basic/shape.h @@ -97,6 +97,7 @@ struct shape unsigned r = Reduction<_val>::reduction(l); f += background_freq[r]; s *= Reduction<_val>::reduction.size(); + //s *= 5; s += uint64_t(r); } if(use_seed_freq<_val>() && f > program_options::max_seed_freq) return false; diff --git a/src/main.cpp b/src/main.cpp index 209195aa..9aa5ba84 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -105,12 +105,14 @@ int main(int ac, const char* av[]) ("window,w", po::value(&program_options::window)->default_value(0), "window size for local hit search") ("xdrop", po::value(&program_options::xdrop)->default_value(20), "xdrop for ungapped alignment") ("gapped-xdrop,X", po::value(&program_options::gapped_xdrop)->default_value(20), "xdrop for gapped alignment in bits") - ("ungapped-score", po::value(&program_options::min_ungapped_raw_score)->default_value(0), "minimum ungapped raw alignment score to continue local extension") + ("ungapped-score", po::value(&program_options::min_ungapped_raw_score)->default_value(0), "minimum raw alignment score to continue local extension") ("hit-band", po::value(&program_options::hit_band)->default_value(0), "band for hit verification") ("hit-score", po::value(&program_options::min_hit_score)->default_value(0), "minimum score to keep a tentative alignment") ("band", po::value(&program_options::padding)->default_value(0), "band for dynamic programming computation") ("shapes,s", po::value(&program_options::shapes)->default_value(0), "number of seed shapes (0 = all available)") - ("index-mode", po::value(&program_options::index_mode)->default_value(0), "index mode (1=4x12, 2=16x9)"); + ("index-mode", po::value(&program_options::index_mode)->default_value(0), "index mode (1=4x12, 2=16x9)") + ("fetch-size", po::value(&program_options::fetch_size)->default_value(4096), "trace point fetch size") + ; //("no-traceback,r", "disable alignment traceback"); //("compress-temp", po::value(&program_options::compress_temp)->default_value(0), "compression for temporary output files (0=none, 1=gzip)"); @@ -168,10 +170,14 @@ int main(int ac, const char* av[]) } else if (program_options::command == program_options::makedb && vm.count("in") && vm.count("db")) { if(vm.count("block-size") == 0) program_options::chunk_size = 2; +#ifdef EXTRA if(program_options::db_type == "nucl") make_db(Nucleotide()); - else + else if(program_options::db_type == "prot") make_db(Amino_acid()); +#else + make_db(Amino_acid()); +#endif } else if ((program_options::command == program_options::blastp || program_options::command == program_options::blastx #ifdef EXTRA diff --git a/src/output/join_blocks.h b/src/output/join_blocks.h index 76a03529..9e85d3b7 100644 --- a/src/output/join_blocks.h +++ b/src/output/join_blocks.h @@ -37,7 +37,7 @@ void join_blocks(unsigned ref_blocks, DAA_output &master_out, const vectornext(r, std::numeric_limits::max())) + if(files.back()->next(r, std::numeric_limits::max(), std::numeric_limits::max())) records.push_back(r); } std::make_heap(records.begin(), records.end()); @@ -63,7 +63,7 @@ void join_blocks(unsigned ref_blocks, DAA_output &master_out, const vector 0 && b == block && next.info_.subject_id == subject; if(program_options::output_range(n_target_seq, next.info_.score, top_score) || same_subject) { - //printf("q=%u s=%u n=%u ss=%u\n",query, next.info_.subject_id, n_target_seq, same_subject); + //printf("q=%u s=%u n=%u ss=%u\n",query, next.info_.subject_id, n_target_seq, same_subject, next.info_.score); DAA_output::write_record(buf, next.info_); statistics.inc(Statistics::MATCHES); if(!same_subject) { @@ -76,7 +76,7 @@ void join_blocks(unsigned ref_blocks, DAA_output &master_out, const vectornext(r, subject)) { + if(files[b]->next(r, subject, query)) { records.push_back(r); std::push_heap(records.begin(), records.end()); } diff --git a/src/output/output_file.h b/src/output/output_file.h index e61128b8..5c192109 100644 --- a/src/output/output_file.h +++ b/src/output/output_file.h @@ -35,16 +35,18 @@ struct Block_output : public Buffered_file bool same_subject_; Intermediate_record info_; bool operator<(const Iterator &rhs) const - { return info_.query_id > rhs.info_.query_id || (info_.query_id == rhs.info_.query_id && (!same_subject_ || info_.score < rhs.info_.score)); } + { return info_.query_id > rhs.info_.query_id || + (info_.query_id == rhs.info_.query_id && (rhs.same_subject_ || + (!rhs.same_subject_ && info_.score < rhs.info_.score))); } }; - bool next(Iterator &it, unsigned subject) + bool next(Iterator &it, unsigned subject, unsigned query) { if(this->eof()) return false; it.info_.read(*this); it.block_ = block_; - it.same_subject_ = it.info_.subject_id == subject; + it.same_subject_ = it.info_.subject_id == subject && it.info_.query_id == query; return true; } diff --git a/src/search/trace_pt_buffer.h b/src/search/trace_pt_buffer.h index c6c38811..329038ae 100644 --- a/src/search/trace_pt_buffer.h +++ b/src/search/trace_pt_buffer.h @@ -44,6 +44,34 @@ struct Trace_pt_list : public vector > void init() { pos_ = this->begin(); + p_.clear(); + p_.push_back(0); + idx_ = 0; + const unsigned c = query_contexts(); + /*typename vector >::iterator i = this->begin()+program_options::fetch_size; + for(; i < this->end(); i=std::min(i+program_options::fetch_size, this->end())) { + const unsigned q = i->query_/c; + for(; iend() && i->query_/c == q; ++i); + //printf("%lu %u %u\n", i-this->begin(), i->query_/c, q); + //printf("%lu\n",i - this->begin()); + //std::terminate(); + p_.push_back(i - this->begin()); + }*/ + /*typename vector >::iterator i = this->begin(); + unsigned total=0,count=1; + for(; i < this->end();) { + unsigned n=0; + const unsigned min_size = 4*total/count/5 + 1; + for(;iend() && nquery_/c; + for(; iend() && i->query_/c == q; ++i) + ++n; + } + ++count; + total += n; + p_.push_back(i - this->begin()); + } + p_.push_back(i - this->begin());*/ } struct Query_range { @@ -52,15 +80,28 @@ struct Trace_pt_list : public vector > { } bool operator()() { + begin = parent_.pos_; - end = std::min(begin + 4096, parent_.end()); + //end = std::min(begin + 4096, parent_.end()); + end = std::min(begin + program_options::fetch_size, parent_.end()); if(end >= parent_.end()) return false; const unsigned c = query_contexts(), q = end->query_/c; for(; endquery_/c == q; ++end); + //printf("%lu %u %u\n", end-parent_.begin(), end->query_/c, q); parent_.pos_ = end; + //printf("%lu\n",end - parent_.begin()); + //printf("%lu %lu\n",begin-parent_.begin(),end-parent_.begin()); return end < parent_.end(); } + /*bool operator()() + { + begin = parent_.begin()+parent_.p_[parent_.idx_]; + end = parent_.begin()+parent_.p_[parent_.idx_+1]; + printf("%lu %lu %lu\n", parent_.p_[parent_.idx_], parent_.p_[parent_.idx_+1], parent_.p_[parent_.idx_+1]-parent_.p_[parent_.idx_]); + ++parent_.idx_; + return parent_.idx_ < parent_.p_.size()-1; + }*/ typename Trace_pt_list::iterator begin, end; private: Trace_pt_list &parent_; @@ -69,6 +110,9 @@ struct Trace_pt_list : public vector > { return Query_range (*this); } private: typename vector >::iterator pos_; + vector p_; + unsigned idx_; }; #endif /* TRACE_PT_BUFFER_H_ */ +