diff --git a/CMakeLists.txt b/CMakeLists.txt index 4f5ae18c..6346ac04 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -64,6 +64,7 @@ add_executable(diamond src/run/main.cpp src/run/mapper.cpp src/data/count_approximate.cpp src/data/index.cpp + src/data/frequent_seeds.cpp ) target_link_libraries(diamond ${ZLIB_LIBRARY} ${CMAKE_THREAD_LIBS_INIT}) diff --git a/build_simple.sh b/build_simple.sh index 1299b112..49979af2 100755 --- a/build_simple.sh +++ b/build_simple.sh @@ -32,4 +32,5 @@ g++ -DNDEBUG -O3 -mssse3 -Wno-deprecated-declarations -std=gnu++98 -static \ src/run/mapper.cpp \ src/data/count_approximate.cpp \ src/data/index.cpp \ + src/data/frequent_seeds.cpp \ -lz -lpthread -o diamond diff --git a/src/ChangeLog b/src/ChangeLog index 41dc4d32..34bd3cf0 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,6 @@ +[0.8.13] +- fixed a compiler error for GCC 4.1 and 4.2 + [0.8.12] - changed database format - block size parameter is set only for the alignment commands diff --git a/src/basic/basic.cpp b/src/basic/basic.cpp index ad2c362f..9c385a6d 100644 --- a/src/basic/basic.cpp +++ b/src/basic/basic.cpp @@ -23,7 +23,7 @@ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR P #include "statistics.h" #include "sequence.h" -const char* Const::version_string = "0.8.12"; +const char* Const::version_string = "0.8.13"; const char* Const::program_name = "diamond"; const char* Const::id_delimiters = " \a\b\f\n\r\t\v"; diff --git a/src/basic/config.cpp b/src/basic/config.cpp index 7c86412d..93f2d1e5 100644 --- a/src/basic/config.cpp +++ b/src/basic/config.cpp @@ -92,6 +92,7 @@ Config::Config(int argc, const char **argv) ("seed-freq", 0, "maximum seed frequency", max_seed_freq, -15.0) ("run-len",'l', "mask runs between stop codons shorter than this length", run_len) ("max-hits",'C', "maximum number of hits to consider for one seed", hit_cap) + ("freq-sd", 0, "number of standard deviations for ignoring frequent seeds", freq_sd, 25.0) ("id2", 0, "minimum number of identities for stage 1 hit", min_identities) ("window", 'w', "window size for local hit search", window) ("xdrop", 'x', "xdrop for ungapped alignment", xdrop, 20) @@ -120,7 +121,8 @@ Config::Config(int argc, const char **argv) ("local-align", 0, "Local alignment algorithm", local_align_mode, 0u) ("slow-search", 0, "", slow_search) ("seq", 0, "", seq_no) - ("ht", 0, "", ht_mode); + ("ht", 0, "", ht_mode) + ("simple-freq", 0, "", simple_freq); #ifdef EXTRA ("match1", po::value(&program_options::match_file1)) diff --git a/src/basic/config.h b/src/basic/config.h index 89fb7898..1be21fbd 100644 --- a/src/basic/config.h +++ b/src/basic/config.h @@ -93,6 +93,8 @@ struct Config double rank_factor; double rank_ratio; bool ht_mode; + bool simple_freq; + double freq_sd; enum { makedb = 0, blastp = 1, blastx = 2, view = 3, help = 4, version = 5, getseq = 6, benchmark = 7, random_seqs = 8 }; unsigned command; diff --git a/src/basic/const.h b/src/basic/const.h index a64d4577..e5ca0b38 100644 --- a/src/basic/const.h +++ b/src/basic/const.h @@ -23,7 +23,7 @@ struct Const { enum { - build_version = 74, + build_version = 75, daa_version = 0, seedp_bits = 10, seedp = 1<(static_cast(counts[i]) * 1.3), static_cast(counts[i] + 1))); pos_filters[sid][i] = auto_ptr (new filter_table(ht_size)); diff --git a/src/data/frequent_seeds.cpp b/src/data/frequent_seeds.cpp new file mode 100644 index 00000000..c669a7fb --- /dev/null +++ b/src/data/frequent_seeds.cpp @@ -0,0 +1,81 @@ +/**** +Copyright (c) 2016, Benjamin Buchfink +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +****/ + +#include +#include "frequent_seeds.h" +#include "sorted_list.h" + +const double Frequent_seeds::hash_table_factor = 1.3; +Frequent_seeds frequent_seeds; + +struct Frequent_seeds::Build_context +{ + Build_context(const sorted_list &ref_idx, const sorted_list &query_idx, unsigned sid, vector &counts) : + ref_idx(ref_idx), + query_idx(query_idx), + sid(sid), + counts(counts) + { } + void operator()(unsigned thread_id, unsigned seedp) + { + Sd ref_sd, query_sd; + Merge_iterator it(ref_idx.get_partition_cbegin(seedp), query_idx.get_partition_cbegin(seedp)); + while(it.next()) { + ref_sd.add((double)it.i.n); + query_sd.add((double)it.j.n); + ++it; + } + + const unsigned max_n = (unsigned)(ref_sd.mean() + config.freq_sd*ref_sd.sd()); + //cout << "mean=" << A << " sd=" << sd << endl; + + size_t n = 0; + vector buf; + sorted_list::iterator j = ref_idx.get_partition_begin(seedp); + while (!j.at_end()) { + if (j.n > max_n) { + j.get(0)->value = 0; + n += (unsigned)j.n; + buf.push_back(j.key()); + } + ++j; + } + + const size_t ht_size = std::max((size_t)(buf.size() * hash_table_factor), buf.size() + 1); + frequent_seeds.tables_[sid][seedp] = PHash_set(ht_size); + + for (vector::const_iterator i = buf.begin(); i != buf.end(); ++i) + frequent_seeds.tables_[sid][seedp].insert(*i); + + counts[seedp] = (unsigned)n; + } + const sorted_list &ref_idx; + const sorted_list &query_idx; + const unsigned sid; + vector &counts; +}; + +void Frequent_seeds::build(unsigned sid, const seedp_range &range, sorted_list &ref_idx, const sorted_list &query_idx) +{ + task_timer timer("Finding high frequency seeds", 3); + vector counts(Const::seedp); + Build_context build_context(ref_idx, query_idx, sid, counts); + launch_scheduled_thread_pool(build_context, Const::seedp, config.threads_); + timer.finish(); + log_stream << "Masked positions = " << std::accumulate(counts.begin(), counts.end(), 0) << std::endl; +} \ No newline at end of file diff --git a/src/data/frequent_seeds.h b/src/data/frequent_seeds.h new file mode 100644 index 00000000..2876ce7d --- /dev/null +++ b/src/data/frequent_seeds.h @@ -0,0 +1,50 @@ +/**** +Copyright (c) 2016, Benjamin Buchfink +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +****/ + +#ifndef FREQUENT_SEEDS_H_ +#define FREQUENT_SEEDS_H_ + +#include "../basic/const.h" +#include "../util/hash_table.h" +#include "sorted_list.h" + +struct Frequent_seeds +{ + + void build(unsigned sid, const seedp_range &range, sorted_list &ref_idx, const sorted_list &query_idx); + + bool get(const Letter *pos, unsigned sid) const + { + Packed_seed seed; + shapes[sid].set_seed(seed, pos); + return tables_[sid][seed_partition(seed)].contains(seed_partition_offset(seed)); + } + +private: + + static const double hash_table_factor; + + struct Build_context; + + PHash_set tables_[Const::max_shapes][Const::seedp]; + +}; + +extern Frequent_seeds frequent_seeds; + +#endif \ No newline at end of file diff --git a/src/data/sequence_set.h b/src/data/sequence_set.h index ca991cab..93f515f5 100644 --- a/src/data/sequence_set.h +++ b/src/data/sequence_set.h @@ -39,7 +39,7 @@ struct Sequence_set : public String_set<'\xff', 1> { } Sequence_set(Input_stream &file) : - String_set(file) + String_set<'\xff',1>(file) { } void print_stats() const diff --git a/src/data/sorted_list.h b/src/data/sorted_list.h index 11047d6e..a52d5128 100644 --- a/src/data/sorted_list.h +++ b/src/data/sorted_list.h @@ -234,6 +234,35 @@ struct sorted_list }; +template +struct Merge_iterator +{ + Merge_iterator(const _it &i, const _it& j): + i(i), + j(j) + {} + bool next() + { + while (!i.at_end() && !j.at_end()) { + if (i.key() < j.key()) { + ++i; + } + else if (j.key() < i.key()) { + ++j; + } + else + return true; + } + return false; + } + void operator++() + { + ++i; + ++j; + } + _it i, j; +}; + #pragma pack() #endif /* SORTED_LIST_H_ */ diff --git a/src/run/master_thread.h b/src/run/master_thread.h index 7eb5a072..c9bf89be 100644 --- a/src/run/master_thread.h +++ b/src/run/master_thread.h @@ -32,6 +32,7 @@ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR P #include "../data/load_seqs.h" #include "../search/setup.h" #include "../output/output_format.h" +#include "../data/frequent_seeds.h" using std::endl; using std::cout; @@ -81,8 +82,7 @@ void process_shape(unsigned sid, ref_hst.get(sid), range, ref_hst.partition()); - ref_seqs::get_nc().build_masking(sid, range, ref_idx); - + timer.go("Building query index"); timer_mapping.resume(); sorted_list query_idx(query_buffer, @@ -93,6 +93,11 @@ void process_shape(unsigned sid, query_hst.partition()); timer.finish(); + if (config.simple_freq) + frequent_seeds.build(sid, range, ref_idx, query_idx); + else + ref_seqs::get_nc().build_masking(sid, range, ref_idx); + timer.go("Searching alignments"); Search_context context (sid, ref_idx, query_idx); #ifdef SIMPLE_SEARCH diff --git a/src/search/align_range.h b/src/search/align_range.h index 2325a894..ddc11f14 100644 --- a/src/search/align_range.h +++ b/src/search/align_range.h @@ -127,7 +127,8 @@ inline void align_partition(unsigned hp, //cout << "n=" << stats.data_[Statistics::SEED_HITS] << endl; /*if (stats.data_[Statistics::SEED_HITS] > 10000000000lu) break;*/ - search_seed(j, i, stats, *out, sid); + if(!config.simple_freq || i[0] != 0) + search_seed(j, i, stats, *out, sid); } else align_range(j, i, stats, *out, sid); ++i; diff --git a/src/search/collision.h b/src/search/collision.h index 66c875a5..3b853283 100644 --- a/src/search/collision.h +++ b/src/search/collision.h @@ -1,5 +1,5 @@ /**** -Copyright (c) 2014, University of Tuebingen +Copyright (c) 2014-2016, University of Tuebingen, Benjamin Buchfink All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -14,13 +14,13 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -**** -Author: Benjamin Buchfink ****/ #ifndef COLLISION_H_ #define COLLISION_H_ +#include "../data/frequent_seeds.h" + inline unsigned letter_match(Letter query, Letter subject) { if(query != '\xff' && Reduction::reduction(query) == Reduction::reduction(mask_critical(subject))) @@ -48,36 +48,44 @@ inline bool is_lower_or_equal_chunk(const Letter *subject, unsigned sid) return current_range.lower_or_equal(seed_partition(seed)); } -inline bool need_lookup(unsigned sid) +inline bool need_lookup(unsigned sid, bool previous_shape) { //return sid != 0 || current_query_chunk != 0; - return sid != 0; + return previous_shape || sid != 0; } inline bool is_low_freq(const Letter *subject, unsigned sid) { return shapes[sid].is_low_freq(subject); } +inline bool is_high_frequency(const Letter *subject, unsigned sid, bool previous_shape) +{ + if (config.simple_freq) + return frequent_seeds.get(subject, sid); + else + return get_critical(*subject) && (!need_lookup(sid, previous_shape) || ref_seqs::get().get_masking(subject, sid)); +} + inline bool shape_collision_right(uint64_t mask, uint64_t shape_mask, const Letter *subject, unsigned sid) { if(!match_shape_mask(mask, shape_mask)) return false; return is_lower_chunk(subject, sid) - && is_low_freq(subject, sid) - && (!get_critical(*subject) || (need_lookup(sid) && !ref_seqs::get().get_masking(subject, sid))); + && is_low_freq(subject, sid) + && !is_high_frequency(subject, sid, false); } inline bool shape_collision_left(uint64_t mask, uint64_t shape_mask, const Letter *subject, unsigned sid, bool chunked) { if(!match_shape_mask(mask, shape_mask)) return false; return (!chunked || is_lower_or_equal_chunk(subject, sid)) - && is_low_freq(subject, sid) - && (!get_critical(*subject) || (need_lookup(sid) && !ref_seqs::get().get_masking(subject, sid))); + && is_low_freq(subject, sid) + && !is_high_frequency(subject, sid, false); } inline bool previous_shape_collision(uint64_t mask, uint64_t shape_mask, const Letter *subject, unsigned sid) { if(!match_shape_mask(mask, shape_mask)) return false; return is_low_freq(subject, sid) - && (!get_critical(*subject) || !ref_seqs::get().get_masking(subject, sid)); + && !is_high_frequency(subject, sid, true); } /*template diff --git a/src/search/search.cpp b/src/search/search.cpp index a1a8f3ef..1fca7ce2 100644 --- a/src/search/search.cpp +++ b/src/search/search.cpp @@ -137,8 +137,12 @@ void load_fps(const sorted_list::const_iterator &i, vector &v, con { v.clear(); v.reserve(i.n); - for (unsigned j = 0; j < i.n && (i.n<=config.hit_cap || i[j] != 0); ++j) - v.push_back(Finger_print(seqs.data(i[j]), Masked())); + if (config.simple_freq) { + for (unsigned j = 0; j < i.n; ++j) + v.push_back(Finger_print(seqs.data(i[j]))); + } else + for (unsigned j = 0; j < i.n && (i.n<=config.hit_cap || i[j] != 0); ++j) + v.push_back(Finger_print(seqs.data(i[j]), Masked())); } void load_fps2(const sorted_list::const_iterator &i, vector &v, const Sequence_set &seqs) diff --git a/src/util/hash_table.h b/src/util/hash_table.h index 677a3525..2f011d23 100644 --- a/src/util/hash_table.h +++ b/src/util/hash_table.h @@ -1,5 +1,5 @@ /**** -Copyright (c) 2014, University of Tuebingen +Copyright (c) 2014-2016, University of Tuebingen, Benjamin Buchfink All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -14,13 +14,19 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -**** -Author: Benjamin Buchfink ****/ #ifndef HASH_TABLE_H_ #define HASH_TABLE_H_ +#include +#include +#include +#include +#include "hash_function.h" + +using std::auto_ptr; + struct hash_table_overflow_exception : public std::exception { virtual const char* what() const throw() @@ -188,4 +194,70 @@ template class PHash_table }; + +struct PHash_set +{ + +public: + + typedef uint8_t fp; + + PHash_set() : + size_(0) + {} + + PHash_set(size_t size) : + table(new fp[size]), + size_(size) + { + memset(table.get(), 0, size_ * sizeof(fp)); + } + + bool contains(uint64_t key) const + { + fp *entry = get_entry(key); + return *entry != 0; + } + + void insert(uint64_t key) + { + fp *entry = get_entry(key); + if (*entry == (fp)0) + *entry = finger_print(murmur_hash()(key)); + } + + size_t size() const + { + return size_; + } + +private: + + static fp finger_print(uint64_t hash) + { + return std::max((fp)(hash & ((1llu<<(sizeof(fp)*8))-1llu)), (fp)1); + } + + fp* get_entry(uint64_t key) const + { + const uint64_t hash = murmur_hash()(key), f = finger_print(hash); + fp *p = table.get() + ((hash >> sizeof(fp)*8) % size_); + bool wrapped = false; + while (*p != f && *p != (fp)0) { + ++p; + if (p == table.get() + size_) { + if (wrapped) + throw hash_table_overflow_exception(); + p = table.get(); + wrapped = true; + } + } + return p; + } + + auto_ptr table; + size_t size_; + +}; + #endif /* HASH_TABLE_H_ */ diff --git a/src/util/util.h b/src/util/util.h index bb94bd96..cb8128d1 100644 --- a/src/util/util.h +++ b/src/util/util.h @@ -356,4 +356,30 @@ inline void assign_ptr(_t& dst, _t *src) delete src; } +struct Sd +{ + Sd(): + A(0), + Q(0), + k(1) + {} + void add(double x) + { + const double d = x - A; + Q += (k - 1) / k*d*d; + A += d / k; + ++k; + } + double mean() const + { + return A; + } + double sd() const + { + return sqrt(Q / (k - 1)); + } +private: + double A, Q, k; +}; + #endif /* UTIL_H_ */