Skip to content

Commit

Permalink
implemented new frequent seeds heuristic
Browse files Browse the repository at this point in the history
  • Loading branch information
bbuchfink committed Jul 12, 2016
1 parent 63e0bc9 commit 32e547e
Show file tree
Hide file tree
Showing 18 changed files with 309 additions and 25 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ add_executable(diamond src/run/main.cpp
src/run/mapper.cpp
src/data/count_approximate.cpp
src/data/index.cpp
src/data/frequent_seeds.cpp
)

target_link_libraries(diamond ${ZLIB_LIBRARY} ${CMAKE_THREAD_LIBS_INIT})
Expand Down
1 change: 1 addition & 0 deletions build_simple.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,5 @@ g++ -DNDEBUG -O3 -mssse3 -Wno-deprecated-declarations -std=gnu++98 -static \
src/run/mapper.cpp \
src/data/count_approximate.cpp \
src/data/index.cpp \
src/data/frequent_seeds.cpp \
-lz -lpthread -o diamond
3 changes: 3 additions & 0 deletions src/ChangeLog
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
[0.8.13]
- fixed a compiler error for GCC 4.1 and 4.2

[0.8.12]
- changed database format
- block size parameter is set only for the alignment commands
Expand Down
2 changes: 1 addition & 1 deletion src/basic/basic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR P
#include "statistics.h"
#include "sequence.h"

const char* Const::version_string = "0.8.12";
const char* Const::version_string = "0.8.13";
const char* Const::program_name = "diamond";
const char* Const::id_delimiters = " \a\b\f\n\r\t\v";

Expand Down
4 changes: 3 additions & 1 deletion src/basic/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ Config::Config(int argc, const char **argv)
("seed-freq", 0, "maximum seed frequency", max_seed_freq, -15.0)
("run-len",'l', "mask runs between stop codons shorter than this length", run_len)
("max-hits",'C', "maximum number of hits to consider for one seed", hit_cap)
("freq-sd", 0, "number of standard deviations for ignoring frequent seeds", freq_sd, 25.0)
("id2", 0, "minimum number of identities for stage 1 hit", min_identities)
("window", 'w', "window size for local hit search", window)
("xdrop", 'x', "xdrop for ungapped alignment", xdrop, 20)
Expand Down Expand Up @@ -120,7 +121,8 @@ Config::Config(int argc, const char **argv)
("local-align", 0, "Local alignment algorithm", local_align_mode, 0u)
("slow-search", 0, "", slow_search)
("seq", 0, "", seq_no)
("ht", 0, "", ht_mode);
("ht", 0, "", ht_mode)
("simple-freq", 0, "", simple_freq);

#ifdef EXTRA
("match1", po::value<string>(&program_options::match_file1))
Expand Down
2 changes: 2 additions & 0 deletions src/basic/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ struct Config
double rank_factor;
double rank_ratio;
bool ht_mode;
bool simple_freq;
double freq_sd;

enum { makedb = 0, blastp = 1, blastx = 2, view = 3, help = 4, version = 5, getseq = 6, benchmark = 7, random_seqs = 8 };
unsigned command;
Expand Down
2 changes: 1 addition & 1 deletion src/basic/const.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ struct Const
{

enum {
build_version = 74,
build_version = 75,
daa_version = 0,
seedp_bits = 10,
seedp = 1<<seedp_bits,
Expand Down
5 changes: 2 additions & 3 deletions src/data/frequency_masking.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
/****
Copyright (c) 2014-2015, University of Tuebingen
Author: Benjamin Buchfink
Copyright (c) 2014-2016, University of Tuebingen, Benjamin Buchfink
All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
Expand Down Expand Up @@ -46,7 +45,7 @@ struct Masked_sequence_set : public Sequence_set

timer.finish();
size_t n = 0;
for(size_t i=range.begin();i<range.end();++i) {
for (size_t i = range.begin(); i < range.end(); ++i) {
n += counts[i];
const size_t ht_size (std::max(static_cast<size_t>(static_cast<float>(counts[i]) * 1.3), static_cast<size_t>(counts[i] + 1)));
pos_filters[sid][i] = auto_ptr<filter_table> (new filter_table(ht_size));
Expand Down
81 changes: 81 additions & 0 deletions src/data/frequent_seeds.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
/****
Copyright (c) 2016, Benjamin Buchfink
All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
****/

#include <numeric>
#include "frequent_seeds.h"
#include "sorted_list.h"

const double Frequent_seeds::hash_table_factor = 1.3;
Frequent_seeds frequent_seeds;

struct Frequent_seeds::Build_context
{
Build_context(const sorted_list &ref_idx, const sorted_list &query_idx, unsigned sid, vector<unsigned> &counts) :
ref_idx(ref_idx),
query_idx(query_idx),
sid(sid),
counts(counts)
{ }
void operator()(unsigned thread_id, unsigned seedp)
{
Sd ref_sd, query_sd;
Merge_iterator<sorted_list::const_iterator> it(ref_idx.get_partition_cbegin(seedp), query_idx.get_partition_cbegin(seedp));
while(it.next()) {
ref_sd.add((double)it.i.n);
query_sd.add((double)it.j.n);
++it;
}

const unsigned max_n = (unsigned)(ref_sd.mean() + config.freq_sd*ref_sd.sd());
//cout << "mean=" << A << " sd=" << sd << endl;

size_t n = 0;
vector<uint32_t> buf;
sorted_list::iterator j = ref_idx.get_partition_begin(seedp);
while (!j.at_end()) {
if (j.n > max_n) {
j.get(0)->value = 0;
n += (unsigned)j.n;
buf.push_back(j.key());
}
++j;
}

const size_t ht_size = std::max((size_t)(buf.size() * hash_table_factor), buf.size() + 1);
frequent_seeds.tables_[sid][seedp] = PHash_set(ht_size);

for (vector<uint32_t>::const_iterator i = buf.begin(); i != buf.end(); ++i)
frequent_seeds.tables_[sid][seedp].insert(*i);

counts[seedp] = (unsigned)n;
}
const sorted_list &ref_idx;
const sorted_list &query_idx;
const unsigned sid;
vector<unsigned> &counts;
};

void Frequent_seeds::build(unsigned sid, const seedp_range &range, sorted_list &ref_idx, const sorted_list &query_idx)
{
task_timer timer("Finding high frequency seeds", 3);
vector<unsigned> counts(Const::seedp);
Build_context build_context(ref_idx, query_idx, sid, counts);
launch_scheduled_thread_pool(build_context, Const::seedp, config.threads_);
timer.finish();
log_stream << "Masked positions = " << std::accumulate(counts.begin(), counts.end(), 0) << std::endl;
}
50 changes: 50 additions & 0 deletions src/data/frequent_seeds.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/****
Copyright (c) 2016, Benjamin Buchfink
All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
****/

#ifndef FREQUENT_SEEDS_H_
#define FREQUENT_SEEDS_H_

#include "../basic/const.h"
#include "../util/hash_table.h"
#include "sorted_list.h"

struct Frequent_seeds
{

void build(unsigned sid, const seedp_range &range, sorted_list &ref_idx, const sorted_list &query_idx);

bool get(const Letter *pos, unsigned sid) const
{
Packed_seed seed;
shapes[sid].set_seed(seed, pos);
return tables_[sid][seed_partition(seed)].contains(seed_partition_offset(seed));
}

private:

static const double hash_table_factor;

struct Build_context;

PHash_set tables_[Const::max_shapes][Const::seedp];

};

extern Frequent_seeds frequent_seeds;

#endif
2 changes: 1 addition & 1 deletion src/data/sequence_set.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ struct Sequence_set : public String_set<'\xff', 1>
{ }

Sequence_set(Input_stream &file) :
String_set(file)
String_set<'\xff',1>(file)
{ }

void print_stats() const
Expand Down
29 changes: 29 additions & 0 deletions src/data/sorted_list.h
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,35 @@ struct sorted_list

};

template<typename _it>
struct Merge_iterator
{
Merge_iterator(const _it &i, const _it& j):
i(i),
j(j)
{}
bool next()
{
while (!i.at_end() && !j.at_end()) {
if (i.key() < j.key()) {
++i;
}
else if (j.key() < i.key()) {
++j;
}
else
return true;
}
return false;
}
void operator++()
{
++i;
++j;
}
_it i, j;
};

#pragma pack()

#endif /* SORTED_LIST_H_ */
9 changes: 7 additions & 2 deletions src/run/master_thread.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR P
#include "../data/load_seqs.h"
#include "../search/setup.h"
#include "../output/output_format.h"
#include "../data/frequent_seeds.h"

using std::endl;
using std::cout;
Expand Down Expand Up @@ -81,8 +82,7 @@ void process_shape(unsigned sid,
ref_hst.get(sid),
range,
ref_hst.partition());
ref_seqs::get_nc().build_masking(sid, range, ref_idx);


timer.go("Building query index");
timer_mapping.resume();
sorted_list query_idx(query_buffer,
Expand All @@ -93,6 +93,11 @@ void process_shape(unsigned sid,
query_hst.partition());
timer.finish();

if (config.simple_freq)
frequent_seeds.build(sid, range, ref_idx, query_idx);
else
ref_seqs::get_nc().build_masking(sid, range, ref_idx);

timer.go("Searching alignments");
Search_context context (sid, ref_idx, query_idx);
#ifdef SIMPLE_SEARCH
Expand Down
3 changes: 2 additions & 1 deletion src/search/align_range.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,8 @@ inline void align_partition(unsigned hp,
//cout << "n=" << stats.data_[Statistics::SEED_HITS] << endl;
/*if (stats.data_[Statistics::SEED_HITS] > 10000000000lu)
break;*/
search_seed(j, i, stats, *out, sid);
if(!config.simple_freq || i[0] != 0)
search_seed(j, i, stats, *out, sid);
} else
align_range(j, i, stats, *out, sid);
++i;
Expand Down
28 changes: 18 additions & 10 deletions src/search/collision.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/****
Copyright (c) 2014, University of Tuebingen
Copyright (c) 2014-2016, University of Tuebingen, Benjamin Buchfink
All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
Expand All @@ -14,13 +14,13 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
****
Author: Benjamin Buchfink
****/

#ifndef COLLISION_H_
#define COLLISION_H_

#include "../data/frequent_seeds.h"

inline unsigned letter_match(Letter query, Letter subject)
{
if(query != '\xff' && Reduction::reduction(query) == Reduction::reduction(mask_critical(subject)))
Expand Down Expand Up @@ -48,36 +48,44 @@ inline bool is_lower_or_equal_chunk(const Letter *subject, unsigned sid)
return current_range.lower_or_equal(seed_partition(seed));
}

inline bool need_lookup(unsigned sid)
inline bool need_lookup(unsigned sid, bool previous_shape)
{
//return sid != 0 || current_query_chunk != 0;
return sid != 0;
return previous_shape || sid != 0;
}

inline bool is_low_freq(const Letter *subject, unsigned sid)
{ return shapes[sid].is_low_freq(subject); }

inline bool is_high_frequency(const Letter *subject, unsigned sid, bool previous_shape)
{
if (config.simple_freq)
return frequent_seeds.get(subject, sid);
else
return get_critical(*subject) && (!need_lookup(sid, previous_shape) || ref_seqs::get().get_masking(subject, sid));
}

inline bool shape_collision_right(uint64_t mask, uint64_t shape_mask, const Letter *subject, unsigned sid)
{
if(!match_shape_mask(mask, shape_mask)) return false;
return is_lower_chunk(subject, sid)
&& is_low_freq(subject, sid)
&& (!get_critical(*subject) || (need_lookup(sid) && !ref_seqs::get().get_masking(subject, sid)));
&& is_low_freq(subject, sid)
&& !is_high_frequency(subject, sid, false);
}

inline bool shape_collision_left(uint64_t mask, uint64_t shape_mask, const Letter *subject, unsigned sid, bool chunked)
{
if(!match_shape_mask(mask, shape_mask)) return false;
return (!chunked || is_lower_or_equal_chunk(subject, sid))
&& is_low_freq(subject, sid)
&& (!get_critical(*subject) || (need_lookup(sid) && !ref_seqs::get().get_masking(subject, sid)));
&& is_low_freq(subject, sid)
&& !is_high_frequency(subject, sid, false);
}

inline bool previous_shape_collision(uint64_t mask, uint64_t shape_mask, const Letter *subject, unsigned sid)
{
if(!match_shape_mask(mask, shape_mask)) return false;
return is_low_freq(subject, sid)
&& (!get_critical(*subject) || !ref_seqs::get().get_masking(subject, sid));
&& !is_high_frequency(subject, sid, true);
}

/*template<typename _val, typename _pos>
Expand Down
8 changes: 6 additions & 2 deletions src/search/search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,12 @@ void load_fps(const sorted_list::const_iterator &i, vector<Finger_print> &v, con
{
v.clear();
v.reserve(i.n);
for (unsigned j = 0; j < i.n && (i.n<=config.hit_cap || i[j] != 0); ++j)
v.push_back(Finger_print(seqs.data(i[j]), Masked()));
if (config.simple_freq) {
for (unsigned j = 0; j < i.n; ++j)
v.push_back(Finger_print(seqs.data(i[j])));
} else
for (unsigned j = 0; j < i.n && (i.n<=config.hit_cap || i[j] != 0); ++j)
v.push_back(Finger_print(seqs.data(i[j]), Masked()));
}

void load_fps2(const sorted_list::const_iterator &i, vector<Finger_print> &v, const Sequence_set &seqs)
Expand Down
Loading

0 comments on commit 32e547e

Please sign in to comment.